diff --git a/.drone.yml b/.drone.yml index 5331fea2ad0..6633204592b 100644 --- a/.drone.yml +++ b/.drone.yml @@ -2,20 +2,30 @@ kind: pipeline name: unit-tests +node: + freeswitch: public + steps: - name: run-tests image: signalwire/freeswitch-public-base pull: true commands: + - cat /proc/sys/kernel/core_pattern - ./bootstrap.sh -j - echo "applications/mod_test" >> modules.conf - - ./configure + - sed -i '/applications\/mod_http_cache/s/^#//g' modules.conf + - sed -i '/event_handlers\/mod_rayo/s/^#//g' modules.conf + - sed -i '/formats\/mod_opusfile/s/^#//g' modules.conf + - sed -i '/languages\/mod_lua/s/^#//g' modules.conf + - export ASAN_OPTIONS=log_path=stdout:disable_coredump=0:unmap_shadow_on_exit=1; + - ./configure --enable-address-sanitizer - echo "#!/bin/bash\nmake -j`nproc --all` |& tee ./unit-tests-build-result.txt\nexitstatus=\${PIPESTATUS[0]}\necho \$exitstatus > ./build-status.txt\nmake install\n" > build.sh - chmod +x build.sh - ./build.sh - cd tests/unit - ./run-tests.sh - - mkdir logs && (mv log_run-tests_*.html logs || true) + - ls -la /cores + - mkdir logs && (mv log_run-tests_*.html logs || true) && (mv backtrace_*.txt logs || true) - echo 0 > run-tests-status.txt - ./collect-test-logs.sh && exit 0 || echo 'Some tests failed' - echo 1 > run-tests-status.txt @@ -40,8 +50,73 @@ trigger: event: - pull_request - push + +--- +kind: pipeline +name: scan-build + +node: + freeswitch: public + +steps: + - name: scan-build + image: signalwire/freeswitch-public-base:stretch + pull: true + commands: + - ./bootstrap.sh -j + - cp build/modules.conf.most modules.conf + #Enable/Uncomment mods + - sed -i "/mod_mariadb/s/^#//g" modules.conf + - sed -i "/mod_v8/s/^#//g" modules.conf + #Disable/Comment out mods + - sed -i '/mod_ilbc/s/^/#/g' modules.conf + - sed -i '/mod_isac/s/^/#/g' modules.conf + - sed -i '/mod_mp4/s/^/#/g' modules.conf + - sed -i '/mod_mongo/s/^/#/g' modules.conf + - sed -i '/mod_pocketsphinx/s/^/#/g' modules.conf + - sed -i '/mod_sangoma_codec/s/^/#/g' modules.conf + - sed -i '/mod_siren/s/^/#/g' modules.conf + #Comment out mods for a while + - sed -i '/mod_avmd/s/^/#/g' modules.conf + - sed -i '/mod_basic/s/^/#/g' modules.conf + - sed -i '/mod_cdr_mongodb/s/^/#/g' modules.conf + - sed -i '/mod_cv/s/^/#/g' modules.conf + - sed -i '/mod_erlang_event/s/^/#/g' modules.conf + - sed -i '/mod_perl/s/^/#/g' modules.conf + - sed -i '/mod_rtmp/s/^/#/g' modules.conf + - sed -i '/mod_unimrcp/s/^/#/g' modules.conf + - sed -i '/mod_xml_rpc/s/^/#/g' modules.conf + - ./configure + - mkdir -p scan-build + - echo "#!/bin/bash\nscan-build-4.0 -o ./scan-build/ make -j`nproc --all` |& tee ./scan-build-result.txt\nexitstatus=\${PIPESTATUS[0]}\necho \$exitstatus > ./scan-build-status.txt\n" > scan.sh + - chmod +x scan.sh + - ./scan.sh + - exitstatus=`cat ./scan-build-status.txt` + - echo "*** Exit status is $exitstatus" + + - name: notify + image: signalwire/scan-build-notify + pull: true + environment: + GITHUB_CI_APP_PEM: + from_secret: github_ci_app_pem + SSH_KEY: + from_secret: ssh_key + SLACK_WEBHOOK_URL: + from_secret: slack_webhook_url + commands: + - /root/notify.sh + + +trigger: + branch: + - master + event: + - pull_request + - push + --- kind: signature -hmac: a34718dd1e2b9468a845962219ff05cac0c922ddf90d885af557a937a9e412e0 +hmac: d354f6d232ae6417b539fb9b40fc15765c3247ab58c87a5135a0ac6c448e1cd0 ... 
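The unit-tests pipeline above now configures the tree with --enable-address-sanitizer, sets ASAN_OPTIONS so core dumps are permitted, and archives /cores and backtrace_*.txt after a run. For orientation only, the class of defect AddressSanitizer turns into a hard test failure looks like the sketch below (a hypothetical stand-alone file, not code from this tree):

    /* asan_demo.c - hypothetical example; build with: cc -fsanitize=address -g asan_demo.c */
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
        char *buf = malloc(8);

        if (!buf) {
            return 1;
        }

        /* 15 bytes (14 chars + NUL) into an 8-byte allocation: without ASan this
         * may corrupt the heap silently; with ASan the run aborts here with a
         * heap-buffer-overflow report that the pipeline can collect. */
        strcpy(buf, "fourteen chars");
        free(buf);
        return 0;
    }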
diff --git a/.gitconfig b/.gitconfig new file mode 100644 index 00000000000..c1edb813655 --- /dev/null +++ b/.gitconfig @@ -0,0 +1,9 @@ +# This repository contains two commits with multiple authors. + +# By default, fsck will show errors for those commits. +# Run the following command to ignore them: + +# git config --local include.path ../.gitconfig + +[fsck] + multipleAuthors=ignore diff --git a/.gitignore b/.gitignore index fb53ac2cf06..053abd7afb6 100644 --- a/.gitignore +++ b/.gitignore @@ -271,4 +271,11 @@ src/mod/applications/mod_http_cache/test/test-suite.log src/mod/applications/mod_http_cache/test/test_aws src/mod/applications/mod_http_cache/test/test_aws.log src/mod/applications/mod_http_cache/test/test_aws.trs - +src/mod/formats/mod_sndfile/test/test_sndfile +src/mod/formats/mod_sndfile/test/test_sndfile_conf +src/mod/*/*/test/*.log +src/mod/*/*/test/*.trs +src/mod/*/*/test/[0-9]*/* +test-suite.log +src/mod/applications/mod_av/test/test_BT7.mp4 +src/mod/applications/mod_av/test/test_RGB.mp4 diff --git a/Makefile.am b/Makefile.am index 61b2c609dac..8e3a2e852e7 100644 --- a/Makefile.am +++ b/Makefile.am @@ -547,6 +547,8 @@ $(switch_builddir)/modules.conf: src/mod/modules.inc: $(switch_builddir)/modules.conf @echo "OUR_MODULES=$(OUR_MODS)" > $(switch_builddir)/src/mod/modules.inc @echo "OUR_CLEAN_MODULES=$(OUR_CLEAN_MODS)" >> $(switch_builddir)/src/mod/modules.inc + @echo "OUR_TEST_MODULES=$(OUR_TEST_MODS)" >> $(switch_builddir)/src/mod/modules.inc + @echo "OUR_CHECK_MODULES=$(OUR_CHECK_MODS)" >> $(switch_builddir)/src/mod/modules.inc @echo "OUR_INSTALL_MODULES=$(OUR_INSTALL_MODS)" >> $(switch_builddir)/src/mod/modules.inc @echo "OUR_UNINSTALL_MODULES=$(OUR_UNINSTALL_MODS)" >> $(switch_builddir)/src/mod/modules.inc @echo "OUR_DISABLED_MODULES=$(OUR_DISABLED_MODS)" >> $(switch_builddir)/src/mod/modules.inc @@ -848,6 +850,10 @@ modclean: $(switch_builddir)/modules.conf src/mod/modules.inc modwipe: rm -f $(modulesdir)/*.so $(modulesdir)/*.la $(modulesdir)/*.dll $(modulesdir)/*.dylib +print_tests: libfreeswitch.la $(switch_builddir)/modules.conf src/mod/modules.inc + @cd tests/unit && $(MAKE) $(AM_MAKEFLAGS) print_tests + @cd src/mod && $(MAKE) $(AM_MAKEFLAGS) print_tests + dox: cd docs && doxygen $(PWD)/docs/Doxygen.conf diff --git a/build/modmake.rules.in b/build/modmake.rules.in index 6ebcf7a6b31..dbce9bd48ec 100644 --- a/build/modmake.rules.in +++ b/build/modmake.rules.in @@ -107,6 +107,7 @@ install-modules: all-modules depend_install local_install mod_install uninstall-modules: local_uninstall mod_uninstall distclean-modules: clean-modules local_distclean extraclean-modules: distclean local_extraclean +print_tests: Makefile: @if test ! 
-f $@; then \
@@ -270,4 +271,4 @@ $(switch_srcdir)/src/include/switch_xml.h:
 .PHONY: all clean depend install distclean extraclean \
 	all-modules clean-modules depend-modules install-modules distclean-modules extraclean-modules \
 	local_all local_clean local_depend local_install local_distclean local_extraclean \
-	mod_clean mod_install mod_uninstall depend_install
+	mod_clean mod_install mod_uninstall depend_install print_tests
diff --git a/build/modmake.rulesam b/build/modmake.rulesam
index bffa0db4ac0..5ccdf8b5c20 100644
--- a/build/modmake.rulesam
+++ b/build/modmake.rulesam
@@ -14,3 +14,8 @@ install-modules: install
 uninstall-modules: uninstall
 distclean-modules: distclean
 extraclean-modules: extraclean
+
+print_tests:
+	@set +e; \
+	test -z "$(TESTS)" || for i in $(TESTS); do echo $(subdir)/$$i; done;
+
\ No newline at end of file
diff --git a/build/next-release.txt b/build/next-release.txt
index 7c64495cce4..145fe2dd008 100644
--- a/build/next-release.txt
+++ b/build/next-release.txt
@@ -1 +1 @@
-1.10.2-release
+1.10.3-release
diff --git a/conf/testing/autoload_configs/sndfile.conf.xml b/conf/testing/autoload_configs/sndfile.conf.xml
new file mode 100644
index 00000000000..d9e0cff75d8
--- /dev/null
+++ b/conf/testing/autoload_configs/sndfile.conf.xml
@@ -0,0 +1,9 @@
[9 added lines: new mod_sndfile XML configuration; the element markup did not survive extraction]
diff --git a/conf/testing/autoload_configs/switch.conf.xml b/conf/testing/autoload_configs/switch.conf.xml
index 69783e62c58..d07c1b2d6e2 100644
--- a/conf/testing/autoload_configs/switch.conf.xml
+++ b/conf/testing/autoload_configs/switch.conf.xml
@@ -190,7 +190,9 @@
[XML parameter changes (7 lines become 9); the element markup did not survive extraction]
diff --git a/conf/vanilla/autoload_configs/httapi.conf.xml b/conf/vanilla/autoload_configs/httapi.conf.xml
index e19210b4da5..c3566d148a1 100644
--- a/conf/vanilla/autoload_configs/httapi.conf.xml
+++ b/conf/vanilla/autoload_configs/httapi.conf.xml
@@ -105,6 +105,8 @@
[2 added XML lines; the element markup did not survive extraction]
diff --git a/conf/vanilla/autoload_configs/sofia.conf.xml b/conf/vanilla/autoload_configs/sofia.conf.xml
index d764a59e51c..629a0019fa2 100644
--- a/conf/vanilla/autoload_configs/sofia.conf.xml
+++ b/conf/vanilla/autoload_configs/sofia.conf.xml
@@ -2,6 +2,7 @@
[1 added XML line; the element markup did not survive extraction]
diff --git a/conf/vanilla/autoload_configs/switch.conf.xml b/conf/vanilla/autoload_configs/switch.conf.xml
index 5b3930e7965..714b62c54cc 100644
--- a/conf/vanilla/autoload_configs/switch.conf.xml
+++ b/conf/vanilla/autoload_configs/switch.conf.xml
@@ -200,6 +200,8 @@
[2 added XML lines; the element markup did not survive extraction]
diff --git a/conf/vanilla/sip_profiles/external.xml b/conf/vanilla/sip_profiles/external.xml
index 1bf1b04595c..e2d161d5983 100644
--- a/conf/vanilla/sip_profiles/external.xml
+++ b/conf/vanilla/sip_profiles/external.xml
@@ -64,7 +64,7 @@
[1 XML line replaced; the element markup did not survive extraction]
diff --git a/conf/vanilla/sip_profiles/internal.xml b/conf/vanilla/sip_profiles/internal.xml
index 3205e179b97..da7abc07a62 100644
--- a/conf/vanilla/sip_profiles/internal.xml
+++ b/conf/vanilla/sip_profiles/internal.xml
@@ -282,7 +282,7 @@ auto-nat - Use ip learned from NAT-PMP or UPNP -->
[1 XML line replaced; the element markup did not survive extraction]
diff --git a/configure.ac b/configure.ac
index 57373a6b1e4..c5f10ef2adf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3,10 +3,10 @@
 # Must change all of the below together
 # For a release, set revision for that tagged release as well and uncomment
-AC_INIT([freeswitch], [1.10.2-release], bugs@freeswitch.org)
+AC_INIT([freeswitch], [1.10.3-release], bugs@freeswitch.org)
 AC_SUBST(SWITCH_VERSION_MAJOR, [1])
 AC_SUBST(SWITCH_VERSION_MINOR, [10])
-AC_SUBST(SWITCH_VERSION_MICRO, [2-release])
+AC_SUBST(SWITCH_VERSION_MICRO, [3-release])
 AC_SUBST(SWITCH_VERSION_REVISION, [])
 AC_SUBST(SWITCH_VERSION_REVISION_HUMAN, 
[]) @@ -708,7 +708,11 @@ AC_SUBST([POSTGRESQL_LIBDIR], [$POSTGRESQL_LIBDIR]) PKG_CHECK_MODULES([MARIADB], [libmariadb >= 3.0.9],[ AM_CONDITIONAL([HAVE_MARIADB],[true])],[ - AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_MARIADB],[false])]) + PKG_CHECK_MODULES([MARIADB], [mariadb >= 3.0.9],[ + AM_CONDITIONAL([HAVE_MARIADB],[true])],[ + AC_MSG_RESULT([no]); AM_CONDITIONAL([HAVE_MARIADB],[false]) + ]) +]) AC_ARG_ENABLE(deprecated-core-db-events, [AS_HELP_STRING([--enable-deprecated-core-db-events], [Keep deprecated core db events])],,[enable_deprecated_core_db_events="no"]) @@ -1873,7 +1877,6 @@ AC_CONFIG_FILES([Makefile src/mod/applications/mod_hiredis/Makefile src/mod/applications/mod_httapi/Makefile src/mod/applications/mod_http_cache/Makefile - src/mod/applications/mod_http_cache/test/Makefile src/mod/applications/mod_ladspa/Makefile src/mod/applications/mod_lcr/Makefile src/mod/applications/mod_limit/Makefile @@ -1901,7 +1904,6 @@ AC_CONFIG_FILES([Makefile src/mod/applications/mod_spy/Makefile src/mod/applications/mod_stress/Makefile src/mod/applications/mod_test/Makefile - src/mod/applications/mod_test/test/Makefile src/mod/applications/mod_translate/Makefile src/mod/applications/mod_valet_parking/Makefile src/mod/applications/mod_vmd/Makefile @@ -1970,7 +1972,6 @@ AC_CONFIG_FILES([Makefile src/mod/event_handlers/mod_radius_cdr/Makefile src/mod/event_handlers/mod_odbc_cdr/Makefile src/mod/event_handlers/mod_rayo/Makefile - src/mod/event_handlers/mod_rayo/test/Makefile src/mod/event_handlers/mod_smpp/Makefile src/mod/event_handlers/mod_snmp/Makefile src/mod/event_handlers/mod_event_zmq/Makefile @@ -1978,6 +1979,7 @@ AC_CONFIG_FILES([Makefile src/mod/formats/mod_local_stream/Makefile src/mod/formats/mod_native_file/Makefile src/mod/formats/mod_opusfile/Makefile + src/mod/formats/mod_png/Makefile src/mod/formats/mod_shell_stream/Makefile src/mod/formats/mod_shout/Makefile src/mod/formats/mod_sndfile/Makefile @@ -1987,7 +1989,6 @@ AC_CONFIG_FILES([Makefile src/mod/formats/mod_portaudio_stream/Makefile src/mod/languages/mod_java/Makefile src/mod/languages/mod_lua/Makefile - src/mod/languages/mod_lua/test/Makefile src/mod/languages/mod_managed/Makefile src/mod/languages/mod_perl/Makefile src/mod/languages/mod_python/Makefile @@ -2061,6 +2062,8 @@ OUR_MODS='$$(if test -z "$(MODULES)" ; then tmp_mods="$(CONF_MODULES)"; else tmp OUR_CLEAN_MODS='$$(if test -z "$(MODULES)" ; then tmp_mods="$(CONF_MODULES)"; else tmp_mods="$(MODULES)" ; fi ; mods="$$(for i in $$tmp_mods ; do echo $$i-clean ; done )"; echo $$mods )' OUR_INSTALL_MODS='$$(if test -z "$(MODULES)" ; then tmp_mods="$(CONF_MODULES)"; else tmp_mods="$(MODULES)" ; fi ; mods="$$(for i in $$tmp_mods ; do echo $$i-install ; done)"; echo $$mods )' OUR_UNINSTALL_MODS='$$(if test -z "$(MODULES)" ; then tmp_mods="$(CONF_MODULES)"; else tmp_mods="$(MODULES)" ; fi ; mods="$$(for i in $$tmp_mods ; do echo $$i-uninstall ; done)"; echo $$mods )' +OUR_TEST_MODS='$$(if test -z "$(MODULES)" ; then tmp_mods="$(CONF_MODULES)"; else tmp_mods="$(MODULES)" ; fi ; mods="$$(for i in $$tmp_mods ; do echo $$i-print_tests ; done )"; echo $$mods )' +OUR_CHECK_MODS='$$(if test -z "$(MODULES)" ; then tmp_mods="$(CONF_MODULES)"; else tmp_mods="$(MODULES)" ; fi ; mods="$$(for i in $$tmp_mods ; do echo $$i-check ; done )"; echo $$mods )' OUR_DISABLED_MODS='$$(tmp_mods="$(CONF_DISABLED_MODULES)"; mods="$$(for i in $$tmp_mods ; do echo $$i-all ; done )"; echo $$mods )' OUR_DISABLED_CLEAN_MODS='$$(tmp_mods="$(CONF_DISABLED_MODULES)"; mods="$$(for i in $$tmp_mods ; do echo 
$$i-clean ; done )"; echo $$mods )' OUR_DISABLED_INSTALL_MODS='$$(tmp_mods="$(CONF_DISABLED_MODULES)"; mods="$$(for i in $$tmp_mods ; do echo $$i-install ; done)"; echo $$mods )' @@ -2075,6 +2078,8 @@ AC_SUBST(CONF_MODULES) AC_SUBST(OUR_MODS) AC_SUBST(OUR_CLEAN_MODS) +AC_SUBST(OUR_TEST_MODS) +AC_SUBST(OUR_CHECK_MODS) AC_SUBST(OUR_INSTALL_MODS) AC_SUBST(OUR_UNINSTALL_MODS) AC_SUBST(OUR_DISABLED_MODS) diff --git a/debian/bootstrap.sh b/debian/bootstrap.sh index 9ca918bc036..a19eb272813 100755 --- a/debian/bootstrap.sh +++ b/debian/bootstrap.sh @@ -44,7 +44,6 @@ avoid_mods=( applications/mod_cluechoo asr_tts/mod_cepstral codecs/mod_com_g729 - codecs/mod_ilbc codecs/mod_openh264 codecs/mod_siren codecs/mod_sangoma_codec @@ -943,7 +942,7 @@ Description: Cross-Platform Scalable Multi-Protocol Soft Switch Package: freeswitch-all-dbg Section: debug -Priority: extra +Priority: optional Architecture: amd64 armhf Depends: \${misc:Depends}, freeswitch-meta-all (= \${binary:Version}), freeswitch-meta-all-dbg (= \${binary:Version}) Description: debugging symbols for FreeSWITCH @@ -953,7 +952,7 @@ Description: debugging symbols for FreeSWITCH Package: freeswitch-dbg Section: debug -Priority: extra +Priority: optional Architecture: amd64 armhf Depends: \${misc:Depends}, freeswitch (= \${binary:Version}) Description: debugging symbols for FreeSWITCH @@ -963,7 +962,7 @@ Description: debugging symbols for FreeSWITCH Package: libfreeswitch1-dbg Section: debug -Priority: extra +Priority: optional Architecture: amd64 armhf Depends: \${misc:Depends}, libfreeswitch1 (= \${binary:Version}) Description: debugging symbols for FreeSWITCH @@ -1066,7 +1065,7 @@ Description: ${description} for FreeSWITCH Package: freeswitch-${module_name//_/-}-dbg Section: debug -Priority: extra +Priority: optional Architecture: amd64 armhf Depends: \${misc:Depends}, freeswitch-${module_name//_/-} (= \${binary:Version}) diff --git a/debian/control-modules b/debian/control-modules index dcf8f29a219..56772fa780b 100644 --- a/debian/control-modules +++ b/debian/control-modules @@ -117,7 +117,7 @@ Module: applications/mod_hiredis Description: Redis client support This module provides a mechanism to use Redis as a datastore. 
 Build-Depends: libhiredis-dev
-Depends: libhiredis0.10 | libhiredis0.13
+Depends: libhiredis0.10 | libhiredis0.13 | libhiredis0.14
 
 Module: applications/mod_httapi
 Description: HT-TAPI Hypertext Telephony API
diff --git a/docs/phrase/phrase_de.xml b/docs/phrase/phrase_de.xml
index afe7dda430f..52d453ea011 100644
--- a/docs/phrase/phrase_de.xml
+++ b/docs/phrase/phrase_de.xml
@@ -5,6 +5,34 @@
[28 added lines of German phrase-macro XML; the element markup did not survive extraction]
@@ -67,6 +95,8 @@
[2 added lines of German phrase-macro XML; the element markup did not survive extraction]
@@ -98,49 +128,125 @@
[major rework of the German phrase macros (49 lines become 125); the XML markup did not survive extraction]
diff --git a/docs/phrase/phrase_en.xml b/docs/phrase/phrase_en.xml
index 9a71b0616b5..e47ca4f9870 100644
--- a/docs/phrase/phrase_en.xml
+++ b/docs/phrase/phrase_en.xml
@@ -248,7 +248,6 @@
[1 removed XML line; the element markup did not survive extraction]
@@ -393,10 +392,10 @@
[4 XML lines replaced by 4; the element markup did not survive extraction]
@@ -463,7 +462,6 @@
[1 removed XML line; the element markup did not survive extraction]
@@ -680,7 +678,6 @@
[1 removed XML line; the element markup did not survive extraction]
@@ -690,9 +687,6 @@
[3 removed XML lines; the element markup did not survive extraction]
diff --git a/freeswitch.spec b/freeswitch.spec
index bc94233a242..4fa22123058 100644
--- a/freeswitch.spec
+++ b/freeswitch.spec
@@ -2028,6 +2028,7 @@ fi
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/skinny.conf.xml
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/smpp.conf.xml
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/sms_flowroute.conf.xml
+%config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/sndfile.conf.xml
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/sofia.conf.xml
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/spandsp.conf.xml
 %config(noreplace) %attr(0640, freeswitch, daemon) %{sysconfdir}/autoload_configs/switch.conf.xml
@@ -2629,6 +2630,8 @@ fi
 # ######################################################################################################################
 %changelog
+* Fri Jan 31 2020 - Andrey Volk
+- Add sndfile.conf.xml
 * Tue Apr 23 2019 - Andrey Volk
 - Fix build for Stack 20.x
 * Tue Dec 11 2018 - Andrey Volk
diff --git a/libs/apr-util/hooks/apr_hooks.c b/libs/apr-util/hooks/apr_hooks.c
index da0ae970581..776bc884e74 100644
--- a/libs/apr-util/hooks/apr_hooks.c
+++ b/libs/apr-util/hooks/apr_hooks.c
@@ -180,7 +180,9 @@ static TSort *tsort(TSort *pData,int nItems)
             break;
         }
     }
-    pTail->pNext=NULL; /* unfudge the tail 
*/ + if (pTail) { + pTail->pNext = NULL; /* unfudge the tail */ + } return pHead; } diff --git a/libs/apr-util/misc/apr_queue.c b/libs/apr-util/misc/apr_queue.c index e905a53ebba..a947df8a089 100644 --- a/libs/apr-util/misc/apr_queue.c +++ b/libs/apr-util/misc/apr_queue.c @@ -222,7 +222,7 @@ APU_DECLARE(apr_status_t) apr_queue_trypush(apr_queue_t *queue, void *data) } if (apr_queue_full(queue)) { - rv = apr_thread_mutex_unlock(queue->one_big_mutex); + apr_thread_mutex_unlock(queue->one_big_mutex); return APR_EAGAIN; } @@ -397,7 +397,7 @@ APU_DECLARE(apr_status_t) apr_queue_trypop(apr_queue_t *queue, void **data) } if (apr_queue_empty(queue)) { - rv = apr_thread_mutex_unlock(queue->one_big_mutex); + apr_thread_mutex_unlock(queue->one_big_mutex); return APR_EAGAIN; } diff --git a/libs/apr-util/xml/apr_xml.c b/libs/apr-util/xml/apr_xml.c index 73543555ff8..a05348d60f5 100644 --- a/libs/apr-util/xml/apr_xml.c +++ b/libs/apr-util/xml/apr_xml.c @@ -82,7 +82,7 @@ static int find_prefix(apr_xml_parser *parser, const char *prefix) ** prefix. */ for (; elem; elem = elem->parent) { - apr_xml_ns_scope *ns_scope = elem->ns_scope; + apr_xml_ns_scope *ns_scope; for (ns_scope = elem->ns_scope; ns_scope; ns_scope = ns_scope->next) { if (strcmp(prefix, ns_scope->prefix) == 0) { diff --git a/libs/apr-util/xml/expat/lib/xmltok.c b/libs/apr-util/xml/expat/lib/xmltok.c index e3fe3f788f9..74c681f1e6b 100644 --- a/libs/apr-util/xml/expat/lib/xmltok.c +++ b/libs/apr-util/xml/expat/lib/xmltok.c @@ -1225,7 +1225,7 @@ void unknown_toUtf8(const ENCODING *enc, const char **fromP, const char *fromLim, char **toP, const char *toLim) { - char buf[XML_UTF8_ENCODE_MAX]; + char buf[XML_UTF8_ENCODE_MAX] = {0}; for (;;) { const char *utf8; int n; diff --git a/libs/apr/file_io/unix/filepath.c b/libs/apr/file_io/unix/filepath.c index 78797bd990c..64f1b2fbd40 100644 --- a/libs/apr/file_io/unix/filepath.c +++ b/libs/apr/file_io/unix/filepath.c @@ -133,12 +133,15 @@ APR_DECLARE(apr_status_t) apr_filepath_merge(char **newpath, * but required since the compiler (at least vc) doesn't like * passing the address of a char const* for a char** arg. */ - char *getpath; + char *getpath = NULL; rv = apr_filepath_get(&getpath, flags, p); rootpath = getpath; if (rv != APR_SUCCESS) return errno; + if (!getpath) + return APR_ENOMEM; + /* XXX: Any kernel subject to goofy, uncanonical results * must run the rootpath against the user's given flags. 
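The filepath.c hunk above and several of the apr changes that follow apply the same defensive pattern: initialize an out-pointer before a call that can fail without setting it, then check the pointer as well as the status code before dereferencing. A minimal stand-alone sketch of that pattern with hypothetical names (this is not apr API):

    /* out_param_demo.c - hypothetical illustration of the guard pattern */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Fails without touching *out when the value is absent, just as a
     * failed lookup may leave its output parameter unset. */
    static int get_demo_path(char **out)
    {
        const char *env = getenv("DEMO_PATH");

        if (!env) {
            return -1;
        }

        *out = strdup(env);
        return *out ? 0 : -1;
    }

    int main(void)
    {
        char *path = NULL;               /* initialize, as filepath.c now does */
        int rv = get_demo_path(&path);

        if (rv != 0 || !path) {          /* check status and pointer before use */
            fprintf(stderr, "no path available\n");
            return 1;
        }

        printf("path: %s\n", path);
        free(path);
        return 0;
    }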
* Simplest would be a recursive call to apr_filepath_merge diff --git a/libs/apr/file_io/unix/filestat.c b/libs/apr/file_io/unix/filestat.c index d27eafdcdf2..a34b22c3700 100644 --- a/libs/apr/file_io/unix/filestat.c +++ b/libs/apr/file_io/unix/filestat.c @@ -130,7 +130,7 @@ APR_DECLARE(apr_status_t) apr_file_attrs_set(const char *fname, apr_pool_t *pool) { apr_status_t status; - apr_finfo_t finfo; + apr_finfo_t finfo = {0}; /* Don't do anything if we can't handle the requested attributes */ if (!(attr_mask & (APR_FILE_ATTR_READONLY @@ -185,7 +185,7 @@ APR_DECLARE(apr_status_t) apr_file_mtime_set(const char *fname, apr_pool_t *pool) { apr_status_t status; - apr_finfo_t finfo; + apr_finfo_t finfo = {0}; status = apr_stat(&finfo, fname, APR_FINFO_ATIME, pool); if (status) { diff --git a/libs/apr/memory/unix/apr_pools.c b/libs/apr/memory/unix/apr_pools.c index c412a4aef05..7e37a5d0a07 100644 --- a/libs/apr/memory/unix/apr_pools.c +++ b/libs/apr/memory/unix/apr_pools.c @@ -821,8 +821,17 @@ APR_DECLARE(apr_status_t) apr_pool_create_ex(apr_pool_t **newpool, if (!abort_fn && parent) abort_fn = parent->abort_fn; - if (allocator == NULL) + if (allocator == NULL) { + if (!parent) { + /* There is no way to continue without an allocator when no parent */ + if (abort_fn) + abort_fn(APR_EINVAL); + + return APR_EINVAL; + } + allocator = parent->allocator; + } if ((node = allocator_alloc(allocator, MIN_ALLOC - APR_MEMNODE_T_SIZE)) == NULL) { diff --git a/libs/apr/misc/unix/otherchild.c b/libs/apr/misc/unix/otherchild.c index 427a57e7e47..c97cfdd6446 100644 --- a/libs/apr/misc/unix/otherchild.c +++ b/libs/apr/misc/unix/otherchild.c @@ -96,7 +96,7 @@ APR_DECLARE(void) apr_proc_other_child_unregister(void *data) } /* segfault if this function called with invalid parm */ - apr_pool_cleanup_kill(cur->p, cur->data, other_child_cleanup); + if (cur) apr_pool_cleanup_kill(cur->p, cur->data, other_child_cleanup); other_child_cleanup(data); } diff --git a/libs/apr/network_io/unix/sendrecv.c b/libs/apr/network_io/unix/sendrecv.c index 9ebc3cf3b4b..ca7230accca 100644 --- a/libs/apr/network_io/unix/sendrecv.c +++ b/libs/apr/network_io/unix/sendrecv.c @@ -27,6 +27,8 @@ #include #endif +#include /* assert() */ + apr_status_t apr_socket_send(apr_socket_t *sock, const char *buf, apr_size_t *len) { @@ -287,6 +289,7 @@ apr_status_t apr_socket_sendfile(apr_socket_t *sock, apr_file_t *file, /* Ignore flags for now. 
*/ flags = 0; + assert(flags==0); if (hdtr->numheaders > 0) { apr_size_t hdrbytes; diff --git a/libs/apr/network_io/unix/sockets.c b/libs/apr/network_io/unix/sockets.c index 4cff3bbca19..72dab7b89cf 100644 --- a/libs/apr/network_io/unix/sockets.c +++ b/libs/apr/network_io/unix/sockets.c @@ -32,7 +32,11 @@ static apr_status_t socket_cleanup(void *sock) { apr_socket_t *thesocket = sock; - if (thesocket && thesocket->socketdes == -1) { + if (!thesocket) { + return APR_ENOTSOCK; + } + + if (thesocket->socketdes == -1) { return APR_SUCCESS; } diff --git a/libs/apr/random/unix/sha2.c b/libs/apr/random/unix/sha2.c index 169afd5a78f..b8bd241d85f 100644 --- a/libs/apr/random/unix/sha2.c +++ b/libs/apr/random/unix/sha2.c @@ -444,6 +444,16 @@ void apr__SHA256_Transform(SHA256_CTX* context, const sha2_word32* data) { /* Clean up */ a = b = c = d = e = f = g = h = T1 = T2 = 0; + assert(a==0); + assert(b==0); + assert(c==0); + assert(d==0); + assert(e==0); + assert(f==0); + assert(g==0); + assert(h==0); + assert(T1==0); + assert(T2==0); } #endif /* SHA2_UNROLL_TRANSFORM */ @@ -478,6 +488,8 @@ void apr__SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) context->bitcount += len << 3; /* Clean up: */ usedspace = freespace = 0; + assert(usedspace==0); + assert(freespace==0); return; } } @@ -495,6 +507,8 @@ void apr__SHA256_Update(SHA256_CTX* context, const sha2_byte *data, size_t len) } /* Clean up: */ usedspace = freespace = 0; + assert(usedspace==0); + assert(freespace==0); } void apr__SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { @@ -559,6 +573,7 @@ void apr__SHA256_Final(sha2_byte digest[], SHA256_CTX* context) { /* Clean up state data: */ MEMSET_BZERO(context, sizeof(*context)); usedspace = 0; + assert(usedspace==0); } char *apr__SHA256_End(SHA256_CTX* context, char buffer[]) { @@ -768,6 +783,16 @@ void apr__SHA512_Transform(SHA512_CTX* context, const sha2_word64* data) { /* Clean up */ a = b = c = d = e = f = g = h = T1 = T2 = 0; + assert(a==0); + assert(b==0); + assert(c==0); + assert(d==0); + assert(e==0); + assert(f==0); + assert(g==0); + assert(h==0); + assert(T1==0); + assert(T2==0); } #endif /* SHA2_UNROLL_TRANSFORM */ @@ -802,6 +827,8 @@ void apr__SHA512_Update(SHA512_CTX* context, const sha2_byte *data, size_t len) ADDINC128(context->bitcount, len << 3); /* Clean up: */ usedspace = freespace = 0; + assert(usedspace==0); + assert(freespace==0); return; } } @@ -819,6 +846,8 @@ void apr__SHA512_Update(SHA512_CTX* context, const sha2_byte *data, size_t len) } /* Clean up: */ usedspace = freespace = 0; + assert(usedspace==0); + assert(freespace==0); } void apr__SHA512_Last(SHA512_CTX* context) { diff --git a/libs/apr/strings/apr_snprintf.c b/libs/apr/strings/apr_snprintf.c index 4f59f92c764..611f2d7972b 100644 --- a/libs/apr/strings/apr_snprintf.c +++ b/libs/apr/strings/apr_snprintf.c @@ -110,7 +110,6 @@ static char *apr_cvt(double arg, int ndigits, int *decpt, int *sign, arg = -arg; } arg = modf(arg, &fi); - p1 = &buf[NDIG]; /* * Do integer part */ diff --git a/libs/apr/strings/apr_strings.c b/libs/apr/strings/apr_strings.c index ec687913c53..771d351f19d 100644 --- a/libs/apr/strings/apr_strings.c +++ b/libs/apr/strings/apr_strings.c @@ -124,7 +124,7 @@ APR_DECLARE(void *) apr_pmemdup(apr_pool_t *a, const void *m, apr_size_t n) APR_DECLARE_NONSTD(char *) apr_pstrcat(apr_pool_t *a, ...) 
{
     char *cp, *argp, *res;
-    apr_size_t saved_lengths[MAX_SAVED_LENGTHS];
+    apr_size_t saved_lengths[MAX_SAVED_LENGTHS] = { 0 };
     int nargs = 0;
 
     /* Pass one --- find length of required string */
diff --git a/libs/apr/tables/apr_hash.c b/libs/apr/tables/apr_hash.c
index 4e3723e19f1..5cac297af96 100644
--- a/libs/apr/tables/apr_hash.c
+++ b/libs/apr/tables/apr_hash.c
@@ -459,7 +459,7 @@ APR_DECLARE(apr_hash_t *) apr_hash_merge(apr_pool_t *p,
                 break;
             }
         }
-        if (!ent) {
+        if (new_vals && !ent) {
             new_vals[j].klen = iter->klen;
             new_vals[j].key = iter->key;
             new_vals[j].val = iter->val;
diff --git a/libs/esl/src/cJSON.c b/libs/esl/src/cJSON.c
index 3206091a5b1..1f092ea8f94 100644
--- a/libs/esl/src/cJSON.c
+++ b/libs/esl/src/cJSON.c
@@ -1,16 +1,13 @@
 /*
-  Copyright (c) 2009 Dave Gamble
-
+  Copyright (c) 2009-2017 Dave Gamble and cJSON contributors
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:
-
   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.
-
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,29 +20,72 @@
 /* cJSON */
 /* JSON parser in C. */
 
+/* disable warnings about old C89 functions in MSVC */
+#if !defined(_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER)
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#ifdef __GNUC__
+#pragma GCC visibility push(default)
+#endif
+#if defined(_MSC_VER)
+#pragma warning (push)
+/* disable warning about single line comments in system headers */
+#pragma warning (disable : 4001)
+#endif
+
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
-#include <float.h>
 #include <limits.h>
 #include <ctype.h>
+
+#ifdef ENABLE_LOCALES
+#include <locale.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
 #include "esl_cJSON.h"
 
 /* define our own boolean type */
-typedef int cjbool;
-#define true ((cjbool)1)
-#define false ((cjbool)0)
+#ifdef true
+#undef true
+#endif
+#define true ((cJSON_bool)1)
+
+#ifdef false
+#undef false
+#endif
+#define false ((cJSON_bool)0)
 
-static const unsigned char *global_ep = NULL;
+typedef struct {
+    const unsigned char *json;
+    size_t position;
+} error;
+static error global_error = { NULL, 0 };
 
 CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void)
 {
-    return (const char*) global_ep;
+    return (const char*) (global_error.json + global_error.position);
+}
+
+CJSON_PUBLIC(char *) cJSON_GetStringValue(cJSON *item) {
+    if (!cJSON_IsString(item)) {
+        return NULL;
+    }
+
+    return item->valuestring;
 }
 
 /* This is a safeguard to prevent copy-pasters from using incompatible C and header files */
-#if (CJSON_VERSION_MAJOR != 1) || (CJSON_VERSION_MINOR != 3) || (CJSON_VERSION_PATCH != 0)
+#if (CJSON_VERSION_MAJOR != 1) || (CJSON_VERSION_MINOR != 7) || (CJSON_VERSION_PATCH != 12)
 #error cJSON.h and cJSON.c have different versions. Make sure that both have the same.
 #endif
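The cJSON.c hunks here and below replace the vendored cJSON 1.3.0 with 1.7.12: parsing moves from raw pointer walking to a bounds-checked parse_buffer, and the parse-error location is stored as a position into the original input rather than a bare global pointer. A short usage sketch of the two public entry points touched above, cJSON_GetErrorPtr() and the newly added cJSON_GetStringValue(); this is a hypothetical stand-alone program, the include path is an assumption, and inside this tree the same API is exposed through esl_cJSON.h:

    /* cjson_demo.c - hypothetical example against the cJSON 1.7.x API */
    #include <stdio.h>
    #include <cJSON.h>

    int main(void)
    {
        cJSON *root = cJSON_Parse("{\"name\": \"freeswitch\"}");

        if (root == NULL) {
            /* 1.7.x computes this from the stored error position. */
            fprintf(stderr, "parse error near: %s\n", cJSON_GetErrorPtr());
            return 1;
        }

        /* cJSON_GetStringValue() returns NULL for missing or non-string
         * items, so the result still has to be checked before printing. */
        const char *name = cJSON_GetStringValue(cJSON_GetObjectItem(root, "name"));
        printf("name: %s\n", name ? name : "(not a string)");

        cJSON_Delete(root);
        return 0;
    }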
@@ -57,49 +97,79 @@ CJSON_PUBLIC(const char*) cJSON_Version(void)
     return version;
 }
 
-/* case insensitive strcmp */
-static int cJSON_strcasecmp(const unsigned char *s1, const unsigned char *s2)
+/* Case insensitive string comparison, doesn't consider two NULL pointers equal though */
+static int case_insensitive_strcmp(const unsigned char *string1, const unsigned char *string2)
 {
-    if (!s1)
+    if ((string1 == NULL) || (string2 == NULL))
     {
-        return (s1 == s2) ? 0 : 1; /* both NULL? */
+        return 1;
     }
-    if (!s2)
+
+    if (string1 == string2)
     {
-        return 1;
+        return 0;
     }
-    for(; tolower(*s1) == tolower(*s2); ++s1, ++s2)
+
+    for(; tolower(*string1) == tolower(*string2); (void)string1++, string2++)
     {
-        if (*s1 == '\0')
+        if (*string1 == '\0')
         {
             return 0;
         }
     }
 
-    return tolower(*s1) - tolower(*s2);
+    return tolower(*string1) - tolower(*string2);
 }
 
 typedef struct internal_hooks
 {
-    void *(*allocate)(size_t size);
-    void (*deallocate)(void *pointer);
-    void *(*reallocate)(void *pointer, size_t size);
+    void *(CJSON_CDECL *allocate)(size_t size);
+    void (CJSON_CDECL *deallocate)(void *pointer);
+    void *(CJSON_CDECL *reallocate)(void *pointer, size_t size);
 } internal_hooks;
 
-static internal_hooks global_hooks = { malloc, free, realloc };
+#if defined(_MSC_VER)
+/* work around MSVC error C2322: '...' address of dllimport '...' is not static */
+static void * CJSON_CDECL internal_malloc(size_t size)
+{
+    return malloc(size);
+}
+static void CJSON_CDECL internal_free(void *pointer)
+{
+    free(pointer);
+}
+static void * CJSON_CDECL internal_realloc(void *pointer, size_t size)
+{
+    return realloc(pointer, size);
+}
+#else
+#define internal_malloc malloc
+#define internal_free free
+#define internal_realloc realloc
+#endif
+
+/* strlen of character literals resolved at compile time */
+#define static_strlen(string_literal) (sizeof(string_literal) - sizeof(""))
 
-static unsigned char* cJSON_strdup(const unsigned char* str, const internal_hooks * const hooks)
+static internal_hooks global_hooks = { internal_malloc, internal_free, internal_realloc };
+
+static unsigned char* cJSON_strdup(const unsigned char* string, const internal_hooks * const hooks)
 {
-    size_t len = 0;
+    size_t length = 0;
     unsigned char *copy = NULL;
-    const unsigned char *s = str ? str : (unsigned char *)"";
 
-    len = strlen((const char*)s) + 1;
-    if (!(copy = (unsigned char*)hooks->allocate(len)))
+    if (string == NULL)
+    {
+        return NULL;
+    }
+
+    length = strlen((const char*)string) + sizeof("");
+    copy = (unsigned char*)hooks->allocate(length);
+    if (copy == NULL)
     {
         return NULL;
     }
-    memcpy(copy, s, len);
+    memcpy(copy, string, length);
 
     return copy;
 }
@@ -148,44 +218,110 @@ static cJSON *cJSON_New_Item(const internal_hooks * const hooks)
 }
 
 /* Delete a cJSON structure. 
*/ -CJSON_PUBLIC(void) cJSON_Delete(cJSON *c) +CJSON_PUBLIC(void) cJSON_Delete(cJSON *item) { cJSON *next = NULL; - while (c) + while (item != NULL) { - next = c->next; - if (!(c->type & cJSON_IsReference) && c->child) + next = item->next; + if (!(item->type & cJSON_IsReference) && (item->child != NULL)) { - cJSON_Delete(c->child); + cJSON_Delete(item->child); } - if (!(c->type & cJSON_IsReference) && c->valuestring) + if (!(item->type & cJSON_IsReference) && (item->valuestring != NULL)) { - global_hooks.deallocate(c->valuestring); + global_hooks.deallocate(item->valuestring); } - if (!(c->type & cJSON_StringIsConst) && c->string) + if (!(item->type & cJSON_StringIsConst) && (item->string != NULL)) { - global_hooks.deallocate(c->string); + global_hooks.deallocate(item->string); } - global_hooks.deallocate(c); - c = next; + global_hooks.deallocate(item); + item = next; } } +/* get the decimal point character of the current locale */ +static unsigned char get_decimal_point(void) +{ +#ifdef ENABLE_LOCALES + struct lconv *lconv = localeconv(); + return (unsigned char) lconv->decimal_point[0]; +#else + return '.'; +#endif +} + +typedef struct +{ + const unsigned char *content; + size_t length; + size_t offset; + size_t depth; /* How deeply nested (in arrays/objects) is the input at the current offset. */ + internal_hooks hooks; +} parse_buffer; + +/* check if the given size is left to read in a given parse buffer (starting with 1) */ +#define can_read(buffer, size) ((buffer != NULL) && (((buffer)->offset + size) <= (buffer)->length)) +/* check if the buffer can be accessed at the given index (starting with 0) */ +#define can_access_at_index(buffer, index) ((buffer != NULL) && (((buffer)->offset + index) < (buffer)->length)) +#define cannot_access_at_index(buffer, index) (!can_access_at_index(buffer, index)) +/* get a pointer to the buffer at the position */ +#define buffer_at_offset(buffer) ((buffer)->content + (buffer)->offset) + /* Parse the input text to generate a number, and populate the result into item. */ -static const unsigned char *parse_number(cJSON * const item, const unsigned char * const input) +static cJSON_bool parse_number(cJSON * const item, parse_buffer * const input_buffer) { double number = 0; unsigned char *after_end = NULL; + unsigned char number_c_string[64]; + unsigned char decimal_point = get_decimal_point(); + size_t i = 0; - if (input == NULL) + if ((input_buffer == NULL) || (input_buffer->content == NULL)) { - return NULL; + return false; + } + + /* copy the number into a temporary buffer and replace '.' 
with the decimal point + * of the current locale (for strtod) + * This also takes care of '\0' not necessarily being available for marking the end of the input */ + for (i = 0; (i < (sizeof(number_c_string) - 1)) && can_access_at_index(input_buffer, i); i++) + { + switch (buffer_at_offset(input_buffer)[i]) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '+': + case '-': + case 'e': + case 'E': + number_c_string[i] = buffer_at_offset(input_buffer)[i]; + break; + + case '.': + number_c_string[i] = decimal_point; + break; + + default: + goto loop_end; + } } +loop_end: + number_c_string[i] = '\0'; - number = strtod((const char*)input, (char**)&after_end); - if (input == after_end) + number = strtod((const char*)number_c_string, (char**)&after_end); + if (number_c_string == after_end) { - return NULL; /* parse_error */ + return false; /* parse_error */ } item->valuedouble = number; @@ -195,7 +331,7 @@ static const unsigned char *parse_number(cJSON * const item, const unsigned char { item->valueint = INT_MAX; } - else if (number <= INT_MIN) + else if (number <= (double)INT_MIN) { item->valueint = INT_MIN; } @@ -206,7 +342,8 @@ static const unsigned char *parse_number(cJSON * const item, const unsigned char item->type = cJSON_Number; - return after_end; + input_buffer->offset += (size_t)(after_end - number_c_string); + return true; } /* don't ask me, but the original cJSON_SetNumberValue returns an integer or double */ @@ -216,13 +353,13 @@ CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number) { object->valueint = INT_MAX; } - else if (number <= INT_MIN) + else if (number <= (double)INT_MIN) { object->valueint = INT_MIN; } else { - object->valueint = cJSON_Number; + object->valueint = (int)number; } return object->valuedouble = number; @@ -233,11 +370,14 @@ typedef struct unsigned char *buffer; size_t length; size_t offset; - cjbool noalloc; + size_t depth; /* current nesting depth (for formatted printing) */ + cJSON_bool noalloc; + cJSON_bool format; /* is this print a formatted print */ + internal_hooks hooks; } printbuffer; /* realloc printbuffer if necessary to have at least "needed" bytes more */ -static unsigned char* ensure(printbuffer * const p, size_t needed, const internal_hooks * const hooks) +static unsigned char* ensure(printbuffer * const p, size_t needed) { unsigned char *newbuffer = NULL; size_t newsize = 0; @@ -247,13 +387,19 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna return NULL; } + if ((p->length > 0) && (p->offset >= p->length)) + { + /* make sure that offset is valid */ + return NULL; + } + if (needed > INT_MAX) { /* sizes bigger than INT_MAX are currently not supported */ return NULL; } - needed += p->offset; + needed += p->offset + 1; if (needed <= p->length) { return p->buffer + p->offset; @@ -264,8 +410,7 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna } /* calculate new buffer size */ - newsize = needed * 2; - if (newsize > INT_MAX) + if (needed > (INT_MAX / 2)) { /* overflow of int, use INT_MAX if possible */ if (needed <= INT_MAX) @@ -277,26 +422,41 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna return NULL; } } + else + { + newsize = needed * 2; + } - if (hooks->reallocate != NULL) + if (p->hooks.reallocate != NULL) { /* reallocate with realloc if available */ - newbuffer = (unsigned char*)hooks->reallocate(p->buffer, newsize); + newbuffer = (unsigned 
char*)p->hooks.reallocate(p->buffer, newsize); + if (newbuffer == NULL) + { + p->hooks.deallocate(p->buffer); + p->length = 0; + p->buffer = NULL; + + return NULL; + } } else { /* otherwise reallocate manually */ - newbuffer = (unsigned char*)hooks->allocate(newsize); + newbuffer = (unsigned char*)p->hooks.allocate(newsize); if (!newbuffer) { - hooks->deallocate(p->buffer); + p->hooks.deallocate(p->buffer); p->length = 0; p->buffer = NULL; return NULL; } - memcpy(newbuffer, p->buffer, p->offset + 1); - hooks->deallocate(p->buffer); + if (newbuffer) + { + memcpy(newbuffer, p->buffer, p->offset + 1); + } + p->hooks.deallocate(p->buffer); } p->length = newsize; p->buffer = newbuffer; @@ -318,54 +478,69 @@ static void update_offset(printbuffer * const buffer) } /* Render the number nicely from the given item into a string. */ -static unsigned char *print_number(const cJSON * const item, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_number(const cJSON * const item, printbuffer * const output_buffer) { unsigned char *output_pointer = NULL; double d = item->valuedouble; + int length = 0; + size_t i = 0; + unsigned char number_buffer[26]; /* temporary buffer to print the number into */ + unsigned char decimal_point = get_decimal_point(); + double test; if (output_buffer == NULL) { - return NULL; + return false; } - /* value is an int */ - if ((fabs(((double)item->valueint) - d) <= DBL_EPSILON) && (d <= INT_MAX) && (d >= INT_MIN)) + /* This checks for NaN and Infinity */ + if ((d * 0) != 0) + { + length = sprintf((char*)number_buffer, "null"); + } + else { - /* 2^64+1 can be represented in 21 chars. */ - output_pointer = ensure(output_buffer, 21, hooks); - if (output_pointer != NULL) + /* Try 15 decimal places of precision to avoid nonsignificant nonzero digits */ + length = sprintf((char*)number_buffer, "%1.15g", d); + + /* Check whether the original double can be recovered */ + if ((sscanf((char*)number_buffer, "%lg", &test) != 1) || ((double)test != d)) { - sprintf((char*)output_pointer, "%d", item->valueint); + /* If not, print with 17 decimal places of precision */ + length = sprintf((char*)number_buffer, "%1.17g", d); } } - /* value is a floating point number */ - else + + /* sprintf failed or buffer overrun occurred */ + if ((length < 0) || (length > (int)(sizeof(number_buffer) - 1))) + { + return false; + } + + /* reserve appropriate space in the output */ + output_pointer = ensure(output_buffer, (size_t)length + sizeof("")); + if (output_pointer == NULL) + { + return false; + } + + /* copy the printed number to the output and replace locale + * dependent decimal point with '.' */ + for (i = 0; i < ((size_t)length); i++) { - /* This is a nice tradeoff. 
*/ - output_pointer = ensure(output_buffer, 64, hooks); - if (output_pointer != NULL) + if (number_buffer[i] == decimal_point) { - /* This checks for NaN and Infinity */ - if ((d * 0) != 0) - { - sprintf((char*)output_pointer, "null"); - } - else if ((fabs(floor(d) - d) <= DBL_EPSILON) && (fabs(d) < 1.0e60)) - { - sprintf((char*)output_pointer, "%.0f", d); - } - else if ((fabs(d) < 1.0e-6) || (fabs(d) > 1.0e9)) - { - sprintf((char*)output_pointer, "%e", d); - } - else - { - sprintf((char*)output_pointer, "%f", d); - } + output_pointer[i] = '.'; + continue; } + + output_pointer[i] = number_buffer[i]; } + output_pointer[i] = '\0'; + + output_buffer->offset += (size_t)length; - return output_pointer; + return true; } /* parse 4 digit hexadecimal number */ @@ -406,28 +581,19 @@ static unsigned parse_hex4(const unsigned char * const input) /* converts a UTF-16 literal to UTF-8 * A literal can be one or two sequences of the form \uXXXX */ -static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer, const unsigned char **error_pointer) +static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer) { - /* first bytes of UTF8 encoding for a given length in bytes */ - static const unsigned char firstByteMark[5] = - { - 0x00, /* should never happen */ - 0x00, /* 0xxxxxxx */ - 0xC0, /* 110xxxxx */ - 0xE0, /* 1110xxxx */ - 0xF0 /* 11110xxx */ - }; - long unsigned int codepoint = 0; unsigned int first_code = 0; const unsigned char *first_sequence = input_pointer; unsigned char utf8_length = 0; + unsigned char utf8_position = 0; unsigned char sequence_length = 0; + unsigned char first_byte_mark = 0; if ((input_end - first_sequence) < 6) { /* input ends unexpectedly */ - *error_pointer = first_sequence; goto fail; } @@ -435,9 +601,8 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi first_code = parse_hex4(first_sequence + 2); /* check that the code is valid */ - if (((first_code >= 0xDC00) && (first_code <= 0xDFFF)) || (first_code == 0)) + if (((first_code >= 0xDC00) && (first_code <= 0xDFFF))) { - *error_pointer = first_sequence; goto fail; } @@ -451,14 +616,12 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi if ((input_end - second_sequence) < 6) { /* input ends unexpectedly */ - *error_pointer = first_sequence; goto fail; } if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u')) { /* missing second half of the surrogate pair */ - *error_pointer = first_sequence; goto fail; } @@ -468,7 +631,6 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi if ((second_code < 0xDC00) || (second_code > 0xDFFF)) { /* invalid second half of the surrogate pair */ - *error_pointer = first_sequence; goto fail; } @@ -494,47 +656,43 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi { /* two bytes, encoding 110xxxxx 10xxxxxx */ utf8_length = 2; + first_byte_mark = 0xC0; /* 11000000 */ } else if (codepoint < 0x10000) { /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */ utf8_length = 3; + first_byte_mark = 0xE0; /* 11100000 */ } else if (codepoint <= 0x10FFFF) { /* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */ utf8_length = 4; + first_byte_mark = 0xF0; /* 11110000 */ } else { /* invalid unicode codepoint */ - *error_pointer = first_sequence; goto fail; } /* encode as utf8 */ - 
switch (utf8_length) - { - case 4: - /* 10xxxxxx */ - (*output_pointer)[3] = (unsigned char)((codepoint | 0x80) & 0xBF); - codepoint >>= 6; - case 3: - /* 10xxxxxx */ - (*output_pointer)[2] = (unsigned char)((codepoint | 0x80) & 0xBF); - codepoint >>= 6; - case 2: - (*output_pointer)[1] = (unsigned char)((codepoint | 0x80) & 0xBF); - codepoint >>= 6; - case 1: - /* depending on the length in bytes this determines the - encoding of the first UTF8 byte */ - (*output_pointer)[0] = (unsigned char)((codepoint | firstByteMark[utf8_length]) & 0xFF); - break; - default: - *error_pointer = first_sequence; - goto fail; + for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--) + { + /* 10xxxxxx */ + (*output_pointer)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF); + codepoint >>= 6; + } + /* encode first byte */ + if (utf8_length > 1) + { + (*output_pointer)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF); } + else + { + (*output_pointer)[0] = (unsigned char)(codepoint & 0x7F); + } + *output_pointer += utf8_length; return sequence_length; @@ -544,17 +702,16 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi } /* Parse the input text into an unescaped cinput, and populate item. */ -static const unsigned char *parse_string(cJSON * const item, const unsigned char * const input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_buffer) { - const unsigned char *input_pointer = input + 1; - const unsigned char *input_end = input + 1; + const unsigned char *input_pointer = buffer_at_offset(input_buffer) + 1; + const unsigned char *input_end = buffer_at_offset(input_buffer) + 1; unsigned char *output_pointer = NULL; unsigned char *output = NULL; /* not a string */ - if (*input != '\"') + if (buffer_at_offset(input_buffer)[0] != '\"') { - *error_pointer = input; goto fail; } @@ -562,12 +719,12 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char /* calculate approximate size of the output (overestimate) */ size_t allocation_length = 0; size_t skipped_bytes = 0; - while ((*input_end != '\"') && (*input_end != '\0')) + while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"')) { /* is escape sequence */ if (input_end[0] == '\\') { - if (input_end[1] == '\0') + if ((size_t)(input_end + 1 - input_buffer->content) >= input_buffer->length) { /* prevent buffer overflow when last input character is a backslash */ goto fail; @@ -577,14 +734,14 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char } input_end++; } - if (*input_end == '\0') + if (((size_t)(input_end - input_buffer->content) >= input_buffer->length) || (*input_end != '\"')) { goto fail; /* string ended unexpectedly */ } /* This is at most how much we need for the output */ - allocation_length = (size_t) (input_end - input) - skipped_bytes; - output = (unsigned char*)hooks->allocate(allocation_length + sizeof('\0')); + allocation_length = (size_t) (input_end - buffer_at_offset(input_buffer)) - skipped_bytes; + output = (unsigned char*)input_buffer->hooks.allocate(allocation_length + sizeof("")); if (output == NULL) { goto fail; /* allocation failure */ @@ -603,6 +760,11 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char else { unsigned char sequence_length = 2; + if ((input_end - input_pointer) < 1) + { + goto fail; + } + 
switch (input_pointer[1]) { case 'b': @@ -628,7 +790,7 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char /* UTF-16 literal */ case 'u': - sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer, error_pointer); + sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer); if (sequence_length == 0) { /* failed to convert UTF16-literal to UTF-8 */ @@ -637,7 +799,6 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char break; default: - *error_pointer = input_pointer; goto fail; } input_pointer += sequence_length; @@ -650,19 +811,27 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char item->type = cJSON_String; item->valuestring = (char*)output; - return input_end + 1; + input_buffer->offset = (size_t) (input_end - input_buffer->content); + input_buffer->offset++; + + return true; fail: if (output != NULL) { - hooks->deallocate(output); + input_buffer->hooks.deallocate(output); } - return NULL; + if (input_pointer != NULL) + { + input_buffer->offset = (size_t)(input_pointer - input_buffer->content); + } + + return false; } /* Render the cstring provided to an escaped version that can be printed. */ -static unsigned char *print_string_ptr(const unsigned char * const input, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_string_ptr(const unsigned char * const input, printbuffer * const output_buffer) { const unsigned char *input_pointer = NULL; unsigned char *output = NULL; @@ -673,42 +842,52 @@ static unsigned char *print_string_ptr(const unsigned char * const input, printb if (output_buffer == NULL) { - return NULL; + return false; } /* empty string */ if (input == NULL) { - output = ensure(output_buffer, sizeof("\"\""), hooks); + output = ensure(output_buffer, sizeof("\"\"")); if (output == NULL) { - return NULL; + return false; } strcpy((char*)output, "\"\""); - return output; + return true; } /* set "flag" to 1 if something needs to be escaped */ for (input_pointer = input; *input_pointer; input_pointer++) { - if (strchr("\"\\\b\f\n\r\t", *input_pointer)) - { - /* one character escape sequence */ - escape_characters++; - } - else if (*input_pointer < 32) - { - /* UTF-16 escape sequence uXXXX */ - escape_characters += 5; + switch (*input_pointer) + { + case '\"': + case '\\': + case '\b': + case '\f': + case '\n': + case '\r': + case '\t': + /* one character escape sequence */ + escape_characters++; + break; + default: + if (*input_pointer < 32) + { + /* UTF-16 escape sequence uXXXX */ + escape_characters += 5; + } + break; } } output_length = (size_t)(input_pointer - input) + escape_characters; - output = ensure(output_buffer, output_length + sizeof("\"\""), hooks); + output = ensure(output_buffer, output_length + sizeof("\"\"")); if (output == NULL) { - return NULL; + return false; } /* no characters have to be escaped */ @@ -719,13 +898,13 @@ static unsigned char *print_string_ptr(const unsigned char * const input, printb output[output_length + 1] = '\"'; output[output_length + 2] = '\0'; - return output; + return true; } output[0] = '\"'; output_pointer = output + 1; /* copy the string */ - for (input_pointer = input; *input_pointer != '\0'; input_pointer++, output_pointer++) + for (input_pointer = input; *input_pointer != '\0'; (void)input_pointer++, output_pointer++) { if ((*input_pointer > 31) && (*input_pointer != '\"') && (*input_pointer != '\\')) { @@ -770,72 +949,138 @@ static unsigned char 
*print_string_ptr(const unsigned char * const input, printb output[output_length + 1] = '\"'; output[output_length + 2] = '\0'; - return output; + return true; } /* Invoke print_string_ptr (which is useful) on an item. */ -static unsigned char *print_string(const cJSON * const item, printbuffer * const p, const internal_hooks * const hooks) +static cJSON_bool print_string(const cJSON * const item, printbuffer * const p) { - return print_string_ptr((unsigned char*)item->valuestring, p, hooks); + return print_string_ptr((unsigned char*)item->valuestring, p); } /* Predeclare these prototypes. */ -static const unsigned char *parse_value(cJSON * const item, const unsigned char * const input, const unsigned char ** const ep, const internal_hooks * const hooks); -static unsigned char *print_value(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks); -static const unsigned char *parse_array(cJSON * const item, const unsigned char *input, const unsigned char ** const ep, const internal_hooks * const hooks); -static unsigned char *print_array(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks); -static const unsigned char *parse_object(cJSON * const item, const unsigned char *input, const unsigned char ** const ep, const internal_hooks * const hooks); -static unsigned char *print_object(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks); +static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_value(const cJSON * const item, printbuffer * const output_buffer); +static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_array(const cJSON * const item, printbuffer * const output_buffer); +static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_object(const cJSON * const item, printbuffer * const output_buffer); /* Utility to jump whitespace and cr/lf */ -static const unsigned char *skip_whitespace(const unsigned char *in) +static parse_buffer *buffer_skip_whitespace(parse_buffer * const buffer) { - while (in && *in && (*in <= 32)) + if ((buffer == NULL) || (buffer->content == NULL)) { - in++; + return NULL; + } + + while (can_access_at_index(buffer, 0) && (buffer_at_offset(buffer)[0] <= 32)) + { + buffer->offset++; } - return in; + if (buffer->offset == buffer->length) + { + buffer->offset--; + } + + return buffer; } -/* Parse an object - create a new root, and populate. */ -CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cjbool require_null_terminated) +/* skip the UTF-8 BOM (byte order mark) if it is at the beginning of a buffer */ +static parse_buffer *skip_utf8_bom(parse_buffer * const buffer) { - const unsigned char *end = NULL; - /* use global error pointer if no specific one was given */ - const unsigned char **ep = return_parse_end ? 
(const unsigned char**)return_parse_end : &global_ep; - cJSON *c = cJSON_New_Item(&global_hooks); - *ep = NULL; - if (!c) /* memory fail */ + if ((buffer == NULL) || (buffer->content == NULL) || (buffer->offset != 0)) { return NULL; } - end = parse_value(c, skip_whitespace((const unsigned char*)value), ep, &global_hooks); - if (!end) + if (can_access_at_index(buffer, 4) && (strncmp((const char*)buffer_at_offset(buffer), "\xEF\xBB\xBF", 3) == 0)) + { + buffer->offset += 3; + } + + return buffer; +} + +/* Parse an object - create a new root, and populate. */ +CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated) +{ + parse_buffer buffer = { 0, 0, 0, 0, { 0, 0, 0 } }; + cJSON *item = NULL; + + /* reset error position */ + global_error.json = NULL; + global_error.position = 0; + + if (value == NULL) + { + goto fail; + } + + buffer.content = (const unsigned char*)value; + buffer.length = strlen((const char*)value) + sizeof(""); + buffer.offset = 0; + buffer.hooks = global_hooks; + + item = cJSON_New_Item(&global_hooks); + if (item == NULL) /* memory fail */ + { + goto fail; + } + + if (!parse_value(item, buffer_skip_whitespace(skip_utf8_bom(&buffer)))) { /* parse failure. ep is set. */ - cJSON_Delete(c); - return NULL; + goto fail; } /* if we require null-terminated JSON without appended garbage, skip and then check for a null terminator */ if (require_null_terminated) { - end = skip_whitespace(end); - if (*end) + buffer_skip_whitespace(&buffer); + if ((buffer.offset >= buffer.length) || buffer_at_offset(&buffer)[0] != '\0') { - cJSON_Delete(c); - *ep = end; - return NULL; + goto fail; } } if (return_parse_end) { - *return_parse_end = (const char*)end; + *return_parse_end = (const char*)buffer_at_offset(&buffer); + } + + return item; + +fail: + if (item != NULL) + { + cJSON_Delete(item); + } + + if (value != NULL) + { + error local_error; + local_error.json = (const unsigned char*)value; + local_error.position = 0; + + if (buffer.offset < buffer.length) + { + local_error.position = buffer.offset; + } + else if (buffer.length > 0) + { + local_error.position = buffer.length - 1; + } + + if (return_parse_end != NULL) + { + *return_parse_end = (const char*)local_error.json + local_error.position; + } + + global_error = local_error; } - return c; + return NULL; } /* Default options for cJSON_Parse */ @@ -844,40 +1089,55 @@ CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value) return cJSON_ParseWithOpts(value, 0, 0); } -#define min(a, b) ((a < b) ? a : b) +#define cjson_min(a, b) ((a < b) ? 
a : b) -static unsigned char *print(const cJSON * const item, cjbool format, const internal_hooks * const hooks) +static unsigned char *print(const cJSON * const item, cJSON_bool format, const internal_hooks * const hooks) { + static const size_t default_buffer_size = 256; printbuffer buffer[1]; unsigned char *printed = NULL; memset(buffer, 0, sizeof(buffer)); /* create buffer */ - buffer->buffer = (unsigned char*) hooks->allocate(256); + buffer->buffer = (unsigned char*) hooks->allocate(default_buffer_size); + buffer->length = default_buffer_size; + buffer->format = format; + buffer->hooks = *hooks; if (buffer->buffer == NULL) { goto fail; } /* print the value */ - if (print_value(item, 0, format, buffer, hooks) == NULL) + if (!print_value(item, buffer)) { goto fail; } update_offset(buffer); - /* copy the buffer over to a new one */ - printed = (unsigned char*) hooks->allocate(buffer->offset + 1); - if (printed == NULL) + /* check if reallocate is available */ + if (hooks->reallocate != NULL) { - goto fail; + printed = (unsigned char*) hooks->reallocate(buffer->buffer, buffer->offset + 1); + if (printed == NULL) { + goto fail; + } + buffer->buffer = NULL; } - strncpy((char*)printed, (char*)buffer->buffer, min(buffer->length, buffer->offset + 1)); - printed[buffer->offset] = '\0'; /* just to be sure */ + else /* otherwise copy the JSON over to a new buffer */ + { + printed = (unsigned char*) hooks->allocate(buffer->offset + 1); + if (printed == NULL) + { + goto fail; + } + memcpy(printed, buffer->buffer, cjson_min(buffer->length, buffer->offset + 1)); + printed[buffer->offset] = '\0'; /* just to be sure */ - /* free the buffer */ - hooks->deallocate(buffer->buffer); + /* free the buffer */ + hooks->deallocate(buffer->buffer); + } return printed; @@ -887,6 +1147,11 @@ static unsigned char *print(const cJSON * const item, cjbool format, const inter hooks->deallocate(buffer->buffer); } + if (printed != NULL) + { + hooks->deallocate(printed); + } + return NULL; } @@ -901,9 +1166,9 @@ CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item) return (char*)print(item, false, &global_hooks); } -CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cjbool fmt) +CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt) { - printbuffer p; + printbuffer p = { 0, 0, 0, 0, 0, 0, { 0, 0, 0 } }; if (prebuffer < 0) { @@ -919,15 +1184,23 @@ CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cjboo p.length = (size_t)prebuffer; p.offset = 0; p.noalloc = false; + p.format = fmt; + p.hooks = global_hooks; + + if (!print_value(item, &p)) + { + global_hooks.deallocate(p.buffer); + return NULL; + } - return (char*)print_value(item, 0, fmt, &p, &global_hooks); + return (char*)p.buffer; } -CJSON_PUBLIC(int) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const cjbool fmt) +CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const cJSON_bool fmt) { - printbuffer p; + printbuffer p = { 0, 0, 0, 0, 0, 0, { 0, 0, 0 } }; - if (len < 0) + if ((len < 0) || (buf == NULL)) { return false; } @@ -936,164 +1209,181 @@ CJSON_PUBLIC(int) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, p.length = (size_t)len; p.offset = 0; p.noalloc = true; - return print_value(item, 0, fmt, &p, &global_hooks) != NULL; + p.format = fmt; + p.hooks = global_hooks; + + return print_value(item, &p); } /* Parser core - when encountering text, process appropriately. 
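   The buffer-based parser dispatches on the first readable byte of the
   parse_buffer instead of walking a raw pointer: the literals "null",
   "false" and "true" are matched with a length-checked strncmp, a double
   quote goes to parse_string, '-' or a digit to parse_number, '[' to
   parse_array and '{' to parse_object. A minimal sketch of driving it
   through the public entry point (illustrative only, not part of the patch):

       cJSON *item = cJSON_Parse("{\"a\": [1, 2]}");
       if (item == NULL)
       {
           printf("error before: %s\n", cJSON_GetErrorPtr());
       }
       cJSON_Delete(item);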
*/ -static const unsigned char *parse_value(cJSON * const item, const unsigned char * const input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer) { - if (input == NULL) + if ((input_buffer == NULL) || (input_buffer->content == NULL)) { - return NULL; /* no input */ + return false; /* no input */ } /* parse the different types of values */ /* null */ - if (!strncmp((const char*)input, "null", 4)) + if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "null", 4) == 0)) { item->type = cJSON_NULL; - return input + 4; + input_buffer->offset += 4; + return true; } /* false */ - if (!strncmp((const char*)input, "false", 5)) + if (can_read(input_buffer, 5) && (strncmp((const char*)buffer_at_offset(input_buffer), "false", 5) == 0)) { item->type = cJSON_False; - return input + 5; + input_buffer->offset += 5; + return true; } /* true */ - if (!strncmp((const char*)input, "true", 4)) + if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "true", 4) == 0)) { item->type = cJSON_True; item->valueint = 1; - return input + 4; + input_buffer->offset += 4; + return true; } /* string */ - if (*input == '\"') + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '\"')) { - return parse_string(item, input, error_pointer, hooks); + return parse_string(item, input_buffer); } /* number */ - if ((*input == '-') || ((*input >= '0') && (*input <= '9'))) + if (can_access_at_index(input_buffer, 0) && ((buffer_at_offset(input_buffer)[0] == '-') || ((buffer_at_offset(input_buffer)[0] >= '0') && (buffer_at_offset(input_buffer)[0] <= '9')))) { - return parse_number(item, input); + return parse_number(item, input_buffer); } /* array */ - if (*input == '[') + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '[')) { - return parse_array(item, input, error_pointer, hooks); + return parse_array(item, input_buffer); } /* object */ - if (*input == '{') + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '{')) { - return parse_object(item, input, error_pointer, hooks); + return parse_object(item, input_buffer); } - /* failure. */ - *error_pointer = input; - return NULL; + return false; } /* Render a value to text. 
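   Each case now reserves room with ensure() and returns a cJSON_bool
   directly, so an allocation failure short-circuits the printer instead of
   being signalled by a NULL pointer at the end. The per-case shape, using
   the cJSON_NULL case below as the model (sketch only):

       output = ensure(output_buffer, 5);        reserve "null" plus NUL
       if (output == NULL)
       {
           return false;                         propagate the failure
       }
       strcpy((char*)output, "null");
       return true;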
*/ -static unsigned char *print_value(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_value(const cJSON * const item, printbuffer * const output_buffer) { unsigned char *output = NULL; if ((item == NULL) || (output_buffer == NULL)) { - return NULL; + return false; } switch ((item->type) & 0xFF) { case cJSON_NULL: - output = ensure(output_buffer, 5, hooks); - if (output != NULL) + output = ensure(output_buffer, 5); + if (output == NULL) { - strcpy((char*)output, "null"); + return false; } - break; + strcpy((char*)output, "null"); + return true; + case cJSON_False: - output = ensure(output_buffer, 6, hooks); - if (output != NULL) + output = ensure(output_buffer, 6); + if (output == NULL) { - strcpy((char*)output, "false"); + return false; } - break; + strcpy((char*)output, "false"); + return true; + case cJSON_True: - output = ensure(output_buffer, 5, hooks); - if (output != NULL) + output = ensure(output_buffer, 5); + if (output == NULL) { - strcpy((char*)output, "true"); + return false; } - break; + strcpy((char*)output, "true"); + return true; + case cJSON_Number: - output = print_number(item, output_buffer, hooks); - break; + return print_number(item, output_buffer); + case cJSON_Raw: { size_t raw_length = 0; if (item->valuestring == NULL) { - if (!output_buffer->noalloc) - { - hooks->deallocate(output_buffer->buffer); - } - output = NULL; - break; + return false; } - raw_length = strlen(item->valuestring) + sizeof('\0'); - output = ensure(output_buffer, raw_length, hooks); - if (output != NULL) + raw_length = strlen(item->valuestring) + sizeof(""); + output = ensure(output_buffer, raw_length); + if (output == NULL) { - memcpy(output, item->valuestring, raw_length); + return false; } - break; + memcpy(output, item->valuestring, raw_length); + return true; } + case cJSON_String: - output = print_string(item, output_buffer, hooks); - break; + return print_string(item, output_buffer); + case cJSON_Array: - output = print_array(item, depth, format, output_buffer, hooks); - break; + return print_array(item, output_buffer); + case cJSON_Object: - output = print_object(item, depth, format, output_buffer, hooks); - break; + return print_object(item, output_buffer); + default: - output = NULL; - break; + return false; } - - return output; } /* Build an array from input text. 
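   Together with parse_object below, this now guards against stack
   exhaustion from deeply nested input: input_buffer->depth is checked
   against CJSON_NESTING_LIMIT on entry, incremented for the recursive
   descent, and decremented again on success. The guard pattern both
   functions share (sketch):

       if (input_buffer->depth >= CJSON_NESTING_LIMIT)
       {
           return false;                         too deeply nested
       }
       input_buffer->depth++;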
*/ -static const unsigned char *parse_array(cJSON * const item, const unsigned char *input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer) { cJSON *head = NULL; /* head of the linked list */ cJSON *current_item = NULL; - if (*input != '[') + if (input_buffer->depth >= CJSON_NESTING_LIMIT) + { + return false; /* to deeply nested */ + } + input_buffer->depth++; + + if (buffer_at_offset(input_buffer)[0] != '[') { /* not an array */ - *error_pointer = input; goto fail; } - input = skip_whitespace(input + 1); - if (*input == ']') + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ']')) { /* empty array */ goto success; } + /* check if we skipped to the end of the buffer */ + if (cannot_access_at_index(input_buffer, 0)) + { + input_buffer->offset--; + goto fail; + } + /* step back to character in front of the first element */ - input--; + input_buffer->offset--; /* loop through the comma separated array elements */ do { /* allocate next item */ - cJSON *new_item = cJSON_New_Item(hooks); + cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks)); if (new_item == NULL) { goto fail; /* allocation failure */ @@ -1114,27 +1404,30 @@ static const unsigned char *parse_array(cJSON * const item, const unsigned char } /* parse next value */ - input = skip_whitespace(input + 1); - input = parse_value(current_item, input, error_pointer, hooks); - input = skip_whitespace(input); - if (input == NULL) + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_value(current_item, input_buffer)) { goto fail; /* failed to parse value */ } + buffer_skip_whitespace(input_buffer); } - while (*input == ','); + while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ',')); - if (*input != ']') + if (cannot_access_at_index(input_buffer, 0) || buffer_at_offset(input_buffer)[0] != ']') { - *error_pointer = input; goto fail; /* expected end of array */ } success: + input_buffer->depth--; + item->type = cJSON_Array; item->child = head; - return input + 1; + input_buffer->offset++; + + return true; fail: if (head != NULL) @@ -1142,52 +1435,50 @@ static const unsigned char *parse_array(cJSON * const item, const unsigned char cJSON_Delete(head); } - return NULL; + return false; } /* Render an array to text */ -static unsigned char *print_array(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_array(const cJSON * const item, printbuffer * const output_buffer) { - unsigned char *output = NULL; unsigned char *output_pointer = NULL; size_t length = 0; cJSON *current_element = item->child; - size_t output_offset = 0; if (output_buffer == NULL) { - return NULL; + return false; } /* Compose the output array. 
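   Layout produced here: '[', the elements separated by "," (or ", " when
   output_buffer->format is set), then ']' and a terminating NUL, with
   output_buffer->depth replacing the old explicit depth parameter. A usage
   sketch with the public API (illustrative only):

       int numbers[3] = {1, 2, 3};
       cJSON *array = cJSON_CreateIntArray(numbers, 3);
       char *out = cJSON_Print(array);           yields "[1, 2, 3]"
       cJSON_free(out);
       cJSON_Delete(array);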
*/ /* opening square bracket */ - output_offset = output_buffer->offset; - output_pointer = ensure(output_buffer, 1, hooks); + output_pointer = ensure(output_buffer, 1); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer = '['; output_buffer->offset++; + output_buffer->depth++; while (current_element != NULL) { - if (print_value(current_element, depth + 1, format, output_buffer, hooks) == NULL) + if (!print_value(current_element, output_buffer)) { - return NULL; + return false; } update_offset(output_buffer); if (current_element->next) { - length = format ? 2 : 1; - output_pointer = ensure(output_buffer, length + 1, hooks); + length = (size_t) (output_buffer->format ? 2 : 1); + output_pointer = ensure(output_buffer, length + 1); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = ','; - if(format) + if(output_buffer->format) { *output_pointer++ = ' '; } @@ -1197,43 +1488,56 @@ static unsigned char *print_array(const cJSON * const item, const size_t depth, current_element = current_element->next; } - output_pointer = ensure(output_buffer, 2, hooks); + output_pointer = ensure(output_buffer, 2); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = ']'; *output_pointer = '\0'; - output = output_buffer->buffer + output_offset; + output_buffer->depth--; - return output; + return true; } /* Build an object from the text. */ -static const unsigned char *parse_object(cJSON * const item, const unsigned char *input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer) { cJSON *head = NULL; /* linked list head */ cJSON *current_item = NULL; - if (*input != '{') + if (input_buffer->depth >= CJSON_NESTING_LIMIT) + { + return false; /* to deeply nested */ + } + input_buffer->depth++; + + if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '{')) { - *error_pointer = input; goto fail; /* not an object */ } - input = skip_whitespace(input + 1); - if (*input == '}') + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '}')) { goto success; /* empty object */ } + /* check if we skipped to the end of the buffer */ + if (cannot_access_at_index(input_buffer, 0)) + { + input_buffer->offset--; + goto fail; + } + /* step back to character in front of the first element */ - input--; + input_buffer->offset--; /* loop through the comma separated array elements */ do { /* allocate next item */ - cJSON *new_item = cJSON_New_Item(hooks); + cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks)); if (new_item == NULL) { goto fail; /* allocation failure */ @@ -1254,46 +1558,47 @@ static const unsigned char *parse_object(cJSON * const item, const unsigned char } /* parse the name of the child */ - input = skip_whitespace(input + 1); - input = parse_string(current_item, input, error_pointer, hooks); - input = skip_whitespace(input); - if (input == NULL) + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_string(current_item, input_buffer)) { - goto fail; /* faile to parse name */ + goto fail; /* failed to parse name */ } + buffer_skip_whitespace(input_buffer); /* swap valuestring and string, because we parsed the name */ current_item->string = current_item->valuestring; current_item->valuestring = NULL; - if (*input != ':') + if (cannot_access_at_index(input_buffer, 0) 
|| (buffer_at_offset(input_buffer)[0] != ':')) { - *error_pointer = input; goto fail; /* invalid object */ } /* parse the value */ - input = skip_whitespace(input + 1); - input = parse_value(current_item, input, error_pointer, hooks); - input = skip_whitespace(input); - if (input == NULL) + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_value(current_item, input_buffer)) { goto fail; /* failed to parse value */ } + buffer_skip_whitespace(input_buffer); } - while (*input == ','); + while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ',')); - if (*input != '}') + if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '}')) { - *error_pointer = input; goto fail; /* expected end of object */ } success: + input_buffer->depth--; + item->type = cJSON_Object; item->child = head; - return input + 1; + input_buffer->offset++; + return true; fail: if (head != NULL) @@ -1301,34 +1606,32 @@ static const unsigned char *parse_object(cJSON * const item, const unsigned char cJSON_Delete(head); } - return NULL; + return false; } /* Render an object to text. */ -static unsigned char *print_object(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_object(const cJSON * const item, printbuffer * const output_buffer) { - unsigned char *output = NULL; unsigned char *output_pointer = NULL; size_t length = 0; - size_t output_offset = 0; cJSON *current_item = item->child; if (output_buffer == NULL) { - return NULL; + return false; } /* Compose the output: */ - output_offset = output_buffer->offset; - length = format ? 2 : 1; /* fmt: {\n */ - output_pointer = ensure(output_buffer, length + 1, hooks); + length = (size_t) (output_buffer->format ? 2 : 1); /* fmt: {\n */ + output_pointer = ensure(output_buffer, length + 1); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = '{'; - if (format) + output_buffer->depth++; + if (output_buffer->format) { *output_pointer++ = '\n'; } @@ -1336,61 +1639,61 @@ static unsigned char *print_object(const cJSON * const item, const size_t depth, while (current_item) { - if (format) + if (output_buffer->format) { size_t i; - output_pointer = ensure(output_buffer, depth + 1, hooks); + output_pointer = ensure(output_buffer, output_buffer->depth); if (output_pointer == NULL) { - return NULL; + return false; } - for (i = 0; i < depth + 1; i++) + for (i = 0; i < output_buffer->depth; i++) { *output_pointer++ = '\t'; } - output_buffer->offset += depth + 1; + output_buffer->offset += output_buffer->depth; } /* print key */ - if (print_string_ptr((unsigned char*)current_item->string, output_buffer, hooks) == NULL) + if (!print_string_ptr((unsigned char*)current_item->string, output_buffer)) { - return NULL; + return false; } update_offset(output_buffer); - length = format ? 2 : 1; - output_pointer = ensure(output_buffer, length, hooks); + length = (size_t) (output_buffer->format ? 
2 : 1); + output_pointer = ensure(output_buffer, length); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = ':'; - if (format) + if (output_buffer->format) { *output_pointer++ = '\t'; } output_buffer->offset += length; /* print value */ - if (!print_value(current_item, depth + 1, format, output_buffer, hooks)) + if (!print_value(current_item, output_buffer)) { - return NULL; + return false; } update_offset(output_buffer); /* print comma if not last */ - length = (size_t) (format ? 1 : 0) + (current_item->next ? 1 : 0); - output_pointer = ensure(output_buffer, length + 1, hooks); + length = ((size_t)(output_buffer->format ? 1 : 0) + (size_t)(current_item->next ? 1 : 0)); + output_pointer = ensure(output_buffer, length + 1); if (output_pointer == NULL) { - return NULL; + return false; } if (current_item->next) { *output_pointer++ = ','; } - if (format) + if (output_buffer->format) { *output_pointer++ = '\n'; } @@ -1400,83 +1703,122 @@ static unsigned char *print_object(const cJSON * const item, const size_t depth, current_item = current_item->next; } - output_pointer = ensure(output_buffer, format ? (depth + 2) : 2, hooks); + output_pointer = ensure(output_buffer, output_buffer->format ? (output_buffer->depth + 1) : 2); if (output_pointer == NULL) { - return NULL; + return false; } - if (format) + if (output_buffer->format) { size_t i; - for (i = 0; i < (depth); i++) + for (i = 0; i < (output_buffer->depth - 1); i++) { *output_pointer++ = '\t'; } } *output_pointer++ = '}'; *output_pointer = '\0'; - output = (output_buffer->buffer) + output_offset; + output_buffer->depth--; - return output; + return true; } /* Get Array size/item / object item. */ CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array) { - cJSON *c = array->child; - size_t i = 0; - while(c) + cJSON *child = NULL; + size_t size = 0; + + if (array == NULL) { - i++; - c = c->next; + return 0; + } + + child = array->child; + + while(child != NULL) + { + size++; + child = child->next; } /* FIXME: Can overflow here. Cannot be fixed without breaking the API */ - return (int)i; + return (int)size; } -CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int item) +static cJSON* get_array_item(const cJSON *array, size_t index) { - cJSON *c = array ? array->child : NULL; - while (c && item > 0) + cJSON *current_child = NULL; + + if (array == NULL) { - item--; - c = c->next; + return NULL; } - return c; + current_child = array->child; + while ((current_child != NULL) && (index > 0)) + { + index--; + current_child = current_child->next; + } + + return current_child; } -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON *object, const char *string) +CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index) { - cJSON *c = object ? 
object->child : NULL; - while (c && cJSON_strcasecmp((unsigned char*)c->string, (const unsigned char*)string)) + if (index < 0) { - c = c->next; + return NULL; } - return c; + + return get_array_item(array, (size_t)index); } -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string) +static cJSON *get_object_item(const cJSON * const object, const char * const name, const cJSON_bool case_sensitive) { cJSON *current_element = NULL; - if ((object == NULL) || (string == NULL)) + if ((object == NULL) || (name == NULL)) { return NULL; } current_element = object->child; - while ((current_element != NULL) && (strcmp(string, current_element->string) != 0)) + if (case_sensitive) { - current_element = current_element->next; + while ((current_element != NULL) && (current_element->string != NULL) && (strcmp(name, current_element->string) != 0)) + { + current_element = current_element->next; + } + } + else + { + while ((current_element != NULL) && (case_insensitive_strcmp((const unsigned char*)name, (const unsigned char*)(current_element->string)) != 0)) + { + current_element = current_element->next; + } + } + + if ((current_element == NULL) || (current_element->string == NULL)) { + return NULL; } return current_element; } -CJSON_PUBLIC(cjbool) cJSON_HasObjectItem(const cJSON *object, const char *string) +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string) +{ + return get_object_item(object, string, false); +} + +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string) +{ + return get_object_item(object, string, true); +} + +CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string) { return cJSON_GetObjectItem(object, string) ? 1 : 0; } @@ -1491,26 +1833,32 @@ static void suffix_object(cJSON *prev, cJSON *item) /* Utility for handling references. */ static cJSON *create_reference(const cJSON *item, const internal_hooks * const hooks) { - cJSON *ref = cJSON_New_Item(hooks); - if (!ref) + cJSON *reference = NULL; + if (item == NULL) { return NULL; } - memcpy(ref, item, sizeof(cJSON)); - ref->string = NULL; - ref->type |= cJSON_IsReference; - ref->next = ref->prev = NULL; - return ref; + + reference = cJSON_New_Item(hooks); + if (reference == NULL) + { + return NULL; + } + + memcpy(reference, item, sizeof(cJSON)); + reference->string = NULL; + reference->type |= cJSON_IsReference; + reference->next = reference->prev = NULL; + return reference; } -/* Add item to array/object. */ -CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) +static cJSON_bool add_item_to_array(cJSON *array, cJSON *item) { cJSON *child = NULL; if ((item == NULL) || (array == NULL)) { - return; + return false; } child = array->child; @@ -1529,76 +1877,238 @@ CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) } suffix_object(child, item); } + + return true; +} + +/* Add item to array/object. 
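   add_item_to_array is the new internal worker that reports failure as a
   cJSON_bool; the public void wrappers below keep the existing ABI, and
   add_item_to_object funnels both the copied-key and constant-key variants
   through it. Usage sketch with the public API:

       cJSON *array = cJSON_CreateArray();
       cJSON_AddItemToArray(array, cJSON_CreateNumber(42));
       cJSON_Delete(array);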
*/ +CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) +{ + add_item_to_array(array, item); +} + +#if defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) + #pragma GCC diagnostic push +#endif +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif +/* helper function to cast away const */ +static void* cast_away_const(const void* string) +{ + return (void*)string; +} +#if defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) + #pragma GCC diagnostic pop +#endif + + +static cJSON_bool add_item_to_object(cJSON * const object, const char * const string, cJSON * const item, const internal_hooks * const hooks, const cJSON_bool constant_key) +{ + char *new_key = NULL; + int new_type = cJSON_Invalid; + + if ((object == NULL) || (string == NULL) || (item == NULL)) + { + return false; + } + + if (constant_key) + { + new_key = (char*)cast_away_const(string); + new_type = item->type | cJSON_StringIsConst; + } + else + { + new_key = (char*)cJSON_strdup((const unsigned char*)string, hooks); + if (new_key == NULL) + { + return false; + } + + new_type = item->type & ~cJSON_StringIsConst; + } + + if (!(item->type & cJSON_StringIsConst) && (item->string != NULL)) + { + hooks->deallocate(item->string); + } + + item->string = new_key; + item->type = new_type; + + return add_item_to_array(object, item); } CJSON_PUBLIC(void) cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item) { - /* call cJSON_AddItemToObjectCS for code reuse */ - cJSON_AddItemToObjectCS(object, (char*)cJSON_strdup((const unsigned char*)string, &global_hooks), item); - /* remove cJSON_StringIsConst flag */ - item->type &= ~cJSON_StringIsConst; + add_item_to_object(object, string, item, &global_hooks, false); } /* Add an item to an object with constant string as key */ CJSON_PUBLIC(void) cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item) { - if (!item) + add_item_to_object(object, string, item, &global_hooks, true); +} + +CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) +{ + if (array == NULL) { return; } - if (!(item->type & cJSON_StringIsConst) && item->string) + + add_item_to_array(array, create_reference(item, &global_hooks)); +} + +CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item) +{ + if ((object == NULL) || (string == NULL)) { - global_hooks.deallocate(item->string); + return; } -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wcast-qual" - item->string = (char*)string; -#pragma GCC diagnostic pop - item->type |= cJSON_StringIsConst; - cJSON_AddItemToArray(object, item); + + add_item_to_object(object, string, create_reference(item, &global_hooks), &global_hooks, false); } -CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) +CJSON_PUBLIC(cJSON*) cJSON_AddNullToObject(cJSON * const object, const char * const name) { - cJSON_AddItemToArray(array, create_reference(item, &global_hooks)); + cJSON *null = cJSON_CreateNull(); + if (add_item_to_object(object, name, null, &global_hooks, false)) + { + return null; + } + + cJSON_Delete(null); + return NULL; } -CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item) +CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name) { - cJSON_AddItemToObject(object, string, create_reference(item, &global_hooks)); + cJSON *true_item = 
cJSON_CreateTrue(); + if (add_item_to_object(object, name, true_item, &global_hooks, false)) + { + return true_item; + } + + cJSON_Delete(true_item); + return NULL; } -static cJSON *DetachItemFromArray(cJSON *array, size_t which) +CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name) { - cJSON *c = array->child; - while (c && (which > 0)) + cJSON *false_item = cJSON_CreateFalse(); + if (add_item_to_object(object, name, false_item, &global_hooks, false)) { - c = c->next; - which--; + return false_item; } - if (!c) + + cJSON_Delete(false_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean) +{ + cJSON *bool_item = cJSON_CreateBool(boolean); + if (add_item_to_object(object, name, bool_item, &global_hooks, false)) + { + return bool_item; + } + + cJSON_Delete(bool_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number) +{ + cJSON *number_item = cJSON_CreateNumber(number); + if (add_item_to_object(object, name, number_item, &global_hooks, false)) + { + return number_item; + } + + cJSON_Delete(number_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string) +{ + cJSON *string_item = cJSON_CreateString(string); + if (add_item_to_object(object, name, string_item, &global_hooks, false)) + { + return string_item; + } + + cJSON_Delete(string_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw) +{ + cJSON *raw_item = cJSON_CreateRaw(raw); + if (add_item_to_object(object, name, raw_item, &global_hooks, false)) + { + return raw_item; + } + + cJSON_Delete(raw_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name) +{ + cJSON *object_item = cJSON_CreateObject(); + if (add_item_to_object(object, name, object_item, &global_hooks, false)) + { + return object_item; + } + + cJSON_Delete(object_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name) +{ + cJSON *array = cJSON_CreateArray(); + if (add_item_to_object(object, name, array, &global_hooks, false)) + { + return array; + } + + cJSON_Delete(array); + return NULL; +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item) +{ + if ((parent == NULL) || (item == NULL)) { - /* item doesn't exist */ return NULL; } - if (c->prev) + + if (item->prev != NULL) { /* not the first element */ - c->prev->next = c->next; + item->prev->next = item->next; } - if (c->next) + if (item->next != NULL) { - c->next->prev = c->prev; + /* not the last element */ + item->next->prev = item->prev; } - if (c==array->child) + + if (item == parent->child) { - array->child = c->next; + /* first element */ + parent->child = item->next; } /* make sure the detached item doesn't point anywhere anymore */ - c->prev = c->next = NULL; + item->prev = NULL; + item->next = NULL; - return c; + return item; } + CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which) { if (which < 0) @@ -1606,7 +2116,7 @@ CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which) return NULL; } - return DetachItemFromArray(array, (size_t)which); + return cJSON_DetachItemViaPointer(array, get_array_item(array, (size_t)which)); } CJSON_PUBLIC(void) 
cJSON_DeleteItemFromArray(cJSON *array, int which) @@ -1616,19 +2126,16 @@ CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which) CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string) { - size_t i = 0; - cJSON *c = object->child; - while (c && cJSON_strcasecmp((unsigned char*)c->string, (const unsigned char*)string)) - { - i++; - c = c->next; - } - if (c) - { - return DetachItemFromArray(object, i); - } + cJSON *to_detach = cJSON_GetObjectItem(object, string); - return NULL; + return cJSON_DetachItemViaPointer(object, to_detach); +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string) +{ + cJSON *to_detach = cJSON_GetObjectItemCaseSensitive(object, string); + + return cJSON_DetachItemViaPointer(object, to_detach); } CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string) @@ -1636,24 +2143,32 @@ CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string) cJSON_Delete(cJSON_DetachItemFromObject(object, string)); } +CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string) +{ + cJSON_Delete(cJSON_DetachItemFromObjectCaseSensitive(object, string)); +} + /* Replace array/object items with new ones. */ CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem) { - cJSON *c = array->child; - while (c && (which > 0)) + cJSON *after_inserted = NULL; + + if (which < 0) { - c = c->next; - which--; + return; } - if (!c) + + after_inserted = get_array_item(array, (size_t)which); + if (after_inserted == NULL) { - cJSON_AddItemToArray(array, newitem); + add_item_to_array(array, newitem); return; } - newitem->next = c; - newitem->prev = c->prev; - c->prev = newitem; - if (c == array->child) + + newitem->next = after_inserted; + newitem->prev = after_inserted->prev; + after_inserted->prev = newitem; + if (after_inserted == array->child) { array->child = newitem; } @@ -1663,35 +2178,41 @@ CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newit } } -static void ReplaceItemInArray(cJSON *array, size_t which, cJSON *newitem) +CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * replacement) { - cJSON *c = array->child; - while (c && (which > 0)) + if ((parent == NULL) || (replacement == NULL) || (item == NULL)) { - c = c->next; - which--; + return false; } - if (!c) + + if (replacement == item) { - return; + return true; } - newitem->next = c->next; - newitem->prev = c->prev; - if (newitem->next) + + replacement->next = item->next; + replacement->prev = item->prev; + + if (replacement->next != NULL) { - newitem->next->prev = newitem; + replacement->next->prev = replacement; } - if (c == array->child) + if (replacement->prev != NULL) { - array->child = newitem; + replacement->prev->next = replacement; } - else + if (parent->child == item) { - newitem->prev->next = newitem; + parent->child = replacement; } - c->next = c->prev = NULL; - cJSON_Delete(c); + + item->next = NULL; + item->prev = NULL; + cJSON_Delete(item); + + return true; } + CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem) { if (which < 0) @@ -1699,29 +2220,37 @@ CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newi return; } - ReplaceItemInArray(array, (size_t)which, newitem); + cJSON_ReplaceItemViaPointer(array, get_array_item(array, (size_t)which), newitem); } -CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON 
*object, const char *string, cJSON *newitem) +static cJSON_bool replace_item_in_object(cJSON *object, const char *string, cJSON *replacement, cJSON_bool case_sensitive) { - size_t i = 0; - cJSON *c = object->child; - while(c && cJSON_strcasecmp((unsigned char*)c->string, (const unsigned char*)string)) + if ((replacement == NULL) || (string == NULL)) { - i++; - c = c->next; + return false; } - if(c) - { - /* free the old string if not const */ - if (!(newitem->type & cJSON_StringIsConst) && newitem->string) - { - global_hooks.deallocate(newitem->string); - } - newitem->string = (char*)cJSON_strdup((const unsigned char*)string, &global_hooks); - ReplaceItemInArray(object, i, newitem); + /* replace the name in the replacement */ + if (!(replacement->type & cJSON_StringIsConst) && (replacement->string != NULL)) + { + cJSON_free(replacement->string); } + replacement->string = (char*)cJSON_strdup((const unsigned char*)string, &global_hooks); + replacement->type &= ~cJSON_StringIsConst; + + cJSON_ReplaceItemViaPointer(object, get_object_item(object, string, case_sensitive), replacement); + + return true; +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem) +{ + replace_item_in_object(object, string, newitem, false); +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object, const char *string, cJSON *newitem) +{ + replace_item_in_object(object, string, newitem, true); } /* Create basic types: */ @@ -1758,7 +2287,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void) return item; } -CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cjbool b) +CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool b) { cJSON *item = cJSON_New_Item(&global_hooks); if(item) @@ -1782,7 +2311,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num) { item->valueint = INT_MAX; } - else if (num <= INT_MIN) + else if (num <= (double)INT_MIN) { item->valueint = INT_MIN; } @@ -1812,6 +2341,39 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string) return item; } +CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) + { + item->type = cJSON_String | cJSON_IsReference; + item->valuestring = (char*)cast_away_const(string); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) { + item->type = cJSON_Object | cJSON_IsReference; + item->child = (cJSON*)cast_away_const(child); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child) { + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) { + item->type = cJSON_Array | cJSON_IsReference; + item->child = (cJSON*)cast_away_const(child); + } + + return item; +} + CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw) { cJSON *item = cJSON_New_Item(&global_hooks); @@ -1859,7 +2421,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (numbers == NULL)) { return NULL; } @@ -1894,7 +2456,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (numbers == NULL)) { return NULL; } @@ -1903,7 +2465,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count) for(i = 0; a && (i < (size_t)count); i++) { - n = cJSON_CreateNumber(numbers[i]); + n = 
cJSON_CreateNumber((double)numbers[i]); if(!n) { cJSON_Delete(a); @@ -1930,7 +2492,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateDoubleArray(const double *numbers, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (numbers == NULL)) { return NULL; } @@ -1966,7 +2528,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char **strings, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (strings == NULL)) { return NULL; } @@ -1996,7 +2558,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char **strings, int count) } /* Duplication */ -CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cjbool recurse) +CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse) { cJSON *newitem = NULL; cJSON *child = NULL; @@ -2075,66 +2637,334 @@ CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cjbool recurse) return NULL; } -CJSON_PUBLIC(void) cJSON_Minify(char *json) +static void skip_oneline_comment(char **input) { - unsigned char *into = (unsigned char*)json; - while (*json) + *input += static_strlen("//"); + + for (; (*input)[0] != '\0'; ++(*input)) { - if (*json == ' ') - { - json++; + if ((*input)[0] == '\n') { + *input += static_strlen("\n"); + return; } - else if (*json == '\t') + } +} + +static void skip_multiline_comment(char **input) +{ + *input += static_strlen("/*"); + + for (; (*input)[0] != '\0'; ++(*input)) + { + if (((*input)[0] == '*') && ((*input)[1] == '/')) { - /* Whitespace characters. */ - json++; + *input += static_strlen("*/"); + return; } - else if (*json == '\r') - { - json++; + } +} + +static void minify_string(char **input, char **output) { + (*output)[0] = (*input)[0]; + *input += static_strlen("\""); + *output += static_strlen("\""); + + + for (; (*input)[0] != '\0'; (void)++(*input), ++(*output)) { + (*output)[0] = (*input)[0]; + + if ((*input)[0] == '\"') { + (*output)[0] = '\"'; + *input += static_strlen("\""); + *output += static_strlen("\""); + return; + } else if (((*input)[0] == '\\') && ((*input)[1] == '\"')) { + (*output)[1] = (*input)[1]; + *input += static_strlen("\""); + *output += static_strlen("\""); } - else if (*json=='\n') + } +} + +CJSON_PUBLIC(void) cJSON_Minify(char *json) +{ + char *into = json; + + if (json == NULL) + { + return; + } + + while (json[0] != '\0') + { + switch (json[0]) { - json++; + case ' ': + case '\t': + case '\r': + case '\n': + json++; + break; + + case '/': + if (json[1] == '/') + { + skip_oneline_comment(&json); + } + else if (json[1] == '*') + { + skip_multiline_comment(&json); + } else { + json++; + } + break; + + case '\"': + minify_string(&json, (char**)&into); + break; + + default: + into[0] = json[0]; + json++; + into++; } - else if ((*json == '/') && (json[1] == '/')) - { - /* double-slash comments, to end of line. */ - while (*json && (*json != '\n')) + } + + /* and null-terminate. 
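   The rewritten cJSON_Minify handles NULL input, strips whitespace and
   both comment styles via skip_oneline_comment and skip_multiline_comment,
   and copies string literals verbatim through minify_string, all in place.
   Usage sketch:

       char json[] = "{\n\t\"name\": \"value\"\n}";
       cJSON_Minify(json);                       leaves {"name":"value"}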
*/ + *into = '\0'; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Invalid; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_False; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xff) == cJSON_True; +} + + +CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & (cJSON_True | cJSON_False)) != 0; +} +CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_NULL; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Number; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_String; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Array; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Object; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Raw; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive) +{ + if ((a == NULL) || (b == NULL) || ((a->type & 0xFF) != (b->type & 0xFF)) || cJSON_IsInvalid(a)) + { + return false; + } + + /* check if type is valid */ + switch (a->type & 0xFF) + { + case cJSON_False: + case cJSON_True: + case cJSON_NULL: + case cJSON_Number: + case cJSON_String: + case cJSON_Raw: + case cJSON_Array: + case cJSON_Object: + break; + + default: + return false; + } + + /* identical objects are equal */ + if (a == b) + { + return true; + } + + switch (a->type & 0xFF) + { + /* in these cases and equal type is enough */ + case cJSON_False: + case cJSON_True: + case cJSON_NULL: + return true; + + case cJSON_Number: + if (a->valuedouble == b->valuedouble) { - json++; + return true; } - } - else if ((*json == '/') && (json[1] == '*')) - { - /* multiline comments. */ - while (*json && !((*json == '*') && (json[1] == '/'))) + return false; + + case cJSON_String: + case cJSON_Raw: + if ((a->valuestring == NULL) || (b->valuestring == NULL)) { - json++; + return false; } - json += 2; - } - else if (*json == '\"') + if (strcmp(a->valuestring, b->valuestring) == 0) + { + return true; + } + + return false; + + case cJSON_Array: { - /* string literals, which are \" sensitive. 
*/ - *into++ = (unsigned char)*json++; - while (*json && (*json != '\"')) + cJSON *a_element = a->child; + cJSON *b_element = b->child; + + for (; (a_element != NULL) && (b_element != NULL);) { - if (*json == '\\') + if (!cJSON_Compare(a_element, b_element, case_sensitive)) { - *into++ = (unsigned char)*json++; + return false; } - *into++ = (unsigned char)*json++; + + a_element = a_element->next; + b_element = b_element->next; + } + + /* one of the arrays is longer than the other */ + if (a_element != b_element) { + return false; } - *into++ = (unsigned char)*json++; + + return true; } - else + + case cJSON_Object: { - /* All other characters. */ - *into++ = (unsigned char)*json++; + cJSON *a_element = NULL; + cJSON *b_element = NULL; + cJSON_ArrayForEach(a_element, a) + { + /* TODO This has O(n^2) runtime, which is horrible! */ + b_element = get_object_item(b, a_element->string, case_sensitive); + if (b_element == NULL) + { + return false; + } + + if (!cJSON_Compare(a_element, b_element, case_sensitive)) + { + return false; + } + } + + /* doing this twice, once on a and b to prevent true comparison if a subset of b + * TODO: Do this the proper way, this is just a fix for now */ + cJSON_ArrayForEach(b_element, b) + { + a_element = get_object_item(a, b_element->string, case_sensitive); + if (a_element == NULL) + { + return false; + } + + if (!cJSON_Compare(b_element, a_element, case_sensitive)) + { + return false; + } + } + + return true; } + + default: + return false; } +} - /* and null-terminate. */ - *into = '\0'; +CJSON_PUBLIC(void *) cJSON_malloc(size_t size) +{ + return global_hooks.allocate(size); +} + +CJSON_PUBLIC(void) cJSON_free(void *object) +{ + global_hooks.deallocate(object); } diff --git a/libs/esl/src/cJSON_Utils.c b/libs/esl/src/cJSON_Utils.c index 395aa736923..2bacf642ac8 100644 --- a/libs/esl/src/cJSON_Utils.c +++ b/libs/esl/src/cJSON_Utils.c @@ -1,126 +1,194 @@ +/* + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +/* disable warnings about old C89 functions in MSVC */ +#if !defined(_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) +#define _CRT_SECURE_NO_DEPRECATE +#endif + +#ifdef __GNUCC__ +#pragma GCC visibility push(default) +#endif +#if defined(_MSC_VER) +#pragma warning (push) +/* disable warning about single line comments in system headers */ +#pragma warning (disable : 4001) +#endif + #include #include #include #include #include -#include + +#if defined(_MSC_VER) +#pragma warning (pop) +#endif +#ifdef __GNUCC__ +#pragma GCC visibility pop +#endif #include "esl_cJSON_Utils.h" -static unsigned char* cJSONUtils_strdup(const unsigned char* str) +/* define our own boolean type */ +#ifdef true +#undef true +#endif +#define true ((cJSON_bool)1) + +#ifdef false +#undef false +#endif +#define false ((cJSON_bool)0) + +static unsigned char* cJSONUtils_strdup(const unsigned char* const string) { - size_t len = 0; + size_t length = 0; unsigned char *copy = NULL; - len = strlen((const char*)str) + 1; - if (!(copy = (unsigned char*)malloc(len))) + length = strlen((const char*)string) + sizeof(""); + copy = (unsigned char*) cJSON_malloc(length); + if (copy == NULL) { return NULL; } - memcpy(copy, str, len); + memcpy(copy, string, length); return copy; } -static int cJSONUtils_strcasecmp(const unsigned char *s1, const unsigned char *s2) +/* string comparison which doesn't consider NULL pointers equal */ +static int compare_strings(const unsigned char *string1, const unsigned char *string2, const cJSON_bool case_sensitive) { - if (!s1) + if ((string1 == NULL) || (string2 == NULL)) { - return (s1 == s2) ? 0 : 1; /* both NULL? */ + return 1; } - if (!s2) + + if (string1 == string2) { - return 1; + return 0; } - for(; tolower(*s1) == tolower(*s2); ++s1, ++s2) + + if (case_sensitive) { - if(*s1 == 0) + return strcmp((const char*)string1, (const char*)string2); + } + + for(; tolower(*string1) == tolower(*string2); (void)string1++, string2++) + { + if (*string1 == '\0') { return 0; } } - return tolower(*s1) - tolower(*s2); + return tolower(*string1) - tolower(*string2); } -/* JSON Pointer implementation: */ -static int cJSONUtils_Pstrcasecmp(const unsigned char *a, const unsigned char *e) +/* Compare the next path element of two JSON pointers, two NULL pointers are considered unequal: */ +static cJSON_bool compare_pointers(const unsigned char *name, const unsigned char *pointer, const cJSON_bool case_sensitive) { - if (!a || !e) + if ((name == NULL) || (pointer == NULL)) { - return (a == e) ? 0 : 1; /* both NULL? 
*/ + return false; } - for (; *a && *e && (*e != '/'); a++, e++) /* compare until next '/' */ + + for (; (*name != '\0') && (*pointer != '\0') && (*pointer != '/'); (void)name++, pointer++) /* compare until next '/' */ { - if (*e == '~') + if (*pointer == '~') { /* check for escaped '~' (~0) and '/' (~1) */ - if (!((e[1] == '0') && (*a == '~')) && !((e[1] == '1') && (*a == '/'))) + if (((pointer[1] != '0') || (*name != '~')) && ((pointer[1] != '1') || (*name != '/'))) { - /* invalid escape sequence or wrong character in *a */ - return 1; + /* invalid escape sequence or wrong character in *name */ + return false; } else { - e++; + pointer++; } } - else if (tolower(*a) != tolower(*e)) + else if ((!case_sensitive && (tolower(*name) != tolower(*pointer))) || (case_sensitive && (*name != *pointer))) { - return 1; + return false; } } - if (((*e != 0) && (*e != '/')) != (*a != 0)) + if (((*pointer != 0) && (*pointer != '/')) != (*name != 0)) { /* one string has ended, the other not */ - return 1; + return false;; } - return 0; + return true; } -static size_t cJSONUtils_PointerEncodedstrlen(const unsigned char *s) +/* calculate the length of a string if encoded as JSON pointer with ~0 and ~1 escape sequences */ +static size_t pointer_encoded_length(const unsigned char *string) { - size_t l = 0; - for (; *s; s++, l++) + size_t length; + for (length = 0; *string != '\0'; (void)string++, length++) { - if ((*s == '~') || (*s == '/')) + /* character needs to be escaped? */ + if ((*string == '~') || (*string == '/')) { - l++; + length++; } } - return l; + return length; } -static void cJSONUtils_PointerEncodedstrcpy(unsigned char *d, const unsigned char *s) +/* copy a string while escaping '~' and '/' with ~0 and ~1 JSON pointer escape codes */ +static void encode_string_as_pointer(unsigned char *destination, const unsigned char *source) { - for (; *s; s++) + for (; source[0] != '\0'; (void)source++, destination++) { - if (*s == '/') + if (source[0] == '/') { - *d++ = '~'; - *d++ = '1'; + destination[1] = '1'; + destination++; } - else if (*s == '~') + else if (source[0] == '~') { - *d++ = '~'; - *d++ = '0'; + destination[0] = '~'; + destination[1] = '1'; + destination++; } else { - *d++ = *s; + destination[0] = source[0]; } } - *d = '\0'; + destination[0] = '\0'; } -CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *target) +CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(const cJSON * const object, const cJSON * const target) { - int type = object->type; - size_t c = 0; - cJSON *obj = 0; + size_t child_index = 0; + cJSON *current_child = 0; + + if ((object == NULL) || (target == NULL)) + { + return NULL; + } if (object == target) { @@ -128,44 +196,44 @@ CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *ta return (char*)cJSONUtils_strdup((const unsigned char*)""); } - /* recursively search all children of the object */ - for (obj = object->child; obj; obj = obj->next, c++) + /* recursively search all children of the object or array */ + for (current_child = object->child; current_child != NULL; (void)(current_child = current_child->next), child_index++) { - unsigned char *found = (unsigned char*)cJSONUtils_FindPointerFromObjectTo(obj, target); - if (found) + unsigned char *target_pointer = (unsigned char*)cJSONUtils_FindPointerFromObjectTo(current_child, target); + /* found the target? 
*/ + if (target_pointer != NULL) { - if ((type & 0xFF) == cJSON_Array) + if (cJSON_IsArray(object)) { /* reserve enough memory for a 64 bit integer + '/' and '\0' */ - unsigned char *ret = (unsigned char*)malloc(strlen((char*)found) + 23); - assert(ret); + unsigned char *full_pointer = (unsigned char*)cJSON_malloc(strlen((char*)target_pointer) + 20 + sizeof("/")); /* check if conversion to unsigned long is valid * This should be eliminated at compile time by dead code elimination * if size_t is an alias of unsigned long, or if it is bigger */ - if (c > ULONG_MAX) + if (child_index > ULONG_MAX) { - free(found); + cJSON_free(target_pointer); return NULL; } - sprintf((char*)ret, "/%lu%s", (unsigned long)c, found); /* / */ - free(found); + sprintf((char*)full_pointer, "/%lu%s", (unsigned long)child_index, target_pointer); /* / */ + cJSON_free(target_pointer); - return (char*)ret; + return (char*)full_pointer; } - else if ((type & 0xFF) == cJSON_Object) + + if (cJSON_IsObject(object)) { - unsigned char *ret = (unsigned char*)malloc(strlen((char*)found) + cJSONUtils_PointerEncodedstrlen((unsigned char*)obj->string) + 2); - assert(ret); - *ret = '/'; - cJSONUtils_PointerEncodedstrcpy(ret + 1, (unsigned char*)obj->string); - strcat((char*)ret, (char*)found); - free(found); + unsigned char *full_pointer = (unsigned char*)cJSON_malloc(strlen((char*)target_pointer) + pointer_encoded_length((unsigned char*)current_child->string) + 2); + full_pointer[0] = '/'; + encode_string_as_pointer(full_pointer + 1, (unsigned char*)current_child->string); + strcat((char*)full_pointer, (char*)target_pointer); + cJSON_free(target_pointer); - return (char*)ret; + return (char*)full_pointer; } /* reached leaf of the tree, found nothing */ - free(found); + cJSON_free(target_pointer); return NULL; } } @@ -174,392 +242,881 @@ CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *ta return NULL; } -CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON *object, const char *pointer) +/* non broken version of cJSON_GetArrayItem */ +static cJSON *get_array_item(const cJSON *array, size_t item) +{ + cJSON *child = array ? 
array->child : NULL;
+    while ((child != NULL) && (item > 0))
+    {
+        item--;
+        child = child->next;
+    }
+
+    return child;
+}
+
+static cJSON_bool decode_array_index_from_pointer(const unsigned char * const pointer, size_t * const index)
+{
+    size_t parsed_index = 0;
+    size_t position = 0;
+
+    if ((pointer[0] == '0') && ((pointer[1] != '\0') && (pointer[1] != '/')))
+    {
+        /* leading zeroes are not permitted */
+        return 0;
+    }
+
+    for (position = 0; (pointer[position] >= '0') && (pointer[position] <= '9'); position++)
+    {
+        parsed_index = (10 * parsed_index) + (size_t)(pointer[position] - '0');
+    }
+
+    if ((pointer[position] != '\0') && (pointer[position] != '/'))
+    {
+        return 0;
+    }
+
+    *index = parsed_index;
+
+    return 1;
+}
+
+static cJSON *get_item_from_pointer(cJSON * const object, const char * pointer, const cJSON_bool case_sensitive)
+{
+    cJSON *current_element = object;
+
+    if (pointer == NULL)
+    {
+        return NULL;
+    }
+
     /* follow path of the pointer */
-    while ((*pointer++ == '/') && object)
+    while ((pointer[0] == '/') && (current_element != NULL))
     {
-        if ((object->type & 0xFF) == cJSON_Array)
+        pointer++;
+        if (cJSON_IsArray(current_element))
         {
-            size_t which = 0;
-            /* parse array index */
-            while ((*pointer >= '0') && (*pointer <= '9'))
-            {
-                which = (10 * which) + (size_t)(*pointer++ - '0');
-            }
-            if (*pointer && (*pointer != '/'))
-            {
-                /* not end of string or new path token */
-                return NULL;
-            }
-            if (which > INT_MAX)
+            size_t index = 0;
+            if (!decode_array_index_from_pointer((const unsigned char*)pointer, &index))
             {
                 return NULL;
             }
-            object = cJSON_GetArrayItem(object, (int)which);
+
+            current_element = get_array_item(current_element, index);
         }
-        else if ((object->type & 0xFF) == cJSON_Object)
+        else if (cJSON_IsObject(current_element))
         {
-            object = object->child;
+            current_element = current_element->child;
             /* GetObjectItem. */
-            while (object && cJSONUtils_Pstrcasecmp((unsigned char*)object->string, (const unsigned char*)pointer))
+            while ((current_element != NULL) && !compare_pointers((unsigned char*)current_element->string, (const unsigned char*)pointer, case_sensitive))
             {
-                object = object->next;
-            }
-            /* skip to the next path token or end of string */
-            while (*pointer && (*pointer != '/'))
-            {
-                pointer++;
+                current_element = current_element->next;
             }
         }
         else
         {
             return NULL;
         }
+
+        /* skip to the next path token or end of string */
+        while ((pointer[0] != '\0') && (pointer[0] != '/'))
+        {
+            pointer++;
+        }
     }
 
-    return object;
+    return current_element;
+}
+
+CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON * const object, const char *pointer)
+{
+    return get_item_from_pointer(object, pointer, false);
+}
+
+CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointerCaseSensitive(cJSON * const object, const char *pointer)
+{
+    return get_item_from_pointer(object, pointer, true);
 }
 
 /* JSON Patch implementation. */
-static void cJSONUtils_InplaceDecodePointerString(unsigned char *string)
+static void decode_pointer_inplace(unsigned char *string)
 {
-    unsigned char *s2 = string;
+    unsigned char *decoded_string = string;
 
     if (string == NULL)
     {
         return;
     }
 
-    for (; *string; s2++, string++)
+    for (; *string; (void)decoded_string++, string++)
     {
-        *s2 = (*string != '~')
-            ? (*string)
-            : ((*(++string) == '0')
-                    ? '~'
-                    : '/');
+        if (string[0] == '~')
+        {
+            if (string[1] == '0')
+            {
+                decoded_string[0] = '~';
+            }
+            else if (string[1] == '1')
+            {
+                decoded_string[0] = '/';
+            }
+            else
+            {
+                /* invalid escape sequence */
+                return;
+            }
+
+            string++;
+        }
+        else
+        {
+            /* copy unescaped characters as they are */
+            decoded_string[0] = string[0];
+        }
     }
-    *s2 = '\0';
+    decoded_string[0] = '\0';
 }
 
-static cJSON *cJSONUtils_PatchDetach(cJSON *object, const unsigned char *path)
+/* non-broken cJSON_DetachItemFromArray */
+static cJSON *detach_item_from_array(cJSON *array, size_t which)
 {
-    unsigned char *parentptr = NULL;
-    unsigned char *childptr = NULL;
+    cJSON *c = array->child;
+    while (c && (which > 0))
+    {
+        c = c->next;
+        which--;
+    }
+    if (!c)
+    {
+        /* item doesn't exist */
+        return NULL;
+    }
+    if (c->prev)
+    {
+        /* not the first element */
+        c->prev->next = c->next;
+    }
+    if (c->next)
+    {
+        c->next->prev = c->prev;
+    }
+    if (c == array->child)
+    {
+        array->child = c->next;
+    }
+    /* make sure the detached item doesn't point anywhere anymore */
+    c->prev = c->next = NULL;
+
+    return c;
+}
+
+/* detach an item at the given path */
+static cJSON *detach_path(cJSON *object, const unsigned char *path, const cJSON_bool case_sensitive)
+{
+    unsigned char *parent_pointer = NULL;
+    unsigned char *child_pointer = NULL;
     cJSON *parent = NULL;
-    cJSON *ret = NULL;
+    cJSON *detached_item = NULL;
 
     /* copy path and split it in parent and child */
-    parentptr = cJSONUtils_strdup(path);
-    if (parentptr == NULL) {
-        return NULL;
+    parent_pointer = cJSONUtils_strdup(path);
+    if (parent_pointer == NULL) {
+        goto cleanup;
     }
 
-    childptr = (unsigned char*)strrchr((char*)parentptr, '/'); /* last '/' */
-    if (childptr == NULL)
+    child_pointer = (unsigned char*)strrchr((char*)parent_pointer, '/'); /* last '/' */
+    if (child_pointer == NULL)
     {
-        free(parentptr);
-        return NULL;
+        goto cleanup;
     }
     /* split strings */
-    *childptr++ = '\0';
+    child_pointer[0] = '\0';
+    child_pointer++;
 
-    parent = cJSONUtils_GetPointer(object, (char*)parentptr);
-    cJSONUtils_InplaceDecodePointerString(childptr);
+    parent = get_item_from_pointer(object, (char*)parent_pointer, case_sensitive);
+    decode_pointer_inplace(child_pointer);
 
-    if (!parent)
+    if (cJSON_IsArray(parent))
+    {
+        size_t index = 0;
+        if (!decode_array_index_from_pointer(child_pointer, &index))
+        {
+            goto cleanup;
+        }
+        detached_item = detach_item_from_array(parent, index);
+    }
+    else if (cJSON_IsObject(parent))
+    {
+        detached_item = cJSON_DetachItemFromObject(parent, (char*)child_pointer);
+    }
+    else
     {
         /* Couldn't find object to remove child from. */
-        ret = NULL;
+        goto cleanup;
+    }
+
+cleanup:
+    if (parent_pointer != NULL)
+    {
+        cJSON_free(parent_pointer);
+    }
+
+    return detached_item;
+}
+
+/* sort lists using mergesort */
+static cJSON *sort_list(cJSON *list, const cJSON_bool case_sensitive)
+{
+    cJSON *first = list;
+    cJSON *second = list;
+    cJSON *current_item = list;
+    cJSON *result = list;
+    cJSON *result_tail = NULL;
+
+    if ((list == NULL) || (list->next == NULL))
+    {
+        /* One entry is sorted already. */
+        return result;
+    }
+
+    while ((current_item != NULL) && (current_item->next != NULL) && (compare_strings((unsigned char*)current_item->string, (unsigned char*)current_item->next->string, case_sensitive) < 0))
+    {
+        /* Test for list sorted. */
+        current_item = current_item->next;
     }
-    else if ((parent->type & 0xFF) == cJSON_Array)
+    if ((current_item == NULL) || (current_item->next == NULL))
     {
-        ret = cJSON_DetachItemFromArray(parent, atoi((char*)childptr));
+        /* Leave sorted lists unmodified.
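The rewritten pointer handling above implements RFC 6901 lookups end to end. A minimal sketch of a caller, assuming the vendored esl_cJSON.h/esl_cJSON_Utils.h headers and <stdio.h> are on the include path (the demo function name and sample document are hypothetical):

    static void demo_pointer_lookup(void)
    {
        /* "a/b" must be written as "a~1b" and "m~n" as "m~0n" in a pointer (RFC 6901) */
        cJSON *doc = cJSON_Parse("{\"a/b\": {\"m~n\": [10, 20]}}");
        cJSON *hit = cJSONUtils_GetPointer(doc, "/a~1b/m~0n/1");

        if (cJSON_IsNumber(hit))
        {
            printf("found: %d\n", hit->valueint); /* prints 20 */
        }

        cJSON_Delete(doc);
    }

cJSONUtils_GetPointerCaseSensitive() behaves the same but compares object keys case-sensitively.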
*/ + return result; } - else if ((parent->type & 0xFF) == cJSON_Object) + + /* reset pointer to the beginning */ + current_item = list; + while (current_item != NULL) { - ret = cJSON_DetachItemFromObject(parent, (char*)childptr); + /* Walk two pointers to find the middle. */ + second = second->next; + current_item = current_item->next; + /* advances current_item two steps at a time */ + if (current_item != NULL) + { + current_item = current_item->next; + } + } + if ((second != NULL) && (second->prev != NULL)) + { + /* Split the lists */ + second->prev->next = NULL; + second->prev = NULL; } - free(parentptr); - /* return the detachted item */ - return ret; + /* Recursively sort the sub-lists. */ + first = sort_list(first, case_sensitive); + second = sort_list(second, case_sensitive); + result = NULL; + + /* Merge the sub-lists */ + while ((first != NULL) && (second != NULL)) + { + cJSON *smaller = NULL; + if (compare_strings((unsigned char*)first->string, (unsigned char*)second->string, case_sensitive) < 0) + { + smaller = first; + } + else + { + smaller = second; + } + + if (result == NULL) + { + /* start merged list with the smaller element */ + result_tail = smaller; + result = smaller; + } + else + { + /* add smaller element to the list */ + result_tail->next = smaller; + smaller->prev = result_tail; + result_tail = smaller; + } + + if (first == smaller) + { + first = first->next; + } + else + { + second = second->next; + } + } + + if (first != NULL) + { + /* Append rest of first list. */ + if (result == NULL) + { + return first; + } + result_tail->next = first; + first->prev = result_tail; + } + if (second != NULL) + { + /* Append rest of second list */ + if (result == NULL) + { + return second; + } + result_tail->next = second; + second->prev = result_tail; + } + + return result; +} + +static void sort_object(cJSON * const object, const cJSON_bool case_sensitive) +{ + if (object == NULL) + { + return; + } + object->child = sort_list(object->child, case_sensitive); } -static int cJSONUtils_Compare(cJSON *a, cJSON *b) +static cJSON_bool compare_json(cJSON *a, cJSON *b, const cJSON_bool case_sensitive) { if ((a == NULL) || (b == NULL) || ((a->type & 0xFF) != (b->type & 0xFF))) { /* mismatched type. */ - return -1; + return false; } switch (a->type & 0xFF) { case cJSON_Number: /* numeric mismatch. */ - return ((a->valueint != b->valueint) || (a->valuedouble != b->valuedouble)) ? -2 : 0; + if ((a->valueint != b->valueint) || (a->valuedouble != b->valuedouble)) + { + return false; + } + else + { + return true; + } + case cJSON_String: /* string mismatch. */ - return (strcmp(a->valuestring, b->valuestring) != 0) ? -3 : 0; + if (strcmp(a->valuestring, b->valuestring) != 0) + { + return false; + } + else + { + return true; + } + case cJSON_Array: - for (a = a->child, b = b->child; a && b; a = a->next, b = b->next) + for ((void)(a = a->child), b = b->child; (a != NULL) && (b != NULL); (void)(a = a->next), b = b->next) { - int err = cJSONUtils_Compare(a, b); - if (err) + cJSON_bool identical = compare_json(a, b, case_sensitive); + if (!identical) { - return err; + return false; } } + /* array size mismatch? (one of both children is not NULL) */ - return (a || b) ? 
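Because compare_json() and the patch generators rely on key-sorted member lists, the sort is also exposed publicly. A short sketch of the observable effect (function name hypothetical, output indicative):

    static void demo_sort_object(void)
    {
        cJSON *object = cJSON_Parse("{\"b\": 2, \"a\": 1, \"c\": 3}");
        char *printed = NULL;

        cJSONUtils_SortObject(object); /* members are now ordered a, b, c */

        printed = cJSON_PrintUnformatted(object);
        printf("%s\n", printed); /* {"a":1,"b":2,"c":3} */

        cJSON_free(printed);
        cJSON_Delete(object);
    }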
-4 : 0; + if ((a != NULL) || (b != NULL)) + { + return false; + } + else + { + return true; + } + case cJSON_Object: - cJSONUtils_SortObject(a); - cJSONUtils_SortObject(b); - a = a->child; - b = b->child; - while (a && b) + sort_object(a, case_sensitive); + sort_object(b, case_sensitive); + for ((void)(a = a->child), b = b->child; (a != NULL) && (b != NULL); (void)(a = a->next), b = b->next) { - int err = 0; + cJSON_bool identical = false; /* compare object keys */ - if (cJSONUtils_strcasecmp((unsigned char*)a->string, (unsigned char*)b->string)) + if (compare_strings((unsigned char*)a->string, (unsigned char*)b->string, case_sensitive)) { /* missing member */ - return -6; + return false; } - err = cJSONUtils_Compare(a, b); - if (err) + identical = compare_json(a, b, case_sensitive); + if (!identical) { - return err; + return false; } - a = a->next; - b = b->next; } + /* object length mismatch (one of both children is not null) */ - return (a || b) ? -5 : 0; + if ((a != NULL) || (b != NULL)) + { + return false; + } + else + { + return true; + } default: break; } + /* null, true or false */ - return 0; + return true; } -static int cJSONUtils_ApplyPatch(cJSON *object, cJSON *patch) +/* non broken version of cJSON_InsertItemInArray */ +static cJSON_bool insert_item_in_array(cJSON *array, size_t which, cJSON *newitem) { - cJSON *op = NULL; - cJSON *path = NULL; - cJSON *value = NULL; - cJSON *parent = NULL; - int opcode = 0; - unsigned char *parentptr = NULL; - unsigned char *childptr = NULL; + cJSON *child = array->child; + while (child && (which > 0)) + { + child = child->next; + which--; + } + if (which > 0) + { + /* item is after the end of the array */ + return 0; + } + if (child == NULL) + { + cJSON_AddItemToArray(array, newitem); + return 1; + } + + /* insert into the linked list */ + newitem->next = child; + newitem->prev = child->prev; + child->prev = newitem; - op = cJSON_GetObjectItem(patch, "op"); - path = cJSON_GetObjectItem(patch, "path"); - if (!op || !path) + /* was it at the beginning */ + if (child == array->child) { - /* malformed patch. 
*/
-        return 2;
+        array->child = newitem;
+    }
+    else
+    {
+        newitem->prev->next = newitem;
+    }
+
+    return 1;
+}
+
+static cJSON *get_object_item(const cJSON * const object, const char* name, const cJSON_bool case_sensitive)
+{
+    if (case_sensitive)
+    {
+        return cJSON_GetObjectItemCaseSensitive(object, name);
     }
-    /* decode operation */
-    if (!strcmp(op->valuestring, "add"))
+
+    return cJSON_GetObjectItem(object, name);
+}
+
+enum patch_operation { INVALID, ADD, REMOVE, REPLACE, MOVE, COPY, TEST };
+
+static enum patch_operation decode_patch_operation(const cJSON * const patch, const cJSON_bool case_sensitive)
+{
+    cJSON *operation = get_object_item(patch, "op", case_sensitive);
+    if (!cJSON_IsString(operation))
     {
-        opcode = 0;
+        return INVALID;
     }
-    else if (!strcmp(op->valuestring, "remove"))
+
+    if (strcmp(operation->valuestring, "add") == 0)
     {
-        opcode = 1;
+        return ADD;
     }
-    else if (!strcmp(op->valuestring, "replace"))
+
+    if (strcmp(operation->valuestring, "remove") == 0)
     {
-        opcode = 2;
+        return REMOVE;
     }
-    else if (!strcmp(op->valuestring, "move"))
+
+    if (strcmp(operation->valuestring, "replace") == 0)
     {
-        opcode = 3;
+        return REPLACE;
     }
-    else if (!strcmp(op->valuestring, "copy"))
+
+    if (strcmp(operation->valuestring, "move") == 0)
     {
-        opcode = 4;
+        return MOVE;
     }
-    else if (!strcmp(op->valuestring, "test"))
+
+    if (strcmp(operation->valuestring, "copy") == 0)
+    {
+        return COPY;
+    }
+
+    if (strcmp(operation->valuestring, "test") == 0)
+    {
+        return TEST;
+    }
+
+    return INVALID;
+}
+
+/* overwrite an existing item with another one and free resources on the way */
+static void overwrite_item(cJSON * const root, const cJSON replacement)
+{
+    if (root == NULL)
+    {
+        return;
+    }
+
+    if (root->string != NULL)
+    {
+        cJSON_free(root->string);
+    }
+    if (root->valuestring != NULL)
+    {
+        cJSON_free(root->valuestring);
+    }
+    if (root->child != NULL)
+    {
+        cJSON_Delete(root->child);
+    }
+
+    memcpy(root, &replacement, sizeof(cJSON));
+}
+
+static int apply_patch(cJSON *object, const cJSON *patch, const cJSON_bool case_sensitive)
+{
+    cJSON *path = NULL;
+    cJSON *value = NULL;
+    cJSON *parent = NULL;
+    enum patch_operation opcode = INVALID;
+    unsigned char *parent_pointer = NULL;
+    unsigned char *child_pointer = NULL;
+    int status = 0;
+
+    path = get_object_item(patch, "path", case_sensitive);
+    if (!cJSON_IsString(path))
+    {
+        /* malformed patch. */
+        status = 2;
+        goto cleanup;
+    }
+
+    opcode = decode_patch_operation(patch, case_sensitive);
+    if (opcode == INVALID)
+    {
+        status = 3;
+        goto cleanup;
+    }
+    else if (opcode == TEST)
     {
         /* compare value: {...} with the given path */
-        return cJSONUtils_Compare(cJSONUtils_GetPointer(object, path->valuestring), cJSON_GetObjectItem(patch, "value"));
+        status = !compare_json(get_item_from_pointer(object, path->valuestring, case_sensitive), get_object_item(patch, "value", case_sensitive), case_sensitive);
+        goto cleanup;
     }
-    else
+
+    /* special case for replacing the root */
+    if (path->valuestring[0] == '\0')
     {
-        /* unknown opcode. */
-        return 3;
+        if (opcode == REMOVE)
+        {
+            static const cJSON invalid = { NULL, NULL, NULL, cJSON_Invalid, NULL, 0, 0, NULL};
+
+            overwrite_item(object, invalid);
+
+            status = 0;
+            goto cleanup;
+        }
+
+        if ((opcode == REPLACE) || (opcode == ADD))
+        {
+            value = get_object_item(patch, "value", case_sensitive);
+            if (value == NULL)
+            {
+                /* missing "value" for add/replace. */
+                status = 7;
+                goto cleanup;
+            }
+
+            value = cJSON_Duplicate(value, 1);
+            if (value == NULL)
+            {
+                /* out of memory for add/replace.
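decode_patch_operation() accepts exactly the six operation strings of RFC 6902, and an empty "path" addresses the document root (the special case handled above). For illustration only, one patch object per operation; this hypothetical list is not meant to apply as one coherent sequence:

    static const char *example_ops =
        "[{\"op\": \"test\",    \"path\": \"/a\",   \"value\": 1},"
        " {\"op\": \"remove\",  \"path\": \"/b\"},"
        " {\"op\": \"add\",     \"path\": \"/c/-\", \"value\": 2},"
        " {\"op\": \"replace\", \"path\": \"\",     \"value\": {}},"
        " {\"op\": \"move\",    \"path\": \"/x\",   \"from\": \"/y\"},"
        " {\"op\": \"copy\",    \"path\": \"/z\",   \"from\": \"/x\"}]";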
*/ + status = 8; + goto cleanup; + } + + overwrite_item(object, *value); + + /* delete the duplicated value */ + cJSON_free(value); + value = NULL; + + /* the string "value" isn't needed */ + if (object->string != NULL) + { + cJSON_free(object->string); + object->string = NULL; + } + + status = 0; + goto cleanup; + } } - /* Remove/Replace */ - if ((opcode == 1) || (opcode == 2)) + if ((opcode == REMOVE) || (opcode == REPLACE)) { /* Get rid of old. */ - cJSON_Delete(cJSONUtils_PatchDetach(object, (unsigned char*)path->valuestring)); - if (opcode == 1) + cJSON *old_item = detach_path(object, (unsigned char*)path->valuestring, case_sensitive); + if (old_item == NULL) { - /* For Remove, this is job done. */ - return 0; + status = 13; + goto cleanup; + } + cJSON_Delete(old_item); + if (opcode == REMOVE) + { + /* For Remove, this job is done. */ + status = 0; + goto cleanup; } } /* Copy/Move uses "from". */ - if ((opcode == 3) || (opcode == 4)) + if ((opcode == MOVE) || (opcode == COPY)) { - cJSON *from = cJSON_GetObjectItem(patch, "from"); - if (!from) + cJSON *from = get_object_item(patch, "from", case_sensitive); + if (from == NULL) { /* missing "from" for copy/move. */ - return 4; + status = 4; + goto cleanup; } - if (opcode == 3) + if (opcode == MOVE) { - /* move */ - value = cJSONUtils_PatchDetach(object, (unsigned char*)from->valuestring); + value = detach_path(object, (unsigned char*)from->valuestring, case_sensitive); } - if (opcode == 4) + if (opcode == COPY) { - /* copy */ - value = cJSONUtils_GetPointer(object, from->valuestring); + value = get_item_from_pointer(object, from->valuestring, case_sensitive); } - if (!value) + if (value == NULL) { /* missing "from" for copy/move. */ - return 5; + status = 5; + goto cleanup; } - if (opcode == 4) + if (opcode == COPY) { value = cJSON_Duplicate(value, 1); } - if (!value) + if (value == NULL) { /* out of memory for copy/move. */ - return 6; + status = 6; + goto cleanup; } } else /* Add/Replace uses "value". */ { - value = cJSON_GetObjectItem(patch, "value"); - if (!value) + value = get_object_item(patch, "value", case_sensitive); + if (value == NULL) { /* missing "value" for add/replace. */ - return 7; + status = 7; + goto cleanup; } value = cJSON_Duplicate(value, 1); - if (!value) + if (value == NULL) { /* out of memory for add/replace. */ - return 8; + status = 8; + goto cleanup; } } /* Now, just add "value" to "path". */ /* split pointer in parent and child */ - parentptr = cJSONUtils_strdup((unsigned char*)path->valuestring); - childptr = (unsigned char*)strrchr((char*)parentptr, '/'); - if (childptr) + parent_pointer = cJSONUtils_strdup((unsigned char*)path->valuestring); + child_pointer = (unsigned char*)strrchr((char*)parent_pointer, '/'); + if (child_pointer != NULL) { - *childptr++ = '\0'; + child_pointer[0] = '\0'; + child_pointer++; } - parent = cJSONUtils_GetPointer(object, (char*)parentptr); - cJSONUtils_InplaceDecodePointerString(childptr); + parent = get_item_from_pointer(object, (char*)parent_pointer, case_sensitive); + decode_pointer_inplace(child_pointer); /* add, remove, replace, move, copy, test. */ - if (!parent) + if ((parent == NULL) || (child_pointer == NULL)) { /* Couldn't find object to add to. 
*/ - free(parentptr); - cJSON_Delete(value); - return 9; + status = 9; + goto cleanup; } - else if ((parent->type & 0xFF) == cJSON_Array) + else if (cJSON_IsArray(parent)) { - if (!strcmp((char*)childptr, "-")) + if (strcmp((char*)child_pointer, "-") == 0) { cJSON_AddItemToArray(parent, value); + value = NULL; + } + else + { + size_t index = 0; + if (!decode_array_index_from_pointer(child_pointer, &index)) + { + status = 11; + goto cleanup; + } + + if (!insert_item_in_array(parent, index, value)) + { + status = 10; + goto cleanup; + } + value = NULL; + } + } + else if (cJSON_IsObject(parent)) + { + if (case_sensitive) + { + cJSON_DeleteItemFromObjectCaseSensitive(parent, (char*)child_pointer); } else { - cJSON_InsertItemInArray(parent, atoi((char*)childptr), value); + cJSON_DeleteItemFromObject(parent, (char*)child_pointer); } + cJSON_AddItemToObject(parent, (char*)child_pointer, value); + value = NULL; } - else if ((parent->type & 0xFF) == cJSON_Object) + else /* parent is not an object */ { - cJSON_DeleteItemFromObject(parent, (char*)childptr); - cJSON_AddItemToObject(parent, (char*)childptr, value); + /* Couldn't find object to add to. */ + status = 9; + goto cleanup; } - else + +cleanup: + if (value != NULL) { cJSON_Delete(value); } - free(parentptr); + if (parent_pointer != NULL) + { + cJSON_free(parent_pointer); + } - return 0; + return status; } -CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON *object, cJSON *patches) +CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON * const object, const cJSON * const patches) { - int err = 0; + const cJSON *current_patch = NULL; + int status = 0; - if (patches == NULL) + if (!cJSON_IsArray(patches)) { + /* malformed patches. */ return 1; } - if ((patches->type & 0xFF) != cJSON_Array) + if (patches != NULL) + { + current_patch = patches->child; + } + + while (current_patch != NULL) + { + status = apply_patch(object, current_patch, false); + if (status != 0) + { + return status; + } + current_patch = current_patch->next; + } + + return 0; +} + +CJSON_PUBLIC(int) cJSONUtils_ApplyPatchesCaseSensitive(cJSON * const object, const cJSON * const patches) +{ + const cJSON *current_patch = NULL; + int status = 0; + + if (!cJSON_IsArray(patches)) { /* malformed patches. 
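Putting cJSONUtils_ApplyPatches() to work, a minimal sketch (document and patch are hypothetical). Note that a non-zero status comes from the apply_patch() codes above, and that application is not atomic, so a failed run can leave the document partially patched:

    static void demo_apply_patches(void)
    {
        cJSON *doc = cJSON_Parse("{\"users\": [{\"name\": \"alice\"}]}");
        cJSON *patches = cJSON_Parse(
            "[{\"op\": \"add\", \"path\": \"/users/-\", \"value\": {\"name\": \"bob\"}},"
            " {\"op\": \"replace\", \"path\": \"/users/0/name\", \"value\": \"carol\"}]");
        int status = cJSONUtils_ApplyPatches(doc, patches);

        if (status == 0)
        {
            char *printed = cJSON_PrintUnformatted(doc);
            printf("%s\n", printed); /* {"users":[{"name":"carol"},{"name":"bob"}]} */
            cJSON_free(printed);
        }

        cJSON_Delete(patches);
        cJSON_Delete(doc);
    }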
*/ return 1; } - patches = patches->child; + if (patches != NULL) + { + current_patch = patches->child; + } - while (patches) + while (current_patch != NULL) { - if ((err = cJSONUtils_ApplyPatch(object, patches))) + status = apply_patch(object, current_patch, true); + if (status != 0) { - return err; + return status; } - patches = patches->next; + current_patch = current_patch->next; } return 0; } -static void cJSONUtils_GeneratePatch(cJSON *patches, const unsigned char *op, const unsigned char *path, const unsigned char *suffix, cJSON *val) +static void compose_patch(cJSON * const patches, const unsigned char * const operation, const unsigned char * const path, const unsigned char *suffix, const cJSON * const value) { - cJSON *patch = cJSON_CreateObject(); - cJSON_AddItemToObject(patch, "op", cJSON_CreateString((const char*)op)); - if (suffix) + cJSON *patch = NULL; + + if ((patches == NULL) || (operation == NULL) || (path == NULL)) + { + return; + } + + patch = cJSON_CreateObject(); + if (patch == NULL) { - unsigned char *newpath = (unsigned char*)malloc(strlen((const char*)path) + cJSONUtils_PointerEncodedstrlen(suffix) + 2); - assert(newpath); - cJSONUtils_PointerEncodedstrcpy(newpath + sprintf((char*)newpath, "%s/", (const char*)path), suffix); - cJSON_AddItemToObject(patch, "path", cJSON_CreateString((const char*)newpath)); - free(newpath); + return; } - else + cJSON_AddItemToObject(patch, "op", cJSON_CreateString((const char*)operation)); + + if (suffix == NULL) { cJSON_AddItemToObject(patch, "path", cJSON_CreateString((const char*)path)); } - if (val) + else { - cJSON_AddItemToObject(patch, "value", cJSON_Duplicate(val, 1)); + size_t suffix_length = pointer_encoded_length(suffix); + size_t path_length = strlen((const char*)path); + unsigned char *full_path = (unsigned char*)cJSON_malloc(path_length + suffix_length + sizeof("/")); + + sprintf((char*)full_path, "%s/", (const char*)path); + encode_string_as_pointer(full_path + path_length + 1, suffix); + + cJSON_AddItemToObject(patch, "path", cJSON_CreateString((const char*)full_path)); + cJSON_free(full_path); + } + + if (value != NULL) + { + cJSON_AddItemToObject(patch, "value", cJSON_Duplicate(value, 1)); } cJSON_AddItemToArray(patches, patch); } -CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON *array, const char *op, const char *path, cJSON *val) +CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON * const array, const char * const operation, const char * const path, const cJSON * const value) { - cJSONUtils_GeneratePatch(array, (const unsigned char*)op, (const unsigned char*)path, 0, val); + compose_patch(array, (const unsigned char*)operation, (const unsigned char*)path, NULL, value); } -static void cJSONUtils_CompareToPatch(cJSON *patches, const unsigned char *path, cJSON *from, cJSON *to) +static void create_patches(cJSON * const patches, const unsigned char * const path, cJSON * const from, cJSON * const to, const cJSON_bool case_sensitive) { if ((from == NULL) || (to == NULL)) { @@ -568,104 +1125,127 @@ static void cJSONUtils_CompareToPatch(cJSON *patches, const unsigned char *path, if ((from->type & 0xFF) != (to->type & 0xFF)) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"replace", path, 0, to); + compose_patch(patches, (const unsigned char*)"replace", path, 0, to); return; } - switch ((from->type & 0xFF)) + switch (from->type & 0xFF) { case cJSON_Number: if ((from->valueint != to->valueint) || (from->valuedouble != to->valuedouble)) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"replace", 
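Because compose_patch() duplicates the supplied value, callers of the public cJSONUtils_AddPatchToArray() keep ownership of what they pass in. A brief sketch (function name hypothetical):

    static void demo_add_patch(void)
    {
        cJSON *patches = cJSON_CreateArray();
        cJSON *value = cJSON_CreateNumber(42);

        /* appends {"op":"replace","path":"/answer","value":42} to the array */
        cJSONUtils_AddPatchToArray(patches, "replace", "/answer", value);

        cJSON_Delete(value);   /* the value was duplicated into the patch */
        cJSON_Delete(patches);
    }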
path, 0, to); + compose_patch(patches, (const unsigned char*)"replace", path, NULL, to); } return; case cJSON_String: if (strcmp(from->valuestring, to->valuestring) != 0) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"replace", path, 0, to); + compose_patch(patches, (const unsigned char*)"replace", path, NULL, to); } return; case cJSON_Array: { - size_t c = 0; - unsigned char *newpath = (unsigned char*)malloc(strlen((const char*)path) + 23); /* Allow space for 64bit int. */ - assert(newpath); - /* generate patches for all array elements that exist in "from" and "to" */ - for (c = 0, from = from->child, to = to->child; from && to; from = from->next, to = to->next, c++) + size_t index = 0; + cJSON *from_child = from->child; + cJSON *to_child = to->child; + unsigned char *new_path = (unsigned char*)cJSON_malloc(strlen((const char*)path) + 20 + sizeof("/")); /* Allow space for 64bit int. log10(2^64) = 20 */ + + /* generate patches for all array elements that exist in both "from" and "to" */ + for (index = 0; (from_child != NULL) && (to_child != NULL); (void)(from_child = from_child->next), (void)(to_child = to_child->next), index++) { /* check if conversion to unsigned long is valid * This should be eliminated at compile time by dead code elimination * if size_t is an alias of unsigned long, or if it is bigger */ - if (c > ULONG_MAX) + if (index > ULONG_MAX) { - free(newpath); + cJSON_free(new_path); return; } - sprintf((char*)newpath, "%s/%lu", path, (unsigned long)c); /* path of the current array element */ - cJSONUtils_CompareToPatch(patches, newpath, from, to); + sprintf((char*)new_path, "%s/%lu", path, (unsigned long)index); /* path of the current array element */ + create_patches(patches, new_path, from_child, to_child, case_sensitive); } + /* remove leftover elements from 'from' that are not in 'to' */ - for (; from; from = from->next, c++) + for (; (from_child != NULL); (void)(from_child = from_child->next)) { /* check if conversion to unsigned long is valid * This should be eliminated at compile time by dead code elimination * if size_t is an alias of unsigned long, or if it is bigger */ - if (c > ULONG_MAX) + if (index > ULONG_MAX) { - free(newpath); + cJSON_free(new_path); return; } - sprintf((char*)newpath, "%lu", (unsigned long)c); - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"remove", path, newpath, 0); + sprintf((char*)new_path, "%lu", (unsigned long)index); + compose_patch(patches, (const unsigned char*)"remove", path, new_path, NULL); } /* add new elements in 'to' that were not in 'from' */ - for (; to; to = to->next, c++) + for (; (to_child != NULL); (void)(to_child = to_child->next), index++) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"add", path, (const unsigned char*)"-", to); + compose_patch(patches, (const unsigned char*)"add", path, (const unsigned char*)"-", to_child); } - free(newpath); + cJSON_free(new_path); return; } case cJSON_Object: { - cJSON *a = NULL; - cJSON *b = NULL; - cJSONUtils_SortObject(from); - cJSONUtils_SortObject(to); + cJSON *from_child = NULL; + cJSON *to_child = NULL; + sort_object(from, case_sensitive); + sort_object(to, case_sensitive); - a = from->child; - b = to->child; + from_child = from->child; + to_child = to->child; /* for all object values in the object with more of them */ - while (a || b) + while ((from_child != NULL) || (to_child != NULL)) { - int diff = (!a) ? 1 : ((!b) ? 
-1 : cJSONUtils_strcasecmp((unsigned char*)a->string, (unsigned char*)b->string)); - if (!diff) + int diff; + if (from_child == NULL) + { + diff = 1; + } + else if (to_child == NULL) + { + diff = -1; + } + else + { + diff = compare_strings((unsigned char*)from_child->string, (unsigned char*)to_child->string, case_sensitive); + } + + if (diff == 0) { /* both object keys are the same */ - unsigned char *newpath = (unsigned char*)malloc(strlen((const char*)path) + cJSONUtils_PointerEncodedstrlen((unsigned char*)a->string) + 2); - assert(newpath); - cJSONUtils_PointerEncodedstrcpy(newpath + sprintf((char*)newpath, "%s/", path), (unsigned char*)a->string); + size_t path_length = strlen((const char*)path); + size_t from_child_name_length = pointer_encoded_length((unsigned char*)from_child->string); + unsigned char *new_path = (unsigned char*)cJSON_malloc(path_length + from_child_name_length + sizeof("/")); + + sprintf((char*)new_path, "%s/", path); + encode_string_as_pointer(new_path + path_length + 1, (unsigned char*)from_child->string); + /* create a patch for the element */ - cJSONUtils_CompareToPatch(patches, newpath, a, b); - free(newpath); - a = a->next; - b = b->next; + create_patches(patches, new_path, from_child, to_child, case_sensitive); + cJSON_free(new_path); + + from_child = from_child->next; + to_child = to_child->next; } else if (diff < 0) { /* object element doesn't exist in 'to' --> remove it */ - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"remove", path, (unsigned char*)a->string, 0); - a = a->next; + compose_patch(patches, (const unsigned char*)"remove", path, (unsigned char*)from_child->string, NULL); + + from_child = from_child->next; } else { /* object element doesn't exist in 'from' --> add it */ - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"add", path, (unsigned char*)b->string, b); - b = b->next; + compose_patch(patches, (const unsigned char*)"add", path, (unsigned char*)to_child->string, to_child); + + to_child = to_child->next; } } return; @@ -676,211 +1256,199 @@ static void cJSONUtils_CompareToPatch(cJSON *patches, const unsigned char *path, } } -CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON *from, cJSON *to) +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON * const from, cJSON * const to) { - cJSON *patches = cJSON_CreateArray(); - cJSONUtils_CompareToPatch(patches, (const unsigned char*)"", from, to); + cJSON *patches = NULL; + + if ((from == NULL) || (to == NULL)) + { + return NULL; + } + + patches = cJSON_CreateArray(); + create_patches(patches, (const unsigned char*)"", from, to, false); return patches; } -/* sort lists using mergesort */ -static cJSON *cJSONUtils_SortList(cJSON *list) +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatchesCaseSensitive(cJSON * const from, cJSON * const to) { - cJSON *first = list; - cJSON *second = list; - cJSON *ptr = list; + cJSON *patches = NULL; - if (!list || !list->next) + if ((from == NULL) || (to == NULL)) { - /* One entry is sorted already. */ - return list; + return NULL; } - while (ptr && ptr->next && (cJSONUtils_strcasecmp((unsigned char*)ptr->string, (unsigned char*)ptr->next->string) < 0)) - { - /* Test for list sorted. */ - ptr = ptr->next; - } - if (!ptr || !ptr->next) - { - /* Leave sorted lists unmodified. 
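A sketch of diffing two documents with cJSONUtils_GeneratePatches(); both inputs get their members sorted as a side effect, as the header notes. The sample values are hypothetical and the printed result is indicative:

    static void demo_generate_patches(void)
    {
        cJSON *from = cJSON_Parse("{\"a\": 1, \"b\": [1, 2, 3]}");
        cJSON *to = cJSON_Parse("{\"b\": [1, 2], \"c\": true}");
        cJSON *patches = cJSONUtils_GeneratePatches(from, to);
        char *printed = cJSON_Print(patches);

        /* roughly: remove /a, remove /b/2, add /c */
        printf("%s\n", printed);

        cJSON_free(printed);
        cJSON_Delete(patches);
        cJSON_Delete(to);
        cJSON_Delete(from);
    }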
*/ - return list; - } + patches = cJSON_CreateArray(); + create_patches(patches, (const unsigned char*)"", from, to, true); - /* reset ptr to the beginning */ - ptr = list; - while (ptr) + return patches; +} + +CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON * const object) +{ + sort_object(object, false); +} + +CJSON_PUBLIC(void) cJSONUtils_SortObjectCaseSensitive(cJSON * const object) +{ + sort_object(object, true); +} + +static cJSON *merge_patch(cJSON *target, const cJSON * const patch, const cJSON_bool case_sensitive) +{ + cJSON *patch_child = NULL; + + if (!cJSON_IsObject(patch)) { - /* Walk two pointers to find the middle. */ - second = second->next; - ptr = ptr->next; - /* advances ptr two steps at a time */ - if (ptr) - { - ptr = ptr->next; - } + /* scalar value, array or NULL, just duplicate */ + cJSON_Delete(target); + return cJSON_Duplicate(patch, 1); } - if (second && second->prev) + + if (!cJSON_IsObject(target)) { - /* Split the lists */ - second->prev->next = NULL; + cJSON_Delete(target); + target = cJSON_CreateObject(); } - /* Recursively sort the sub-lists. */ - first = cJSONUtils_SortList(first); - second = cJSONUtils_SortList(second); - list = ptr = NULL; - - while (first && second) /* Merge the sub-lists */ + patch_child = patch->child; + while (patch_child != NULL) { - if (cJSONUtils_strcasecmp((unsigned char*)first->string, (unsigned char*)second->string) < 0) + if (cJSON_IsNull(patch_child)) { - if (!list) + /* NULL is the indicator to remove a value, see RFC7396 */ + if (case_sensitive) { - /* start merged list with the first element of the first list */ - list = ptr = first; + cJSON_DeleteItemFromObjectCaseSensitive(target, patch_child->string); } else { - /* add first element of first list to merged list */ - ptr->next = first; - first->prev = ptr; - ptr = first; + cJSON_DeleteItemFromObject(target, patch_child->string); } - first = first->next; } else { - if (!list) + cJSON *replace_me = NULL; + cJSON *replacement = NULL; + + if (case_sensitive) { - /* start merged list with the first element of the second list */ - list = ptr = second; + replace_me = cJSON_DetachItemFromObjectCaseSensitive(target, patch_child->string); } else { - /* add first element of second list to merged list */ - ptr->next = second; - second->prev = ptr; - ptr = second; + replace_me = cJSON_DetachItemFromObject(target, patch_child->string); } - second = second->next; - } - } - if (first) - { - /* Append rest of first list. 
*/ - if (!list) - { - return first; - } - ptr->next = first; - first->prev = ptr; - } - if (second) - { - /* Append rest of second list */ - if (!list) - { - return second; + + replacement = merge_patch(replace_me, patch_child, case_sensitive); + if (replacement == NULL) + { + return NULL; + } + + cJSON_AddItemToObject(target, patch_child->string, replacement); } - ptr->next = second; - second->prev = ptr; + patch_child = patch_child->next; } - - return list; + return target; } -CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON *object) +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, const cJSON * const patch) { - object->child = cJSONUtils_SortList(object->child); + return merge_patch(target, patch, false); } -CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, cJSON *patch) +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatchCaseSensitive(cJSON *target, const cJSON * const patch) { - if (!patch || ((patch->type & 0xFF) != cJSON_Object)) - { - /* scalar value, array or NULL, just duplicate */ - cJSON_Delete(target); - return cJSON_Duplicate(patch, 1); - } - - if (!target || ((target->type & 0xFF) != cJSON_Object)) - { - cJSON_Delete(target); - target = cJSON_CreateObject(); - } - - patch = patch->child; - while (patch) - { - if ((patch->type & 0xFF) == cJSON_NULL) - { - /* NULL is the indicator to remove a value, see RFC7396 */ - cJSON_DeleteItemFromObject(target, patch->string); - } - else - { - cJSON *replaceme = cJSON_DetachItemFromObject(target, patch->string); - cJSON_AddItemToObject(target, patch->string, cJSONUtils_MergePatch(replaceme, patch)); - } - patch = patch->next; - } - return target; + return merge_patch(target, patch, true); } -CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON *from, cJSON *to) +static cJSON *generate_merge_patch(cJSON * const from, cJSON * const to, const cJSON_bool case_sensitive) { + cJSON *from_child = NULL; + cJSON *to_child = NULL; cJSON *patch = NULL; - if (!to) + if (to == NULL) { /* patch to delete everything */ return cJSON_CreateNull(); } - if (((to->type & 0xFF) != cJSON_Object) || !from || ((from->type & 0xFF) != cJSON_Object)) + if (!cJSON_IsObject(to) || !cJSON_IsObject(from)) { return cJSON_Duplicate(to, 1); } - cJSONUtils_SortObject(from); - cJSONUtils_SortObject(to); + sort_object(from, case_sensitive); + sort_object(to, case_sensitive); - from = from->child; - to = to->child; + from_child = from->child; + to_child = to->child; patch = cJSON_CreateObject(); - while (from || to) + while (from_child || to_child) { - int compare = from ? (to ? 
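merge_patch() follows RFC 7396: a null member deletes the key, nested objects merge recursively, and anything else replaces the target wholesale. A sketch; the target is consumed, so callers must continue with the returned pointer (cJSONUtils_GenerateMergePatch(), below, produces such patches from two documents):

    static void demo_merge_patch(void)
    {
        cJSON *target = cJSON_Parse("{\"title\": \"Hello\", \"author\": {\"name\": \"bob\"}}");
        cJSON *patch = cJSON_Parse("{\"title\": \"Goodbye\", \"author\": null}");
        char *printed = NULL;

        target = cJSONUtils_MergePatch(target, patch); /* the old target pointer is now invalid */

        printed = cJSON_PrintUnformatted(target);
        printf("%s\n", printed); /* {"title":"Goodbye"} -- the null removed "author" */

        cJSON_free(printed);
        cJSON_Delete(patch);
        cJSON_Delete(target);
    }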
strcmp(from->string, to->string) : -1) : 1; - if (compare < 0) + int diff; + if (from_child != NULL) + { + if (to_child != NULL) + { + diff = strcmp(from_child->string, to_child->string); + } + else + { + diff = -1; + } + } + else + { + diff = 1; + } + + if (diff < 0) { /* from has a value that to doesn't have -> remove */ - cJSON_AddItemToObject(patch, from->string, cJSON_CreateNull()); - from = from->next; + cJSON_AddItemToObject(patch, from_child->string, cJSON_CreateNull()); + + from_child = from_child->next; } - else if (compare > 0) + else if (diff > 0) { /* to has a value that from doesn't have -> add to patch */ - cJSON_AddItemToObject(patch, to->string, cJSON_Duplicate(to, 1)); - to = to->next; + cJSON_AddItemToObject(patch, to_child->string, cJSON_Duplicate(to_child, 1)); + + to_child = to_child->next; } else { /* object key exists in both objects */ - if (cJSONUtils_Compare(from, to)) + if (!compare_json(from_child, to_child, case_sensitive)) { /* not identical --> generate a patch */ - cJSON_AddItemToObject(patch, to->string, cJSONUtils_GenerateMergePatch(from, to)); + cJSON_AddItemToObject(patch, to_child->string, cJSONUtils_GenerateMergePatch(from_child, to_child)); } + /* next key in the object */ - from = from->next; - to = to->next; + from_child = from_child->next; + to_child = to_child->next; } } - if (!patch->child) + if (patch->child == NULL) { + /* no patch generated */ cJSON_Delete(patch); return NULL; } return patch; } + +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON * const from, cJSON * const to) +{ + return generate_merge_patch(from, to, false); +} + +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatchCaseSensitive(cJSON * const from, cJSON * const to) +{ + return generate_merge_patch(from, to, true); +} diff --git a/libs/esl/src/include/esl_cJSON.h b/libs/esl/src/include/esl_cJSON.h index 79dbb98157a..514251c9b24 100644 --- a/libs/esl/src/include/esl_cJSON.h +++ b/libs/esl/src/include/esl_cJSON.h @@ -1,16 +1,13 @@ /* - Copyright (c) 2009 Dave Gamble - + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE @@ -28,10 +25,55 @@ extern "C" { #endif +#if !defined(__WINDOWS__) && (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32)) +#define __WINDOWS__ +#endif + +#ifdef __WINDOWS__ + +/* When compiling for windows, we specify a specific calling convention to avoid issues where we are being called from a project with a different default calling convention. 
For windows you have 3 define options: +CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever dllexport symbols +CJSON_EXPORT_SYMBOLS - Define this on library build when you want to dllexport symbols (default) +CJSON_IMPORT_SYMBOLS - Define this if you want to dllimport symbol +For *nix builds that support visibility attribute, you can define similar behavior by +setting default visibility to hidden by adding +-fvisibility=hidden (for gcc) +or +-xldscope=hidden (for sun cc) +to CFLAGS +then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way CJSON_EXPORT_SYMBOLS does +*/ + +#define CJSON_CDECL __cdecl +#define CJSON_STDCALL __stdcall + +/* export symbols by default, this is necessary for copy pasting the C and header file */ +#if !defined(CJSON_HIDE_SYMBOLS) && !defined(CJSON_IMPORT_SYMBOLS) && !defined(CJSON_EXPORT_SYMBOLS) +#define CJSON_EXPORT_SYMBOLS +#endif + +#if defined(CJSON_HIDE_SYMBOLS) +#define CJSON_PUBLIC(type) type CJSON_STDCALL +#elif defined(CJSON_EXPORT_SYMBOLS) +#define CJSON_PUBLIC(type) __declspec(dllexport) type CJSON_STDCALL +#elif defined(CJSON_IMPORT_SYMBOLS) +#define CJSON_PUBLIC(type) __declspec(dllimport) type CJSON_STDCALL +#endif +#else /* !__WINDOWS__ */ +#define CJSON_CDECL +#define CJSON_STDCALL + +#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined (__SUNPRO_C)) && defined(CJSON_API_VISIBILITY) +#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type +#else +#define CJSON_PUBLIC(type) type +#endif +#endif + /* project version */ #define CJSON_VERSION_MAJOR 1 -#define CJSON_VERSION_MINOR 3 -#define CJSON_VERSION_PATCH 0 +#define CJSON_VERSION_MINOR 7 +#define CJSON_VERSION_PATCH 12 #include @@ -63,7 +105,7 @@ typedef struct cJSON /* The item's string, if type==cJSON_String and type == cJSON_Raw */ char *valuestring; - /* The item's number, if type==cJSON_Number */ + /* writing to valueint is DEPRECATED, use cJSON_SetNumberValue instead */ int valueint; /* The item's number, if type==cJSON_Number */ double valuedouble; @@ -74,45 +116,17 @@ typedef struct cJSON typedef struct cJSON_Hooks { - void *(*malloc_fn)(size_t sz); - void (*free_fn)(void *ptr); + /* malloc/free are CDECL on Windows regardless of the default calling convention of the compiler, so ensure the hooks allow passing those functions directly. */ + void *(CJSON_CDECL *malloc_fn)(size_t sz); + void (CJSON_CDECL *free_fn)(void *ptr); } cJSON_Hooks; -#if !defined(__WINDOWS__) && (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32)) -#define __WINDOWS__ -#endif -#ifdef __WINDOWS__ - -/* When compiling for windows, we specify a specific calling convention to avoid issues where we are being called from a project with a different default calling convention. 
For windows you have 2 define options: - -CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever dllexport symbols -CJSON_EXPORT_SYMBOLS - Define this on library build when you want to dllexport symbols - -For *nix builds that support visibility attribute, you can define similar behavior by - -setting default visibility to hidden by adding --fvisibility=hidden (for gcc) -or --xldscope=hidden (for sun cc) -to CFLAGS - -then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way CJSON_EXPORT_SYMBOLS does - -*/ +typedef int cJSON_bool; -#if defined(CJSON_HIDE_SYMBOLS) -#define CJSON_PUBLIC(type) type __stdcall -#elif defined(CJSON_EXPORT_SYMBOLS) -#define CJSON_PUBLIC(type) __declspec(dllexport) type __stdcall -#else -#define CJSON_PUBLIC(type) __declspec(dllimport) type __stdcall -#endif -#else /* !WIN32 */ -#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined (__SUNPRO_C)) && defined(CJSON_API_VISIBILITY) -#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type -#else -#define CJSON_PUBLIC(type) type -#endif +/* Limits how deeply nested arrays/objects can be before cJSON rejects to parse them. + * This is to prevent stack overflows. */ +#ifndef CJSON_NESTING_LIMIT +#define CJSON_NESTING_LIMIT 1000 #endif /* returns the version of cJSON as a string */ @@ -121,36 +135,56 @@ CJSON_PUBLIC(const char*) cJSON_Version(void); /* Supply malloc, realloc and free functions to cJSON */ CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks* hooks); - -/* Supply a block of JSON, and this returns a cJSON object you can interrogate. Call cJSON_Delete when finished. */ +/* Memory Management: the caller is always responsible to free the results from all variants of cJSON_Parse (with cJSON_Delete) and cJSON_Print (with stdlib free, cJSON_Hooks.free_fn, or cJSON_free as appropriate). The exception is cJSON_PrintPreallocated, where the caller has full responsibility of the buffer. */ +/* Supply a block of JSON, and this returns a cJSON object you can interrogate. */ CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value); -/* Render a cJSON entity to text for transfer/storage. Free the char* when finished. */ +/* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed. */ +/* If you supply a ptr in return_parse_end and parsing fails, then return_parse_end will contain a pointer to the error so will match cJSON_GetErrorPtr(). */ +CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated); + +/* Render a cJSON entity to text for transfer/storage. */ CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item); -/* Render a cJSON entity to text for transfer/storage without any formatting. Free the char* when finished. */ +/* Render a cJSON entity to text for transfer/storage without any formatting. */ CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item); /* Render a cJSON entity to text using a buffered strategy. prebuffer is a guess at the final size. guessing well reduces reallocation. fmt=0 gives unformatted, =1 gives formatted */ -CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, int fmt); -/* Render a cJSON entity to text using a buffer already allocated in memory with length buf_len. Returns 1 on success and 0 on failure. 
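The parse/print contract spelled out above, as a short sketch (sample input hypothetical): cJSON_ParseWithOpts() can reject trailing bytes and report where parsing stopped, and printed buffers are released through cJSON_free():

    static void demo_parse_with_opts(void)
    {
        const char *json = "{\"valid\": true} trailing garbage";
        const char *parse_end = NULL;
        cJSON *item = cJSON_ParseWithOpts(json, &parse_end, 1);

        if (item == NULL)
        {
            /* require_null_terminated = 1 turns the trailing text into an error */
            printf("parse failed near: %.20s\n", parse_end);
        }

        item = cJSON_ParseWithOpts(json, &parse_end, 0); /* 0: stop after the first value */
        if (item != NULL)
        {
            char *printed = cJSON_Print(item);
            cJSON_free(printed); /* printed buffers go back through cJSON_free */
            cJSON_Delete(item);
        }
    }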
*/ -CJSON_PUBLIC(int) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const int fmt); +CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt); +/* Render a cJSON entity to text using a buffer already allocated in memory with given length. Returns 1 on success and 0 on failure. */ +/* NOTE: cJSON is not always 100% accurate in estimating how much memory it will use, so to be safe allocate 5 bytes more than you actually need */ +CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buffer, const int length, const cJSON_bool format); /* Delete a cJSON entity and all subentities. */ CJSON_PUBLIC(void) cJSON_Delete(cJSON *c); /* Returns the number of items in an array (or object). */ CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array); -/* Retrieve item number "item" from array "array". Returns NULL if unsuccessful. */ -CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int item); +/* Retrieve item number "index" from array "array". Returns NULL if unsuccessful. */ +CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index); /* Get item "string" from object. Case insensitive. */ -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON *object, const char *string); -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON *object, const char *string); -CJSON_PUBLIC(int) cJSON_HasObjectItem(const cJSON *object, const char *string); +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string); +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string); +CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string); /* For analysing failed parses. This returns a pointer to the parse error. You'll probably need to look a few chars back to make sense of it. Defined when cJSON_Parse() returns 0. 0 when cJSON_Parse() succeeds. */ CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void); +/* Check if the item is a string and return its valuestring */ +CJSON_PUBLIC(char *) cJSON_GetStringValue(cJSON *item); + +/* These functions check the type of an item */ +CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item); + /* These calls create a cJSON item of the appropriate type. 
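The new type predicates replace direct `type & 0xFF` checks, and cJSON_PrintPreallocated() lets the caller own the output buffer. A sketch honoring the 5-byte safety margin noted above (names and sizes hypothetical):

    static void demo_predicates_and_print(void)
    {
        char buffer[256 + 5]; /* a few bytes beyond the estimate, per the note above */
        cJSON *item = cJSON_Parse("{\"name\": \"ivan\"}");
        cJSON *name = cJSON_GetObjectItemCaseSensitive(item, "name");

        if (cJSON_IsString(name) && (cJSON_GetStringValue(name) != NULL))
        {
            printf("name: %s\n", cJSON_GetStringValue(name));
        }

        if (cJSON_PrintPreallocated(item, buffer, (int)sizeof(buffer), 1))
        {
            printf("%s\n", buffer); /* caller owns buffer; nothing to free */
        }

        cJSON_Delete(item);
    }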
*/
 CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void);
 CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void);
 CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void);
-CJSON_PUBLIC(cJSON *) cJSON_CreateBool(int b);
+CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool boolean);
 CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num);
 CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string);
 /* raw json */
@@ -158,6 +192,14 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw);
 CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void);
 CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void);
 
+/* Create a string where valuestring references a string so
+ * it will not be freed by cJSON_Delete */
+CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string);
+/* Create an object/array that only references its elements so
+ * they will not be freed by cJSON_Delete */
+CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child);
+CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child);
+
 /* These utilities create an Array of count items. */
 CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count);
 CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count);
@@ -176,36 +218,44 @@ CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item);
 CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item);
 
 /* Remove/Detach items from Arrays/Objects. */
+CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item);
 CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which);
 CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which);
 CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string);
+CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string);
 CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string);
+CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string);
 
 /* Update array items. */
 CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem); /* Shifts pre-existing items to the right. */
+CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * replacement);
 CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem);
 CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object,const char *string,cJSON *newitem);
+CJSON_PUBLIC(void) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object,const char *string,cJSON *newitem);
 
 /* Duplicate a cJSON item */
-CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, int recurse);
+CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse);
 /* Duplicate will create a new, identical cJSON item to the one you pass, in new memory that will
 need to be released. With recurse!=0, it will duplicate any children connected to the item.
 The item->next and ->prev pointers are always zero on return from Duplicate. */
+/* Recursively compare two cJSON items for equality. If either a or b is NULL or invalid, they will be considered unequal.
+ * case_sensitive determines if object keys are treated case sensitive (1) or case insensitive (0) */
+CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive);
 
-/* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed.
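The reference constructors let long-lived caller memory back a tree without being copied or freed. A minimal sketch for cJSON_CreateStringReference() (names hypothetical):

    static const char greeting[] = "hello world"; /* outlives the item */

    static void demo_string_reference(void)
    {
        cJSON *object = cJSON_CreateObject();
        char *printed = NULL;

        cJSON_AddItemToObject(object, "greeting", cJSON_CreateStringReference(greeting));

        printed = cJSON_PrintUnformatted(object);
        printf("%s\n", printed); /* {"greeting":"hello world"} */

        cJSON_free(printed);
        cJSON_Delete(object); /* frees the item, but not the referenced string */
    }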
*/ -/* If you supply a ptr in return_parse_end and parsing fails, then return_parse_end will contain a pointer to the error. If not, then cJSON_GetErrorPtr() does the job. */ -CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, int require_null_terminated); CJSON_PUBLIC(void) cJSON_Minify(char *json); -/* Macros for creating things quickly. */ -#define cJSON_AddNullToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateNull()) -#define cJSON_AddTrueToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateTrue()) -#define cJSON_AddFalseToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateFalse()) -#define cJSON_AddBoolToObject(object,name,b) cJSON_AddItemToObject(object, name, cJSON_CreateBool(b)) -#define cJSON_AddNumberToObject(object,name,n) cJSON_AddItemToObject(object, name, cJSON_CreateNumber(n)) -#define cJSON_AddStringToObject(object,name,s) cJSON_AddItemToObject(object, name, cJSON_CreateString(s)) -#define cJSON_AddRawToObject(object,name,s) cJSON_AddItemToObject(object, name, cJSON_CreateRaw(s)) +/* Helper functions for creating and adding items to an object at the same time. + * They return the added item or NULL on failure. */ +CJSON_PUBLIC(cJSON*) cJSON_AddNullToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean); +CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number); +CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string); +CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw); +CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name); /* When assigning an integer value, it needs to be propagated to valuedouble too. */ #define cJSON_SetIntValue(object, number) ((object) ? (object)->valueint = (object)->valuedouble = (number) : (number)) @@ -213,9 +263,13 @@ CJSON_PUBLIC(void) cJSON_Minify(char *json); CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number); #define cJSON_SetNumberValue(object, number) ((object != NULL) ? cJSON_SetNumberHelper(object, (double)number) : (number)) -/* Macro for iterating over an array */ +/* Macro for iterating over an array or object */ #define cJSON_ArrayForEach(element, array) for(element = (array != NULL) ? 
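The macro-to-function change matters because the Add*ToObject helpers now return the added item, so allocation failures become checkable. A sketch combining them with the iteration macro (function name and values hypothetical):

    static void demo_build_and_iterate(void)
    {
        cJSON *config = cJSON_CreateObject();
        cJSON *ports = NULL;
        cJSON *port = NULL;

        if (cJSON_AddStringToObject(config, "host", "127.0.0.1") == NULL)
        {
            cJSON_Delete(config); /* allocation failed */
            return;
        }
        cJSON_AddNumberToObject(config, "timeout", 30);

        ports = cJSON_AddArrayToObject(config, "ports");
        cJSON_AddItemToArray(ports, cJSON_CreateNumber(5060));
        cJSON_AddItemToArray(ports, cJSON_CreateNumber(5080));

        cJSON_ArrayForEach(port, ports)
        {
            printf("port: %d\n", port->valueint);
        }

        cJSON_Delete(config);
    }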
(array)->child : NULL; element != NULL; element = element->next) +/* malloc/free objects using the malloc/free functions that have been set with cJSON_InitHooks */ +CJSON_PUBLIC(void *) cJSON_malloc(size_t size); +CJSON_PUBLIC(void) cJSON_free(void *object); + #ifdef __cplusplus } #endif diff --git a/libs/esl/src/include/esl_cJSON_Utils.h b/libs/esl/src/include/esl_cJSON_Utils.h index 4418c8c5552..95c5a37edd4 100644 --- a/libs/esl/src/include/esl_cJSON_Utils.h +++ b/libs/esl/src/include/esl_cJSON_Utils.h @@ -1,14 +1,45 @@ +/* + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef cJSON_Utils__h +#define cJSON_Utils__h + +#ifdef __cplusplus +extern "C" +{ +#endif + #include "esl_cJSON.h" /* Implement RFC6901 (https://tools.ietf.org/html/rfc6901) JSON Pointer spec. */ -CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON *object, const char *pointer); +CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON * const object, const char *pointer); +CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointerCaseSensitive(cJSON * const object, const char *pointer); /* Implement RFC6902 (https://tools.ietf.org/html/rfc6902) JSON Patch spec. */ -CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON *from, cJSON *to); +/* NOTE: This modifies objects in 'from' and 'to' by sorting the elements by their key */ +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON * const from, cJSON * const to); +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatchesCaseSensitive(cJSON * const from, cJSON * const to); /* Utility for generating patch array entries. */ -CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON *array, const char *op, const char *path, cJSON *val); +CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON * const array, const char * const operation, const char * const path, const cJSON * const value); /* Returns 0 for success. */ -CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON *object, cJSON *patches); +CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON * const object, const cJSON * const patches); +CJSON_PUBLIC(int) cJSONUtils_ApplyPatchesCaseSensitive(cJSON * const object, const cJSON * const patches); /* // Note that ApplyPatches is NOT atomic on failure. To implement an atomic ApplyPatches, use: @@ -33,12 +64,22 @@ CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON *object, cJSON *patches); /* Implement RFC7386 (https://tools.ietf.org/html/rfc7396) JSON Merge Patch spec. */ /* target will be modified by patch. return value is new ptr for target. 
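The header's comment warns that ApplyPatches is not atomic on failure; the usual workaround it alludes to is to patch a duplicate and only swap it in on success. A sketch under that assumption (wrapper name hypothetical):

    static int apply_patches_atomically(cJSON **object, const cJSON *patches)
    {
        cJSON *modified = cJSON_Duplicate(*object, 1);
        int status;

        if (modified == NULL)
        {
            return -1; /* out of memory */
        }

        status = cJSONUtils_ApplyPatches(modified, patches);
        if (status != 0)
        {
            cJSON_Delete(modified); /* the original stays untouched */
            return status;
        }

        cJSON_Delete(*object);
        *object = modified;
        return 0;
    }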
*/ -CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, cJSON *patch); +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, const cJSON * const patch); +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatchCaseSensitive(cJSON *target, const cJSON * const patch); /* generates a patch to move from -> to */ -CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON *from, cJSON *to); +/* NOTE: This modifies objects in 'from' and 'to' by sorting the elements by their key */ +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON * const from, cJSON * const to); +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatchCaseSensitive(cJSON * const from, cJSON * const to); /* Given a root object and a target object, construct a pointer from one to the other. */ -CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *target); +CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(const cJSON * const object, const cJSON * const target); /* Sorts the members of the object into alphabetical order. */ -CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON *object); +CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON * const object); +CJSON_PUBLIC(void) cJSONUtils_SortObjectCaseSensitive(cJSON * const object); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libs/iksemel/src/ikstack.c b/libs/iksemel/src/ikstack.c index 9771a41e8b0..b5fd75ef682 100644 --- a/libs/iksemel/src/ikstack.c +++ b/libs/iksemel/src/ikstack.c @@ -32,7 +32,7 @@ static ikschunk * find_space (ikstack *s, ikschunk *c, size_t size) { /* FIXME: dont use *2 after over allocated chunks */ - while (1) { + while (c) { if (c->size - c->used >= size) return c; if (!c->next) { if ((c->size * 2) > size) size = c->size * 2; diff --git a/libs/iksemel/src/sax.c b/libs/iksemel/src/sax.c index 338a5cf7f94..beb3621588d 100644 --- a/libs/iksemel/src/sax.c +++ b/libs/iksemel/src/sax.c @@ -213,22 +213,16 @@ sax_core (iksparser *prs, char *buf, int len) goto cont; } else { if (c & 0x80) { - unsigned char mask; if ((c & 0x60) == 0x40) { prs->uni_max = 2; - mask = 0x1F; } else if ((c & 0x70) == 0x60) { prs->uni_max = 3; - mask = 0x0F; } else if ((c & 0x78) == 0x70) { prs->uni_max = 4; - mask = 0x07; } else if ((c & 0x7C) == 0x78) { prs->uni_max = 5; - mask = 0x03; } else if ((c & 0x7E) == 0x7C) { prs->uni_max = 6; - mask = 0x01; } else { return IKS_BADXML; } diff --git a/libs/iksemel/test/tst-ikstack.c b/libs/iksemel/test/tst-ikstack.c index 1a761ad708b..d123e200472 100644 --- a/libs/iksemel/test/tst-ikstack.c +++ b/libs/iksemel/test/tst-ikstack.c @@ -36,7 +36,7 @@ test_stack (int cs) memset (mem, 'x', i); old = iks_stack_strcat (s, old, 0, buf + i, 1); } - if (strcmp (old, buf) != 0) { + if (old && strcmp (old, buf) != 0) { printf ("ikstack strcat bug:\nExpected: %s\n Result: %s\n", buf, old); exit (1); } diff --git a/libs/iksemel/test/tst-sax.c b/libs/iksemel/test/tst-sax.c index 77a5c13e95d..c0a041aee1c 100644 --- a/libs/iksemel/test/tst-sax.c +++ b/libs/iksemel/test/tst-sax.c @@ -160,7 +160,7 @@ tagHook (void *udata, char *name, char **atts, int type) nr = tester.cur->nr_atts; while (nr) { flag = 0; - for (i = 0;atts[i]; i+= 2) { + for (i = 0;atts&&atts[i]; i+= 2) { if (iks_strcmp (atts[i], tester.cur->atts[nr-1]) == 0 && iks_strcmp (atts[i+1], tester.cur->vals[nr-1]) == 0) { flag = 1; break; diff --git a/libs/iksemel/tools/hash.c b/libs/iksemel/tools/hash.c index 8ba83a0eeac..c4b1179a873 100644 --- a/libs/iksemel/tools/hash.c +++ b/libs/iksemel/tools/hash.c @@ -117,7 +117,7 @@ hash_print (hash *h, char *title_fmt, char 
*line_fmt) struct item **tags, *t; unsigned int i = 0, pos = 0; - tags = calloc (sizeof (struct tag *), h->count); + tags = calloc (sizeof (struct item *), h->count); for (; i < h->size; i ++) { for (t = h->table[i]; t; t = t->next) { diff --git a/libs/iksemel/tools/iksperf.c b/libs/iksemel/tools/iksperf.c index 81e98ad1c4f..4bcd4b3e57a 100644 --- a/libs/iksemel/tools/iksperf.c +++ b/libs/iksemel/tools/iksperf.c @@ -207,7 +207,6 @@ serialize_test (char *buf, int len) unsigned long time; iks *x; iksparser *prs; - char *xml; int err; prs = iks_dom_new (&x); @@ -228,7 +227,7 @@ serialize_test (char *buf, int len) t_reset (); - xml = iks_string (iks_stack (x), x); + iks_string (iks_stack (x), x); time = t_elapsed (); diff --git a/libs/libdingaling/src/libdingaling.c b/libs/libdingaling/src/libdingaling.c index 8efe5c4af53..6fbedfda6ed 100644 --- a/libs/libdingaling/src/libdingaling.c +++ b/libs/libdingaling/src/libdingaling.c @@ -499,7 +499,10 @@ static ldl_status parse_session_code(ldl_handle_t *handle, char *id, char *from, unsigned int *candidate_len = NULL; ldl_candidate_t (*candidates)[LDL_MAX_CANDIDATES] = NULL; - if ((key = iks_find_attrib(tag, "preference"))) { + if (!(key = iks_find_attrib(tag, "preference"))) { + globals.logger(DL_LOG_WARNING, "Field preference was not set\n"); + continue; + } else { unsigned int x; pref = strtod(key, NULL); @@ -927,7 +930,7 @@ static int on_disco_default(void *user_data, ikspak *pak) char *node = NULL; char *ns = NULL; ldl_handle_t *handle = user_data; - iks *iq, *query, *tag; + iks *iq = NULL, *query, *tag; uint8_t send = 0; int x; @@ -936,10 +939,10 @@ static int on_disco_default(void *user_data, ikspak *pak) node = iks_find_attrib(pak->query, "node"); } - if (pak->subtype == IKS_TYPE_RESULT) { + if (pak && pak->subtype == IKS_TYPE_RESULT) { globals.logger(DL_LOG_CRIT, "FixME!!! 
node=[%s]\n", node?node:""); - } else if (pak->subtype == IKS_TYPE_GET) { - if ((iq = iks_new("iq"))) { + } else if (pak && pak->subtype == IKS_TYPE_GET) { + if (ns && (iq = iks_new("iq"))) { int all = 0; iks_insert_attrib(iq, "from", handle->login); @@ -1081,7 +1084,6 @@ static int on_presence(void *user_data, ikspak *pak) if (ext && strstr(ext, "voice-v1") && (buffer = apr_hash_get(handle->probe_hash, id, APR_HASH_KEY_STRING))) { apr_cpystrn(buffer->buf, from, buffer->len); buffer->hit = 1; - done = 1; } } } @@ -1149,7 +1151,6 @@ static ldl_avatar_t *ldl_get_avatar(ldl_handle_t *handle, char *path, char *from bytes = read(fd, image, sizeof(image)); close(fd); - fd = -1; ap = malloc(sizeof(*ap)); assert(ap != NULL); @@ -2565,7 +2566,6 @@ unsigned int ldl_session_candidates(ldl_session_t *session, iq = NULL; sess = NULL; tag = NULL; - x = 0; id = 0; } diff --git a/libs/libnatpmp/natpmp.c b/libs/libnatpmp/natpmp.c index e3ab64af7dd..0e2c4ea0b49 100644 --- a/libs/libnatpmp/natpmp.c +++ b/libs/libnatpmp/natpmp.c @@ -195,7 +195,7 @@ int sendnewportmappingrequest(natpmp_t * p, int protocol, int readnatpmpresponse(natpmp_t * p, natpmpresp_t * response) { unsigned char buf[16]; - struct sockaddr_in addr; + struct sockaddr_in addr = {0}; socklen_t addrlen = sizeof(addr); int n; if(!p) diff --git a/libs/libvpx/AUTHORS b/libs/libvpx/AUTHORS index 04c28724329..2f1f8a6946a 100644 --- a/libs/libvpx/AUTHORS +++ b/libs/libvpx/AUTHORS @@ -4,12 +4,13 @@ Aaron Watry Abo Talib Mahfoodh Adrian Grange -Aℓex Converse Ahmad Sharif +Aidan Welch Aleksey Vasenev Alexander Potapenko Alexander Voronov Alexandra Hájková +Aℓex Converse Alexis Ballier Alok Ahuja Alpha Lam @@ -26,11 +27,13 @@ Brion Vibber changjun.yang Charles 'Buck' Krasic Cheng Chen +Chi Yo Tsai chm Chris Cunningham Christian Duvivier Daniele Castagna Daniel Kang +Dan Zhu Deb Mukherjee Deepa K G Dim Temp @@ -38,11 +41,13 @@ Dmitry Kovalev Dragan Mrdjan Ed Baker Ehsan Akhgari +Elliott Karpilovsky Erik Niemeyer Fabio Pedretti Frank Galligan Fredrik Söderquist Fritz Koenig +Fyodor Kyslov Gabriel Marin Gaute Strokkenes Geza Lore @@ -55,7 +60,9 @@ Guillermo Ballester Valor Hangyu Kuang Hanno Böck Han Shen +Harish Mahendrakar Henrik Lundin +Hien Ho Hui Su Ivan Krasin Ivan Maltz @@ -81,6 +88,7 @@ Johann Koenig John Koleszar Johnny Klonaris John Stark +Jon Kunkee Joshua Bleecher Snyder Joshua Litt Julia Robson @@ -91,15 +99,19 @@ KO Myung-Hun Kyle Siefring Lawrence Velázquez Linfeng Zhang +Liu Peng Lou Quillio Luca Barbato +Luc Trudeau Makoto Kato Mans Rullgard Marco Paniconi Mark Mentovai Martin Ettl -Martin Storsjo +Martin Storsjö Matthew Heaney +Matthias Räncker +Michael Horowitz Michael Kohler Mike Frysinger Mike Hommey @@ -107,10 +119,12 @@ Mikhal Shemer Min Chen Minghai Shang Min Ye +Mirko Bonadei Moriyoshi Koizumi Morton Jonuschat Nathan E. Egge Nico Weber +Niveditha Rau Parag Salasakar Pascal Massimino Patrik Westin @@ -129,9 +143,13 @@ Rafael de Lucena Valle Rahul Chaudhry Ralph Giles Ranjit Kumar Tulabandu +Raphael Kubo da Costa +Ravi Chaudhary +Ritu Baldwa Rob Bradford Ronald S. Bultje Rui Ueyama +Sai Deng Sami Pietilä Sarah Parker Sasi Inguva @@ -139,12 +157,15 @@ Scott Graham Scott LaVarnway Sean McGovern Sergey Kolomenkin +Sergey Silkin Sergey Ulanov Shimon Doodkin Shiyou Yin +Shubham Tandle Shunyao Li Stefan Holmer Suman Sunkara +Supradeep T R Sylvestre Ledru Taekhyun Kim Takanori MATSUURA @@ -157,11 +178,15 @@ Timothy B. Terriberry Tom Finegan Tristan Matthews Urvang Joshi +Venkatarama NG. 
Avadhani Vignesh Venkatasubramanian Vlad Tsyrklevich +Wan-Teh Chang +xiwei gu Yaowu Xu Yi Luo Yongzhe Wang +Yue Chen Yunqing Wang Yury Gitman Zoe Liu diff --git a/libs/libvpx/CHANGELOG b/libs/libvpx/CHANGELOG index 2281394c8ed..a7d8311c5f9 100644 --- a/libs/libvpx/CHANGELOG +++ b/libs/libvpx/CHANGELOG @@ -1,4 +1,63 @@ -2017-01-04 v1.7.0 "Mandarin Duck" +2019-07-15 v1.8.1 "Orpington Duck" + This release collects incremental improvements to many aspects of the library. + + - Upgrading: + VP8E_SET_CPUUSED now accepts values up to 9 for vp9. + VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E). + The --sdk-path option has been removed. If you were using it to build for + Android please read build/make/Android.mk for alternatives. + All PPC optimizations have been disabled: + https://bugs.chromium.org/p/webm/issues/detail?id=1522. + + - Enhancements: + Various changes to improve encoder rate control, quality and speed + for practically every use case. + + - Bug fixes: + vp9-rtc: Fix color artifacts for speed >= 8. + +2019-01-31 v1.8.0 "Northern Shoveler Duck" + This release focused on encoding performance for realtime and VOD use cases. + + - Upgrading: + This adds and improves several vp9 controls. Most are related to SVC: + VP9E_SET_SVC_FRAME_DROP_LAYER: + - Frame dropping in SVC. + VP9E_SET_SVC_INTER_LAYER_PRED: + - Inter-layer prediction in SVC. + VP9E_SET_SVC_GF_TEMPORAL_REF: + - Enable long term temporal reference in SVC. + VP9E_SET_SVC_REF_FRAME_CONFIG/VP9E_GET_SVC_REF_FRAME_CONFIG: + - Extend and improve this control for better flexibility in setting SVC + pattern dynamically. + VP9E_SET_POSTENCODE_DROP: + - Allow for post-encode frame dropping (applies to non-SVC too). + VP9E_SET_SVC_SPATIAL_LAYER_SYNC: + - Enable spatial layer sync frames. + VP9E_SET_SVC_LAYER_ID: + - Extend api to specify temporal id for each spatial layer. + VP9E_SET_ROI_MAP: + - Extend Region of Interest functionality to VP9. + + - Enhancements: + 2 pass vp9 encoding has improved substantially. When using --auto-alt-ref=6, + we see approximately 8% for VBR and 10% for CQ. When using --auto-alt-ref=1, + the gains are approximately 4% for VBR and 5% for CQ. + + For real-time encoding, speed 7 has improved by ~5-10%. Encodes targeted at + screen sharing have improved when the content changes significantly (slide + sharing) or scrolls. There is a new speed 9 setting for mobile devices which + is about 10-20% faster than speed 8. + + - Bug fixes: + VP9 denoiser issue. + VP9 partition issue for 1080p. + VP9 rate control improvements. + Postprocessing Multi Frame Quality Enhancement (MFQE) issue. + VP8 multithread decoder issues. + A variety of fuzzing issues. + +2018-01-04 v1.7.0 "Mandarin Duck" This release focused on high bit depth performance (10/12 bit) and vp9 encoding improvements. diff --git a/libs/libvpx/README b/libs/libvpx/README index 73304dd62f4..a1000e08509 100644 --- a/libs/libvpx/README +++ b/libs/libvpx/README @@ -1,4 +1,4 @@ -README - 24 January 2018 +README - 15 July 2019 Welcome to the WebM VP8/VP9 Codec SDK! @@ -9,22 +9,26 @@ COMPILING THE APPLICATIONS/LIBRARIES: 1. Prerequisites - * All x86 targets require the Yasm[1] assembler be installed. - * All Windows builds require that Cygwin[2] be installed. - * Building the documentation requires Doxygen[3]. If you do not + * All x86 targets require the Yasm[1] assembler be installed[2]. + * All Windows builds require that Cygwin[3] be installed. + * Building the documentation requires Doxygen[4].
If you do not have this package, the install-docs option will be disabled. - * Downloading the data for the unit tests requires curl[4] and sha1sum. + * Downloading the data for the unit tests requires curl[5] and sha1sum. sha1sum is provided via the GNU coreutils, installed by default on many *nix platforms, as well as MinGW and Cygwin. If coreutils is not available, a compatible version of sha1sum can be built from - source[5]. These requirements are optional if not running the unit + source[6]. These requirements are optional if not running the unit tests. [1]: http://www.tortall.net/projects/yasm - [2]: http://www.cygwin.com - [3]: http://www.doxygen.org - [4]: http://curl.haxx.se - [5]: http://www.microbrew.org/tools/md5sha1sum/ + [2]: For Visual Studio the base yasm binary (not vsyasm) should be in the + PATH for Visual Studio. For VS2017 it is sufficient to rename + yasm-<version>-<arch>.exe to yasm.exe and place it in: + Program Files (x86)/Microsoft Visual Studio/2017/<level>/Common7/Tools/ + [3]: http://www.cygwin.com + [4]: http://www.doxygen.org + [5]: http://curl.haxx.se + [6]: http://www.microbrew.org/tools/md5sha1sum/ 2. Out-of-tree builds Out of tree builds are a supported method of building the application. For @@ -41,7 +45,16 @@ COMPILING THE APPLICATIONS/LIBRARIES: used to get a list of supported options: $ ../libvpx/configure --help - 4. Cross development + 4. Compiler analyzers + Compilers have added sanitizers which instrument binaries with information + about address calculation, memory usage, threading, undefined behavior, and + other common errors. To simplify building libvpx with some of these features + use tools/set_analyzer_env.sh before running configure. It will set the + compiler and necessary flags for building as well as environment variables + read by the analyzer when testing the binaries. + $ source ../libvpx/tools/set_analyzer_env.sh address + + 5. Cross development For cross development, the most notable option is the --target option. The most up-to-date list of supported targets can be found at the bottom of the --help output of the configure script. As of this writing, the list of @@ -50,20 +63,20 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc arm64-linux-gcc + arm64-win64-gcc + arm64-win64-vs15 armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct armv7-linux-gcc armv7-none-rvct - armv7-win32-vs11 - armv7-win32-vs12 + armv7-win32-gcc armv7-win32-vs14 armv7-win32-vs15 armv7s-darwin-gcc armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc - ppc64-linux-gcc ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc @@ -78,15 +91,13 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin14-gcc x86-darwin15-gcc x86-darwin16-gcc + x86-darwin17-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc x86-os2-gcc x86-solaris-gcc x86-win32-gcc - x86-win32-vs10 - x86-win32-vs11 - x86-win32-vs12 x86-win32-vs14 x86-win32-vs15 x86_64-android-gcc @@ -98,14 +109,12 @@ x86_64-darwin14-gcc x86_64-darwin15-gcc x86_64-darwin16-gcc + x86_64-darwin17-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc x86_64-solaris-gcc x86_64-win64-gcc - x86_64-win64-vs10 - x86_64-win64-vs11 - x86_64-win64-vs12 x86_64-win64-vs14 x86_64-win64-vs15 generic-gnu @@ -123,7 +132,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: environment variables: CC, AR, LD, AS, STRIP, NM. Additional flags can be passed to these executables with CFLAGS, LDFLAGS, and ASFLAGS. - 5. Configuration errors + 6.
Configuration errors If the configuration step fails, the first step is to look in the error log. This defaults to config.log. This should give a good indication of what went wrong. If not, contact us for support. diff --git a/libs/libvpx/args.h b/libs/libvpx/args.h index 54abe04607d..aae8ec06a53 100644 --- a/libs/libvpx/args.h +++ b/libs/libvpx/args.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef ARGS_H_ -#define ARGS_H_ +#ifndef VPX_ARGS_H_ +#define VPX_ARGS_H_ #include <stdarg.h> #ifdef __cplusplus @@ -60,4 +60,4 @@ int arg_parse_enum_or_int(const struct arg *arg); } // extern "C" #endif -#endif // ARGS_H_ +#endif // VPX_ARGS_H_ diff --git a/libs/libvpx/build/.gitattributes b/libs/libvpx/build/.gitattributes deleted file mode 100644 index 03db79bc088..00000000000 --- a/libs/libvpx/build/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -*-vs8/*.rules -crlf -*-msvs/*.rules -crlf diff --git a/libs/libvpx/build/.gitignore b/libs/libvpx/build/.gitignore deleted file mode 100644 index 1350fcb5eba..00000000000 --- a/libs/libvpx/build/.gitignore +++ /dev/null @@ -1 +0,0 @@ -x86*-win32-vs* diff --git a/libs/libvpx/build/make/Android.mk b/libs/libvpx/build/make/Android.mk index a88f90056e4..6cb3af027b4 100644 --- a/libs/libvpx/build/make/Android.mk +++ b/libs/libvpx/build/make/Android.mk @@ -14,7 +14,7 @@ # Run the configure script from the jni directory. Base libvpx # encoder/decoder configuration will look similar to: # ./libvpx/configure --target=armv7-android-gcc --disable-examples \ -# --sdk-path=/opt/android-ndk-r6b/ +# --enable-external-build # # When targeting Android, realtime-only is enabled by default. This can # be overridden by adding the command line flag: @@ -29,37 +29,20 @@ # include $(CLEAR_VARS) # include jni/libvpx/build/make/Android.mk # -# By default libvpx will detect at runtime the existance of NEON extension. -# For this we import the 'cpufeatures' module from the NDK sources. -# libvpx can also be configured without this runtime detection method. -# Configuring with --disable-runtime-cpu-detect will assume presence of NEON. -# Configuring with --disable-runtime-cpu-detect --disable-neon \ -# --disable-neon-asm -# will remove any NEON dependency. +# By default libvpx will use the 'cpufeatures' module from the NDK. This allows +# the library to be built with all available optimizations (SSE2->AVX512 for +# x86, NEON for arm, DSPr2 for mips). This can be disabled with +# --disable-runtime-cpu-detect +# but the resulting library *must* be run on devices supporting all of the +# enabled extensions. They can be disabled individually with +# --disable-{sse2, sse3, ssse3, sse4_1, avx, avx2, avx512} +# --disable-neon[-asm] +# --disable-{dspr2, msa} # # Running ndk-build will build libvpx and include it in your project.
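For reference, the runtime probe that the NDK 'cpufeatures' module enables boils down to a query like the following minimal C sketch (the helper name have_neon is ours, not libvpx's; it covers the 32-bit ARM case where NEON is optional, and assumes the cpufeatures module is linked into the build):

    #include <cpu-features.h>  /* NDK 'cpufeatures' module */

    /* Illustrative only: non-zero when the running device supports NEON.
     * This is the kind of check that --disable-runtime-cpu-detect removes,
     * which is why the resulting library must then only run on devices
     * supporting every enabled extension. */
    static int have_neon(void) {
      return android_getCpuFamily() == ANDROID_CPU_FAMILY_ARM &&
             (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
    }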
# -# Alternatively, building the examples and unit tests can be accomplished in the -# following way: -# -# Create a standalone toolchain from the NDK: -# https://developer.android.com/ndk/guides/standalone_toolchain.html -# -# For example - to test on arm64 devices with clang: -# $NDK/build/tools/make_standalone_toolchain.py \ -# --arch arm64 --install-dir=/tmp/my-android-toolchain -# export PATH=/tmp/my-android-toolchain/bin:$PATH -# CROSS=aarch64-linux-android- CC=clang CXX=clang++ /path/to/libvpx/configure \ -# --target=arm64-android-gcc -# -# Push the resulting binaries to a device and run them: -# adb push test_libvpx /data/tmp/test_libvpx -# adb shell /data/tmp/test_libvpx --gtest_filter=\*Sixtap\* -# -# Make sure to push the test data as well and set LIBVPX_TEST_DATA - CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas diff --git a/libs/libvpx/build/make/Makefile b/libs/libvpx/build/make/Makefile index f6b3f0630f1..c070cd0e0c6 100644 --- a/libs/libvpx/build/make/Makefile +++ b/libs/libvpx/build/make/Makefile @@ -99,6 +99,7 @@ distclean: clean rm -f Makefile; \ rm -f config.log config.mk; \ rm -f vpx_config.[hc] vpx_config.asm; \ + rm -f arm_neon.h; \ else \ rm -f $(target)-$(TOOLCHAIN).mk; \ fi diff --git a/libs/libvpx/build/make/ads2gas.pl b/libs/libvpx/build/make/ads2gas.pl index 029cc4a56f2..b6a8f53eae2 100755 --- a/libs/libvpx/build/make/ads2gas.pl +++ b/libs/libvpx/build/make/ads2gas.pl @@ -23,16 +23,17 @@ use thumb; my $thumb = 0; +my $elf = 1; foreach my $arg (@ARGV) { $thumb = 1 if ($arg eq "-thumb"); + $elf = 0 if ($arg eq "-noelf"); } print "@ This file was created from a .asm file\n"; print "@ using the ads2gas.pl script.\n"; -print "\t.equ DO1STROUNDING, 0\n"; +print "\t.syntax unified\n"; if ($thumb) { - print "\t.syntax unified\n"; print "\t.thumb\n"; } @@ -140,7 +141,11 @@ # Make function visible to linker, and make additional symbol with # prepended underscore - s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; + if ($elf) { + s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/; + } else { + s/EXPORT\s+\|([\$\w]*)\|/.global $1/; + } s/IMPORT\s+\|([\$\w]*)\|/.global $1/; s/EXPORT\s+([\$\w]*)/.global $1/; @@ -181,11 +186,16 @@ # eabi_attributes numerical equivalents can be found in the # "ARM IHI 0045C" document. - # REQUIRE8 Stack is required to be 8-byte aligned - s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; + if ($elf) { + # REQUIRE8 Stack is required to be 8-byte aligned + s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g; - # PRESERVE8 Stack 8-byte align is preserved - s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + # PRESERVE8 Stack 8-byte align is preserved + s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g; + } else { + s/\sREQUIRE8//; + s/\sPRESERVE8//; + } # Use PROC and ENDP to give the symbols a .size directive. # This makes them show up properly in debugging tools like gdb and valgrind. @@ -202,7 +212,7 @@ my $proc; s/\bENDP\b/@ $&/; $proc = pop(@proc_stack); - $_ = "\t.size $proc, .-$proc".$_ if ($proc); + $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf); } # EQU directive @@ -225,4 +235,4 @@ } # Mark that this object doesn't need an executable stack. 
-printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n"); +printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf; diff --git a/libs/libvpx/build/make/ads2gas_apple.pl b/libs/libvpx/build/make/ads2gas_apple.pl index e1ae7b4f871..848872fa7d1 100755 --- a/libs/libvpx/build/make/ads2gas_apple.pl +++ b/libs/libvpx/build/make/ads2gas_apple.pl @@ -20,9 +20,7 @@ print "@ This file was created from a .asm file\n"; print "@ using the ads2gas_apple.pl script.\n\n"; -print "\t.set WIDE_REFERENCE, 0\n"; -print "\t.set ARCHITECTURE, 5\n"; -print "\t.set DO1STROUNDING, 0\n"; +print "\t.syntax unified\n"; my %register_aliases; my %macro_aliases; diff --git a/libs/libvpx/build/make/configure.sh b/libs/libvpx/build/make/configure.sh index 4bf61eb5eb8..45f97adacdb 100644 --- a/libs/libvpx/build/make/configure.sh +++ b/libs/libvpx/build/make/configure.sh @@ -319,6 +319,12 @@ check_ld() { && check_cmd ${LD} ${LDFLAGS} "$@" -o ${TMP_X} ${TMP_O} ${extralibs} } +check_lib() { + log check_lib "$@" + check_cc $@ \ + && check_cmd ${LD} ${LDFLAGS} -o ${TMP_X} ${TMP_O} "$@" ${extralibs} +} + check_header(){ log check_header "$@" header=$1 @@ -420,6 +426,26 @@ check_gcc_machine_options() { fi } +check_gcc_avx512_compiles() { + if disabled gcc; then + return + fi + + check_cc -mavx512f <<EOF +#include <immintrin.h> +void f(void) { + __m512i x = _mm512_set1_epi16(0); + (void)x; +} +EOF + compile_result=$? + if [ ${compile_result} -ne 0 ]; then + log_echo " disabling avx512: not supported by compiler" + disable_feature avx512 + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + fi +} + write_common_config_banner() { print_webm_license config.mk "##" "" echo '# This file automatically generated by configure. Do not edit!' >> config.mk @@ -481,6 +507,7 @@ AS_SFX = ${AS_SFX:-.asm} EXE_SFX = ${EXE_SFX} VCPROJ_SFX = ${VCPROJ_SFX} RTCD_OPTIONS = ${RTCD_OPTIONS} +LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS} EOF if enabled rvct; then cat >> $1 << EOF @@ -520,6 +547,24 @@ EOF cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" } +write_win_arm64_neon_h_workaround() { + print_webm_license ${TMP_H} "/*" " */" + cat >> ${TMP_H} << EOF +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_WIN_ARM_NEON_H_WORKAROUND +#define VPX_WIN_ARM_NEON_H_WORKAROUND +/* The Windows SDK has arm_neon.h, but unlike on other platforms it is + * ARM32-only. ARM64 NEON support is provided by arm64_neon.h, a proper + * superset of arm_neon.h. Work around this by providing a more local + * arm_neon.h that simply #includes arm64_neon.h. + */ +#include <arm64_neon.h> +#endif /* VPX_WIN_ARM_NEON_H_WORKAROUND */ +EOF + mkdir -p `dirname "$1"` + cmp "$1" ${TMP_H} >/dev/null 2>&1 || mv ${TMP_H} "$1" +} + process_common_cmdline() { for opt in "$@"; do optval="${opt#*=}" @@ -602,11 +647,7 @@ process_common_cmdline() { --libdir=*) libdir="${optval}" ;; - --sdk-path=*) - [ -d "${optval}" ] || die "Not a directory: ${optval}" - sdk_path="${optval}" - ;; - --libc|--as|--prefix|--libdir|--sdk-path) + --libc|--as|--prefix|--libdir) die "Option ${opt} requires argument" ;; --help|-h) @@ -713,11 +754,8 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; - power*64*-*) - tgt_isa=ppc64 - ;; - power*) - tgt_isa=ppc + power*64le*-*) + tgt_isa=ppc64le ;; *mips64el*) tgt_isa=mips64 @@ -837,7 +875,7 @@ process_common_toolchain() { IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi # Handle darwin variants.
Newer SDKs allow targeting older @@ -957,7 +995,6 @@ process_common_toolchain() { setup_gnu_toolchain arch_int=${tgt_isa##armv} arch_int=${arch_int%%te} - check_add_asflags --defsym ARCHITECTURE=${arch_int} tune_cflags="-mtune=" if [ ${tgt_isa} = "armv7" ] || [ ${tgt_isa} = "armv7s" ]; then if [ -z "${float_abi}" ]; then @@ -984,6 +1021,16 @@ EOF enabled debug && add_asflags -g asm_conversion_cmd="${source_path}/build/make/ads2gas.pl" + + case ${tgt_os} in + win*) + asm_conversion_cmd="$asm_conversion_cmd -noelf" + AS="$CC -c" + EXE_SFX=.exe + enable_feature thumb + ;; + esac + if enabled thumb; then asm_conversion_cmd="$asm_conversion_cmd -thumb" check_add_cflags -mthumb @@ -991,18 +1038,41 @@ EOF fi ;; vs*) - asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" - AS_SFX=.S - msvs_arch_dir=arm-msvs - disable_feature multithread - disable_feature unit_tests - vs_version=${tgt_cc##vs} - if [ $vs_version -ge 12 ]; then - # MSVC 2013 doesn't allow doing plain .exe projects for ARM, - # only "AppContainerApplication" which requires an AppxManifest. - # Therefore disable the examples, just build the library. - disable_feature examples - disable_feature tools + # A number of ARM-based Windows platforms are constrained by their + # respective SDKs' limitations. Fortunately, these are all 32-bit ABIs + # and so can be selected as 'win32'. + if [ ${tgt_os} = "win32" ]; then + asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl" + AS_SFX=.S + msvs_arch_dir=arm-msvs + disable_feature multithread + disable_feature unit_tests + if [ ${tgt_cc##vs} -ge 12 ]; then + # MSVC 2013 doesn't allow doing plain .exe projects for ARM32, + # only "AppContainerApplication" which requires an AppxManifest. + # Therefore disable the examples, just build the library. + disable_feature examples + disable_feature tools + fi + else + # Windows 10 on ARM, on the other hand, has full Windows SDK support + # for building Win32 ARM64 applications in addition to ARM64 + # Windows Store apps. It is the only 64-bit ARM ABI that + # Windows supports, so it is the default definition of 'win64'. + # ARM64 build support officially shipped in Visual Studio 15.9.0. + + # Because the ARM64 Windows SDK's arm_neon.h is ARM32-specific + # while LLVM's is not, probe its validity. + if enabled neon; then + if [ -n "${CC}" ]; then + check_header arm_neon.h || check_header arm64_neon.h && \ + enable_feature win_arm64_neon_h_workaround + else + # If a probe is not possible, assume this is the pure Windows + # SDK and so the workaround is necessary. 
+ enable_feature win_arm64_neon_h_workaround + fi + fi fi ;; rvct) @@ -1030,7 +1100,6 @@ EOF fi arch_int=${tgt_isa##armv} arch_int=${arch_int%%te} - check_add_asflags --pd "\"ARCHITECTURE SETA ${arch_int}\"" enabled debug && add_asflags -g add_cflags --gnu add_cflags --enum_is_int @@ -1045,51 +1114,10 @@ EOF ;; android*) - if [ -n "${sdk_path}" ]; then - SDK_PATH=${sdk_path} - COMPILER_LOCATION=`find "${SDK_PATH}" \ - -name "arm-linux-androideabi-gcc*" -print -quit` - TOOLCHAIN_PATH=${COMPILER_LOCATION%/*}/arm-linux-androideabi- - CC=${TOOLCHAIN_PATH}gcc - CXX=${TOOLCHAIN_PATH}g++ - AR=${TOOLCHAIN_PATH}ar - LD=${TOOLCHAIN_PATH}gcc - AS=${TOOLCHAIN_PATH}as - STRIP=${TOOLCHAIN_PATH}strip - NM=${TOOLCHAIN_PATH}nm - - if [ -z "${alt_libc}" ]; then - alt_libc=`find "${SDK_PATH}" -name arch-arm -print | \ - awk '{n = split($0,a,"/"); \ - split(a[n-1],b,"-"); \ - print $0 " " b[2]}' | \ - sort -g -k 2 | \ - awk '{ print $1 }' | tail -1` - fi - - if [ -d "${alt_libc}" ]; then - add_cflags "--sysroot=${alt_libc}" - add_ldflags "--sysroot=${alt_libc}" - fi - - # linker flag that routes around a CPU bug in some - # Cortex-A8 implementations (NDK Dev Guide) - add_ldflags "-Wl,--fix-cortex-a8" - - enable_feature pic - soft_enable realtime_only - if [ ${tgt_isa} = "armv7" ]; then - soft_enable runtime_cpu_detect - fi - if enabled runtime_cpu_detect; then - add_cflags "-I${SDK_PATH}/sources/android/cpufeatures" - fi - else - echo "Assuming standalone build with NDK toolchain." - echo "See build/make/Android.mk for details." - check_add_ldflags -static - soft_enable unit_tests - fi + echo "Assuming standalone build with NDK toolchain." + echo "See build/make/Android.mk for details." + check_add_ldflags -static + soft_enable unit_tests ;; darwin*) @@ -1204,6 +1232,11 @@ EOF esac if enabled msa; then + # TODO(libyuv:793) + # The new mips functions in libyuv do not build + # with the toolchains we currently use for testing. + soft_disable libyuv + add_cflags -mmsa add_asflags -mmsa add_ldflags -mmsa @@ -1219,13 +1252,25 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) + ppc64le*) link_with_cc=gcc setup_gnu_toolchain - check_gcc_machine_option "vsx" + # Do not enable vsx by default. + # https://bugs.chromium.org/p/webm/issues/detail?id=1522 + enabled vsx || RTCD_OPTIONS="${RTCD_OPTIONS}--disable-vsx " + if [ -n "${tune_cpu}" ]; then + case ${tune_cpu} in + power?) + tune_cflags="-mcpu=" + ;; + esac + fi ;; x86*) case ${tgt_os} in + android) + soft_enable realtime_only + ;; win*) enabled gcc && add_cflags -fno-common ;; @@ -1277,28 +1322,13 @@ EOF # Skip the check by setting AS arbitrarily AS=msvs msvs_arch_dir=x86-msvs - vc_version=${tgt_cc##vs} - case $vc_version in - 7|8|9|10|11|12|13|14) + case ${tgt_cc##vs} in + 14) echo "${tgt_cc} does not support avx512, disabling....." RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " soft_disable avx512 ;; esac - case $vc_version in - 7|8|9|10) - echo "${tgt_cc} does not support avx/avx2, disabling....." - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 " - soft_disable avx - soft_disable avx2 - ;; - esac - case $vc_version in - 7|8|9) - echo "${tgt_cc} omits stdint.h, disabling webm-io..." 
- soft_disable webm_io - ;; - esac ;; esac @@ -1331,16 +1361,12 @@ EOF else if [ "$ext" = "avx512" ]; then check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + check_gcc_avx512_compiles else # use the shortened version for the flag: sse4_1 -> sse4 check_gcc_machine_option ${ext%_*} $ext fi fi - - # https://bugs.chromium.org/p/webm/issues/detail?id=1464 - # The assembly optimizations for vpx_sub_pixel_variance do not link with - # gcc 6. - enabled sse2 && soft_enable pic done if enabled external_build; then @@ -1400,7 +1426,8 @@ EOF add_cflags ${sim_arch} add_ldflags ${sim_arch} - if [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then + if [ "$(disabled external_build)" ] && + [ "$(show_darwin_sdk_major_version iphonesimulator)" -gt 8 ]; then # yasm v1.3.0 doesn't know what -fembed-bitcode means, so turning it # on is pointless (unless building a C-only lib). Warn the user, but # do nothing here. @@ -1490,7 +1517,13 @@ EOF # bionic includes basic pthread functionality, obviating -lpthread. ;; *) - check_header pthread.h && add_extralibs -lpthread + check_header pthread.h && check_lib -lpthread <<EOF +#include <pthread.h> +void *thread_check (void *); +void *thread_check (void *obj){ return obj;}; +int main(void) { pthread_t thread_id; return pthread_create(&thread_id, NULL, thread_check, NULL); } +EOF ;; esac fi diff --git a/libs/libvpx/build/make/gen_msvs_vcxproj.sh b/libs/libvpx/build/make/gen_msvs_vcxproj.sh index 171d0b99b6e..84515ecff4e 100755 --- a/libs/libvpx/build/make/gen_msvs_vcxproj.sh +++ b/libs/libvpx/build/make/gen_msvs_vcxproj.sh @@ -261,6 +261,11 @@ case "$target" in asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;%(FullPath)&quot;" asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;%(FullPath)&quot;" ;; + arm64*) + platforms[0]="ARM64" + asm_Debug_cmdline="armasm64 -nologo -oldit &quot;%(FullPath)&quot;" + asm_Release_cmdline="armasm64 -nologo -oldit &quot;%(FullPath)&quot;" + ;; arm*) platforms[0]="ARM" asm_Debug_cmdline="armasm -nologo -oldit &quot;%(FullPath)&quot;" @@ -307,6 +312,16 @@ generate_vcxproj() { tag_content ApplicationType "Windows Store" tag_content ApplicationTypeRevision 8.1 fi + if [ "${platforms[0]}" = "ARM64" ]; then + # Require the first Visual Studio version to have ARM64 support. + tag_content MinimumVisualStudioVersion 15.9 + fi + if [ $vs_ver -eq 15 ] && [ "${platforms[0]}" = "ARM64" ]; then + # Since VS 15 does not have a 'use latest SDK version' facility, + # specifically require the contemporaneous SDK with official ARM64 + # support. + tag_content WindowsTargetPlatformVersion 10.0.17763.0 + fi close_tag PropertyGroup tag Import \ diff --git a/libs/libvpx/build/make/iosbuild.sh b/libs/libvpx/build/make/iosbuild.sh index 3211d4f5efe..e1633a89a82 100755 --- a/libs/libvpx/build/make/iosbuild.sh +++ b/libs/libvpx/build/make/iosbuild.sh @@ -132,7 +132,8 @@ create_vpx_framework_config_shim() { done # Consume the last line of output from the loop: We don't want it. - sed -i '' -e '$d' "${config_file}" + sed -i.bak -e '$d' "${config_file}" + rm "${config_file}.bak" printf "#endif\n\n" >> "${config_file}" printf "#endif // ${include_guard}" >> "${config_file}" @@ -244,7 +245,7 @@ build_framework() { # Trap function. Cleans up the subtree used to build all targets contained in # $TARGETS. cleanup() { - local readonly res=$? + local res=$?
cd "${ORIG_PWD}" if [ $res -ne 0 ]; then @@ -350,7 +351,7 @@ if [ "$ENABLE_SHARED" = "yes" ]; then IOS_VERSION_MIN="8.0" else IOS_VERSION_OPTIONS="" - IOS_VERSION_MIN="6.0" + IOS_VERSION_MIN="7.0" fi if [ "${VERBOSE}" = "yes" ]; then diff --git a/libs/libvpx/build/make/msvs_common.sh b/libs/libvpx/build/make/msvs_common.sh index 88f1cf9b570..27ddf7fd91d 100644 --- a/libs/libvpx/build/make/msvs_common.sh +++ b/libs/libvpx/build/make/msvs_common.sh @@ -41,6 +41,15 @@ fix_path() { # Corrects the paths in file_list in one pass for efficiency. # $1 is the name of the array to be modified. fix_file_list() { + if [ "${FIXPATH}" = "echo_path" ] ; then + # When used with echo_path, fix_file_list is a no-op. Avoid warning about + # unsupported 'declare -n' when it is not important. + return 0 + elif [ "${BASH_VERSINFO}" -lt 4 ] ; then + echo "Cygwin path conversion has failed. Please use a version of bash" + echo "which supports nameref (-n), introduced in bash 4.3" + return 1 + fi declare -n array_ref=$1 files=$(fix_path "${array_ref[@]}") local IFS=$'\n' diff --git a/libs/libvpx/build/make/rtcd.pl b/libs/libvpx/build/make/rtcd.pl index 68e92b52cc5..7483200411a 100755 --- a/libs/libvpx/build/make/rtcd.pl +++ b/libs/libvpx/build/make/rtcd.pl @@ -400,12 +400,13 @@ () # &require("c"); +&require(keys %required); if ($opts{arch} eq 'x86') { @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); x86; } elsif ($opts{arch} eq 'x86_64') { @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); - @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/); + @REQUIRES = filter(qw/mmx sse sse2/); &require(@REQUIRES); x86; } elsif ($opts{arch} eq 'mips32' || $opts{arch} eq 'mips64') { @@ -433,6 +434,7 @@ () arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { @ALL_ARCHS = filter(qw/neon/); + &require("neon"); arm; } elsif ($opts{arch} =~ /^ppc/ ) { @ALL_ARCHS = filter(qw/vsx/); diff --git a/libs/libvpx/build/make/thumb.pm b/libs/libvpx/build/make/thumb.pm index 483c2539c68..9c49e2d8b72 100644 --- a/libs/libvpx/build/make/thumb.pm +++ b/libs/libvpx/build/make/thumb.pm @@ -54,13 +54,6 @@ sub FixThumbInstructions($$) # "addne r0, r0, r2". s/^(\s*)((ldr|str)(ne)?[bhd]?)(\s+)(\w+),(\s*\w+,)?\s*\[(\w+)\],\s*(\w+)/$1$2$5$6,$7 [$8]\n$1add$4$5$8, $8, $9/g; - # Convert a conditional addition to the pc register into a series of - # instructions. This converts "addlt pc, pc, r3, lsl #2" into - # "itttt lt", "movlt.n r12, pc", "addlt.w r12, #12", - # "addlt.w r12, r12, r3, lsl #2", "movlt.n pc, r12". - # This assumes that r12 is free at this point. - s/^(\s*)addlt(\s+)pc,\s*pc,\s*(\w+),\s*lsl\s*#(\d+)/$1itttt$2lt\n$1movlt.n$2r12, pc\n$1addlt.w$2r12, #12\n$1addlt.w$2r12, r12, $3, lsl #($4-$branch_shift_offset)\n$1movlt.n$2pc, r12/g; - # Convert "mov pc, lr" into "bx lr", since the former only works # for switching from arm to thumb (and only in armv7), but not # from thumb to arm. diff --git a/libs/libvpx/codereview.settings b/libs/libvpx/codereview.settings index 34c6f1d9dea..ccba2eeed24 100644 --- a/libs/libvpx/codereview.settings +++ b/libs/libvpx/codereview.settings @@ -1,5 +1,4 @@ -# This file is used by gcl to get repository specific information. -GERRIT_HOST: chromium-review.googlesource.com -GERRIT_PORT: 29418 +# This file is used by git cl to get repository specific information. 
+GERRIT_HOST: True CODE_REVIEW_SERVER: chromium-review.googlesource.com GERRIT_SQUASH_UPLOADS: False diff --git a/libs/libvpx/configure b/libs/libvpx/configure index c84c891c0b6..fb4d7b4ba8c 100755 --- a/libs/libvpx/configure +++ b/libs/libvpx/configure @@ -31,7 +31,6 @@ Advanced options: --libc=PATH path to alternate libc --size-limit=WxH max size to allow in the decoder --as={yasm|nasm|auto} use specified assembler [auto, yasm preferred] - --sdk-path=PATH path to root of sdk (android builds only) ${toggle_codec_srcs} in/exclude codec library source code ${toggle_debug_libs} in/exclude debug version of libraries ${toggle_static_msvcrt} use static MSVCRT (VS builds only) @@ -101,20 +100,20 @@ EOF all_platforms="${all_platforms} arm64-android-gcc" all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-linux-gcc" +all_platforms="${all_platforms} arm64-win64-gcc" +all_platforms="${all_platforms} arm64-win64-vs15" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8 -all_platforms="${all_platforms} armv7-win32-vs11" -all_platforms="${all_platforms} armv7-win32-vs12" +all_platforms="${all_platforms} armv7-win32-gcc" all_platforms="${all_platforms} armv7-win32-vs14" all_platforms="${all_platforms} armv7-win32-vs15" all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" @@ -137,9 +136,6 @@ all_platforms="${all_platforms} x86-linux-icc" all_platforms="${all_platforms} x86-os2-gcc" all_platforms="${all_platforms} x86-solaris-gcc" all_platforms="${all_platforms} x86-win32-gcc" -all_platforms="${all_platforms} x86-win32-vs10" -all_platforms="${all_platforms} x86-win32-vs11" -all_platforms="${all_platforms} x86-win32-vs12" all_platforms="${all_platforms} x86-win32-vs14" all_platforms="${all_platforms} x86-win32-vs15" all_platforms="${all_platforms} x86_64-android-gcc" @@ -159,9 +155,6 @@ all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" all_platforms="${all_platforms} x86_64-solaris-gcc" all_platforms="${all_platforms} x86_64-win64-gcc" -all_platforms="${all_platforms} x86_64-win64-vs10" -all_platforms="${all_platforms} x86_64-win64-vs11" -all_platforms="${all_platforms} x86_64-win64-vs12" all_platforms="${all_platforms} x86_64-win64-vs14" all_platforms="${all_platforms} x86_64-win64-vs15" all_platforms="${all_platforms} generic-gnu" @@ -278,9 +271,9 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" - spatial_svc fp_mb_stats emulate_hardware + non_greedy_mv " CONFIG_LIST=" dependency_tracking @@ -330,12 +323,15 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility experimental size_limit always_adjust_bpm + bitstream_debug + mismatch_debug ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -391,11 +387,14 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising + consistent_recode 
coefficient_range_checking better_hw_compatibility vp9_highbitdepth experimental always_adjust_bpm + bitstream_debug + mismatch_debug " process_cmdline() { @@ -426,6 +425,12 @@ process_cmdline() { } post_process_cmdline() { + if enabled coefficient_range_checking; then + echo "coefficient-range-checking is for decoders only, disabling encoders:" + soft_disable vp8_encoder + soft_disable vp9_encoder + fi + c="" # Enable all detected codecs, if they haven't been disabled @@ -447,6 +452,7 @@ process_targets() { enabled child || write_common_config_banner write_common_target_config_h ${BUILD_PFX}vpx_config.h write_common_config_targets + enabled win_arm64_neon_h_workaround && write_win_arm64_neon_h_workaround ${BUILD_PFX}arm_neon.h # Calculate the default distribution name, based on the enabled features cf="" @@ -523,7 +529,7 @@ process_detect() { # here rather than at option parse time because the target auto-detect # magic happens after the command line has been parsed. case "${tgt_os}" in - linux|os2|darwin*|iphonesimulator*) + linux|os2|solaris|darwin*|iphonesimulator*) # Supported platforms ;; *) @@ -575,16 +581,32 @@ process_detect() { check_ld() { true } + check_lib() { + true + } fi check_header stdio.h || die "Unable to invoke compiler: ${CC} ${CFLAGS}" check_ld <<EOF +#include <pthread.h> +#include <stdio.h> +void *thread_check (void *); +void *thread_check (void *obj){ return obj;}; +int main(void) { pthread_t thread_id; return pthread_create(&thread_id, NULL, thread_check, NULL); } +EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -603,22 +625,39 @@ process_toolchain() { check_add_cflags -Wcast-qual check_add_cflags -Wvla check_add_cflags -Wimplicit-function-declaration + check_add_cflags -Wmissing-declarations + check_add_cflags -Wmissing-prototypes check_add_cflags -Wuninitialized check_add_cflags -Wunused - # -Wextra has some tricky cases. Rather than fix them all now, get the - # flag for as many files as possible and fix the remaining issues - # piecemeal. - # https://bugs.chromium.org/p/webm/issues/detail?id=1069 check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with - # -Wundef so add it explicitly to CFLAGS only. + # these flags so add them explicitly to CFLAGS only. check_cflags -Wundef && add_cflags_only -Wundef + check_cflags -Wframe-larger-than=52000 && \ + add_cflags_only -Wframe-larger-than=52000 if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi + # Enforce c89 for c files. Don't be too strict about it though. Allow + gnu extensions like "//" for comments. + check_cflags -std=gnu89 && add_cflags_only -std=gnu89 # Avoid this warning for third_party C++ sources. Some reorganization # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 + + # Quiet gcc 6 vs 7 abi warnings: + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77728 + if enabled arm; then + check_add_cxxflags -Wno-psabi + fi + + # disable some warnings specific to libyuv.
+ check_cxxflags -Wno-missing-declarations \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" + check_cxxflags -Wno-missing-prototypes \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" + check_cxxflags -Wno-unused-parameter \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi if enabled icc; then @@ -689,7 +728,7 @@ process_toolchain() { soft_enable libyuv ;; *-android-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv # GTestLog must be modified to use Android logging utilities. ;; @@ -698,30 +737,23 @@ process_toolchain() { # x86 targets. ;; *-iphonesimulator-*) - soft_enable webm_io + check_add_cxxflags -std=c++11 && soft_enable webm_io soft_enable libyuv ;; *-win*) # Some mingw toolchains don't have pthread available by default. # Treat these more like visual studio where threading in gtest # would be disabled for the same reason. - check_cxx "$@" < $@ + @echo "ENABLED_SECTIONS += samples" >> $@ diff --git a/libs/libvpx/vpx/svc_context.h b/libs/libvpx/examples/svc_context.h similarity index 83% rename from libs/libvpx/vpx/svc_context.h rename to libs/libvpx/examples/svc_context.h index 462785075cb..c5779ce8a9f 100644 --- a/libs/libvpx/vpx/svc_context.h +++ b/libs/libvpx/examples/svc_context.h @@ -13,11 +13,11 @@ * spatial SVC frame */ -#ifndef VPX_SVC_CONTEXT_H_ -#define VPX_SVC_CONTEXT_H_ +#ifndef VPX_EXAMPLES_SVC_CONTEXT_H_ +#define VPX_EXAMPLES_SVC_CONTEXT_H_ -#include "./vp8cx.h" -#include "./vpx_encoder.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_encoder.h" #ifdef __cplusplus extern "C" { @@ -35,10 +35,8 @@ typedef struct { int temporal_layers; // number of temporal layers int temporal_layering_mode; SVC_LOG_LEVEL log_level; // amount of information to display - int log_print; // when set, printf log messages instead of returning the - // message with svc_get_message - int output_rc_stat; // for outputting rc stats - int speed; // speed setting for codec + int output_rc_stat; // for outputting rc stats + int speed; // speed setting for codec int threads; int aqmode; // turns on aq-mode=3 (cyclic_refresh): 0=off, 1=on. 
// private storage for vpx_svc_encode @@ -71,7 +69,6 @@ typedef struct SvcInternal { int layer; int use_multiple_frame_contexts; - char message_buffer[2048]; vpx_codec_ctx_t *codec_ctx; } SvcInternal_t; @@ -106,15 +103,10 @@ void vpx_svc_release(SvcContext *svc_ctx); /** * dump accumulated statistics and reset accumulated values */ -const char *vpx_svc_dump_statistics(SvcContext *svc_ctx); - -/** - * get status message from previous encode - */ -const char *vpx_svc_get_message(const SvcContext *svc_ctx); +void vpx_svc_dump_statistics(SvcContext *svc_ctx); #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_SVC_CONTEXT_H_ +#endif // VPX_EXAMPLES_SVC_CONTEXT_H_ diff --git a/libs/libvpx/vpx/src/svc_encodeframe.c b/libs/libvpx/examples/svc_encodeframe.c similarity index 85% rename from libs/libvpx/vpx/src/svc_encodeframe.c rename to libs/libvpx/examples/svc_encodeframe.c index f633600c799..a73ee8ed660 100644 --- a/libs/libvpx/vpx/src/svc_encodeframe.c +++ b/libs/libvpx/examples/svc_encodeframe.c @@ -22,7 +22,7 @@ #include <string.h> #define VPX_DISABLE_CTRL_TYPECHECKS 1 #include "./vpx_config.h" -#include "vpx/svc_context.h" +#include "./svc_context.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "vpx_mem/vpx_mem.h" @@ -95,17 +95,11 @@ static const SvcInternal_t *get_const_svc_internal(const SvcContext *svc_ctx) { return (const SvcInternal_t *)svc_ctx->internal; } -static void svc_log_reset(SvcContext *svc_ctx) { - SvcInternal_t *const si = (SvcInternal_t *)svc_ctx->internal; - si->message_buffer[0] = '\0'; -} - static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, ...) { char buf[512]; int retval = 0; va_list ap; - SvcInternal_t *const si = get_svc_internal(svc_ctx); if (level > svc_ctx->log_level) { return retval; @@ -115,16 +109,8 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, retval = vsnprintf(buf, sizeof(buf), fmt, ap); va_end(ap); - if (svc_ctx->log_print) { - printf("%s", buf); - } else { - strncat(si->message_buffer, buf, - sizeof(si->message_buffer) - strlen(si->message_buffer) - 1); - } + printf("%s", buf); - if (level == SVC_LOG_ERROR) { - si->codec_ctx->err_detail = si->message_buffer; - } return retval; } @@ -169,6 +155,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, return VPX_CODEC_INVALID_PARAM; input_string = strdup(input); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; token = strtok_r(input_string, delim, &save_ptr); for (i = 0; i < num_layers; ++i) { if (token != NULL) { @@ -208,6 +195,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { if (options == NULL) return VPX_CODEC_OK; input_string = strdup(options); + if (input_string == NULL) return VPX_CODEC_MEM_ERROR; // parse option name option_name = strtok_r(input_string, "=", &input_ptr); @@ -294,8 +282,8 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +static vpx_codec_err_t assign_layer_bitrates( + const SvcContext *svc_ctx, vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; @@ -471,8 +459,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, svc_log(svc_ctx, SVC_LOG_ERROR, "spatial layers * temporal layers exceeds the maximum number of " "allowed layers of %d\n", - svc_ctx->spatial_layers *
svc_ctx->temporal_layers, - (int)VPX_MAX_LAYERS); + svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } res = assign_layer_bitrates(svc_ctx, enc_cfg); @@ -485,11 +472,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return VPX_CODEC_INVALID_PARAM; } -#if CONFIG_SPATIAL_SVC - for (i = 0; i < svc_ctx->spatial_layers; ++i) - enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i]; -#endif - if (svc_ctx->temporal_layers > 1) { int i; for (i = 0; i < svc_ctx->temporal_layers; ++i) { @@ -514,7 +496,17 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, enc_cfg->rc_buf_initial_sz = 500; enc_cfg->rc_buf_optimal_sz = 600; enc_cfg->rc_buf_sz = 1000; - enc_cfg->rc_dropframe_thresh = 0; + } + + for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + i = sl * svc_ctx->temporal_layers + tl; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer; + si->svc_params.min_quantizers[i] = enc_cfg->rc_min_quantizer; + } + } } if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0) @@ -548,8 +540,6 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return VPX_CODEC_INVALID_PARAM; } - svc_log_reset(svc_ctx); - res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration, 0, deadline); if (res != VPX_CODEC_OK) { @@ -559,56 +549,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { - int i; - for (i = 0; i < svc_ctx->spatial_layers; ++i) { - int j; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].psnr[0], - cx_pkt->data.layer_psnr[i].psnr[1], - cx_pkt->data.layer_psnr[i].psnr[2], - cx_pkt->data.layer_psnr[i].psnr[3]); - svc_log(svc_ctx, SVC_LOG_DEBUG, - "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, i, cx_pkt->data.layer_psnr[i].sse[0], - cx_pkt->data.layer_psnr[i].sse[1], - cx_pkt->data.layer_psnr[i].sse[2], - cx_pkt->data.layer_psnr[i].sse[3]); - - for (j = 0; j < COMPONENTS; ++j) { - si->psnr_sum[i][j] += cx_pkt->data.layer_psnr[i].psnr[j]; - si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j]; - } - } - ++si->psnr_pkt_received; - break; - } - case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: { - int i; - for (i = 0; i < svc_ctx->spatial_layers; ++i) - si->bytes_sum[i] += cx_pkt->data.layer_sizes[i]; - break; - } -#endif case VPX_CODEC_PSNR_PKT: { -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - int j; - svc_log(svc_ctx, SVC_LOG_DEBUG, - "frame: %d, layer: %d, PSNR(Total/Y/U/V): " - "%2.3f %2.3f %2.3f %2.3f \n", - si->psnr_pkt_received, 0, cx_pkt->data.layer_psnr[0].psnr[0], - cx_pkt->data.layer_psnr[0].psnr[1], - cx_pkt->data.layer_psnr[0].psnr[2], - cx_pkt->data.layer_psnr[0].psnr[3]); - for (j = 0; j < COMPONENTS; ++j) { - si->psnr_sum[0][j] += cx_pkt->data.layer_psnr[0].psnr[j]; - si->sse_sum[0][j] += cx_pkt->data.layer_psnr[0].sse[j]; - } -#endif } ++si->psnr_pkt_received; break; @@ -619,19 +560,13 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, return 
VPX_CODEC_OK; } -const char *vpx_svc_get_message(const SvcContext *svc_ctx) { - const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - return si->message_buffer; -} - static double calc_psnr(double d) { if (d == 0) return 100; return -10.0 * log(d) / log(10.0); } // dump accumulated statistics and reset accumulated values -const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { +void vpx_svc_dump_statistics(SvcContext *svc_ctx) { int number_of_frames; int i, j; uint32_t bytes_total = 0; @@ -641,21 +576,19 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { double y_scale; SvcInternal_t *const si = get_svc_internal(svc_ctx); - if (svc_ctx == NULL || si == NULL) return NULL; - - svc_log_reset(svc_ctx); + if (svc_ctx == NULL || si == NULL) return; number_of_frames = si->psnr_pkt_received; - if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx); + if (number_of_frames <= 0) return; svc_log(svc_ctx, SVC_LOG_INFO, "\n"); for (i = 0; i < svc_ctx->spatial_layers; ++i) { svc_log(svc_ctx, SVC_LOG_INFO, "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n", - i, (double)si->psnr_sum[i][0] / number_of_frames, - (double)si->psnr_sum[i][1] / number_of_frames, - (double)si->psnr_sum[i][2] / number_of_frames, - (double)si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); + i, si->psnr_sum[i][0] / number_of_frames, + si->psnr_sum[i][1] / number_of_frames, + si->psnr_sum[i][2] / number_of_frames, + si->psnr_sum[i][3] / number_of_frames, si->bytes_sum[i]); // the following psnr calculation is deduced from ffmpeg.c#print_report y_scale = si->width * si->height * 255.0 * 255.0 * number_of_frames; scale[1] = y_scale; @@ -686,7 +619,6 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) { si->psnr_pkt_received = 0; svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total); - return vpx_svc_get_message(svc_ctx); } void vpx_svc_release(SvcContext *svc_ctx) { diff --git a/libs/libvpx/examples/vp8_multi_resolution_encoder.c b/libs/libvpx/examples/vp8_multi_resolution_encoder.c index b14b1ff3972..e72f8a01970 100644 --- a/libs/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/libs/libvpx/examples/vp8_multi_resolution_encoder.c @@ -61,7 +61,7 @@ void usage_exit(void) { exit(EXIT_FAILURE); } int (*read_frame_p)(FILE *f, vpx_image_t *img); -static int read_frame(FILE *f, vpx_image_t *img) { +static int mulres_read_frame(FILE *f, vpx_image_t *img) { size_t nbytes, to_read; int res = 1; @@ -75,7 +75,7 @@ static int read_frame(FILE *f, vpx_image_t *img) { return res; } -static int read_frame_by_row(FILE *f, vpx_image_t *img) { +static int mulres_read_frame_by_row(FILE *f, vpx_image_t *img) { size_t nbytes, to_read; int res = 1; int plane; @@ -471,9 +471,9 @@ int main(int argc, char **argv) { die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) - read_frame_p = read_frame; + read_frame_p = mulres_read_frame; else - read_frame_p = read_frame_by_row; + read_frame_p = mulres_read_frame_by_row; for (i = 0; i < NUM_ENCODERS; i++) if (outfile[i]) write_ivf_file_header(outfile[i], &cfg[i], 0); diff --git a/libs/libvpx/examples/vp9_spatial_svc_encoder.c b/libs/libvpx/examples/vp9_spatial_svc_encoder.c index 0987cbfb858..b987989a86c 100644 --- a/libs/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libs/libvpx/examples/vp9_spatial_svc_encoder.c @@ -25,13 +25,19 @@ #include "../video_writer.h" #include "../vpx_ports/vpx_timer.h" -#include "vpx/svc_context.h" 
+#include "./svc_context.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" #include "../vpxstats.h" #include "vp9/encoder/vp9_encoder.h" +#include "./y4minput.h" + #define OUTPUT_RC_STATS 1 +#define SIMULCAST_MODE 0 + +static const arg_def_t outputfile = + ARG_DEF("o", "output", 1, "Output filename"); static const arg_def_t skip_frames_arg = ARG_DEF("s", "skip-frames", 1, "input frames to skip"); static const arg_def_t frames_arg = @@ -86,6 +92,19 @@ static const arg_def_t aqmode_arg = ARG_DEF("aq", "aqmode", 1, "aq-mode off/on"); static const arg_def_t bitrates_arg = ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]"); +static const arg_def_t dropframe_thresh_arg = + ARG_DEF(NULL, "drop-frame", 1, "Temporal resampling threshold (buf %)"); +static const struct arg_enum_list tune_content_enum[] = { + { "default", VP9E_CONTENT_DEFAULT }, + { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, + { NULL, 0 } +}; + +static const arg_def_t tune_content_arg = ARG_DEF_ENUM( + NULL, "tune-content", 1, "Tune content type", tune_content_enum); +static const arg_def_t inter_layer_pred_arg = ARG_DEF( + NULL, "inter-layer-pred", 1, "0 - 3: On, Off, Key-frames, Constrained"); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -97,6 +116,7 @@ static const arg_def_t bitdepth_arg = ARG_DEF_ENUM( #endif // CONFIG_VP9_HIGHBITDEPTH static const arg_def_t *svc_args[] = { &frames_arg, + &outputfile, &width_arg, &height_arg, &timebase_arg, @@ -127,6 +147,9 @@ static const arg_def_t *svc_args[] = { &frames_arg, &speed_arg, &rc_end_usage_arg, &bitrates_arg, + &dropframe_thresh_arg, + &tune_content_arg, + &inter_layer_pred_arg, NULL }; static const uint32_t default_frames_to_skip = 0; @@ -145,7 +168,6 @@ static const int32_t default_speed = -1; // -1 means use library default. static const uint32_t default_threads = 0; // zero means use library default. 
typedef struct { - const char *input_filename; const char *output_filename; uint32_t frames_to_code; uint32_t frames_to_skip; @@ -153,12 +175,14 @@ typedef struct { stats_io_t rc_stats; int passes; int pass; + int tune_content; + int inter_layer_pred; } AppInput; static const char *exec_name; void usage_exit(void) { - fprintf(stderr, "Usage: %s input_filename output_filename\n", + fprintf(stderr, "Usage: %s input_filename -o output_filename\n", exec_name); fprintf(stderr, "Options:\n"); arg_show_usage(stderr, svc_args); @@ -217,6 +241,8 @@ static void parse_command_line(int argc, const char **argv_, if (arg_match(&arg, &frames_arg, argi)) { app_input->frames_to_code = arg_parse_uint(&arg); + } else if (arg_match(&arg, &outputfile, argi)) { + app_input->output_filename = arg.val; } else if (arg_match(&arg, &width_arg, argi)) { enc_cfg->g_w = arg_parse_uint(&arg); } else if (arg_match(&arg, &height_arg, argi)) { @@ -237,6 +263,9 @@ static void parse_command_line(int argc, const char **argv_, #endif } else if (arg_match(&arg, &speed_arg, argi)) { svc_ctx->speed = arg_parse_uint(&arg); + if (svc_ctx->speed > 9) { + warn("Mapping speed %d to speed 9.\n", svc_ctx->speed); + } } else if (arg_match(&arg, &aqmode_arg, argi)) { svc_ctx->aqmode = arg_parse_uint(&arg); } else if (arg_match(&arg, &threads_arg, argi)) { @@ -251,11 +280,15 @@ static void parse_command_line(int argc, const char **argv_, enc_cfg->kf_min_dist = arg_parse_uint(&arg); enc_cfg->kf_max_dist = enc_cfg->kf_min_dist; } else if (arg_match(&arg, &scale_factors_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s scale-factors=%s", - string_options, arg.val); + strncat(string_options, " scale-factors=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &bitrates_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s bitrates=%s", - string_options, arg.val); + strncat(string_options, " bitrates=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &passes_arg, argi)) { passes = arg_parse_uint(&arg); if (passes < 1 || passes > 2) { @@ -269,11 +302,15 @@ static void parse_command_line(int argc, const char **argv_, } else if (arg_match(&arg, &fpf_name_arg, argi)) { fpf_file_name = arg.val; } else if (arg_match(&arg, &min_q_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s min-quantizers=%s", - string_options, arg.val); + strncat(string_options, " min-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &max_q_arg, argi)) { - snprintf(string_options, sizeof(string_options), "%s max-quantizers=%s", - string_options, arg.val); + strncat(string_options, " max-quantizers=", + sizeof(string_options) - strlen(string_options) - 1); + strncat(string_options, arg.val, + sizeof(string_options) - strlen(string_options) - 1); } else if (arg_match(&arg, &min_bitrate_arg, argi)) { min_bitrate = arg_parse_uint(&arg); } else if (arg_match(&arg, &max_bitrate_arg, argi)) { @@ -303,6 +340,12 @@ static void parse_command_line(int argc, const char **argv_, break; } #endif // CONFIG_VP9_HIGHBITDEPTH + } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) { + enc_cfg->rc_dropframe_thresh = arg_parse_uint(&arg); + } else if 
(arg_match(&arg, &tune_content_arg, argi)) { + app_input->tune_content = arg_parse_uint(&arg); + } else if (arg_match(&arg, &inter_layer_pred_arg, argi)) { + app_input->inter_layer_pred = arg_parse_uint(&arg); } else { ++argj; } @@ -358,13 +401,18 @@ static void parse_command_line(int argc, const char **argv_, if (argi[0][0] == '-' && strlen(argi[0]) > 1) die("Error: Unrecognized option %s\n", *argi); - if (argv[0] == NULL || argv[1] == 0) { + if (argv[0] == NULL) { usage_exit(); } - app_input->input_filename = argv[0]; - app_input->output_filename = argv[1]; + app_input->input_ctx.filename = argv[0]; free(argv); + open_input_file(&app_input->input_ctx); + if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) { + enc_cfg->g_w = app_input->input_ctx.width; + enc_cfg->g_h = app_input->input_ctx.height; + } + if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 || enc_cfg->g_h % 2) die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h); @@ -429,8 +477,9 @@ static void set_rate_control_stats(struct RateControlStats *rc, rc->layer_framerate[layer] = framerate / cfg->ts_rate_decimator[tl]; if (tl > 0) { rc->layer_pfb[layer] = - 1000.0 * (cfg->layer_target_bitrate[layer] - - cfg->layer_target_bitrate[layer - 1]) / + 1000.0 * + (cfg->layer_target_bitrate[layer] - + cfg->layer_target_bitrate[layer - 1]) / (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]); } else { rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] / @@ -502,14 +551,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc, printf("Average, rms-variance, and percent-fluct: %f %f %f \n", rc->avg_st_encoding_bitrate, sqrt(rc->variance_st_encoding_bitrate), perc_fluctuation); - if (frame_cnt != tot_num_frames) - die("Error: Number of input frames not equal to output encoded frames != " - "%d tot_num_frames = %d\n", - frame_cnt, tot_num_frames); + printf("Num of input, num of encoded (super) frames: %d %d \n", frame_cnt, + tot_num_frames); } -vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint64_t sizes[8], int *count) { +static vpx_codec_err_t parse_superframe_index(const uint8_t *data, + size_t data_sz, uint64_t sizes[8], + int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. If we have the marker but @@ -561,106 +609,386 @@ vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, // bypass/flexible mode. The pattern corresponds to the pattern // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in // non-flexible mode. -void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, - int is_key_frame, - vpx_svc_ref_frame_config_t *ref_frame_config) { +static void set_frame_flags_bypass_mode_ex0( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + for (sl = 0; sl < num_spatial_layers; ++sl) { + // Set the buffer idx. 
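+ // (Slot layout, inferred from the assignments below: TL0 frames keep
+ // last = sl and golden = sl - 1 in the low half of the buffer slots,
+ // while TL1 frames point golden and alt_ref at the upper half via
+ // num_spatial_layers + sl - 1 and num_spatial_layers + sl.)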
+ if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; + ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } + // Set the reference and update flags. if (!tl) { if (!sl) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | - VP8_EFLAG_NO_UPD_ARF; + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; } else { if (is_key_frame) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_ARF | - VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; } else { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + // Non-zero spatial layer. + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 1; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; } } } else if (tl == 1) { if (!sl) { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | - VP8_EFLAG_NO_UPD_GF; + // Base spatial and top temporal (tl = 1) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; } else { - ref_frame_config->frame_flags[sl] = - VP8_EFLAG_NO_REF_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF; + // Non-zero spatial layer. + if (sl < num_spatial_layers - 1) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else if (sl == num_spatial_layers - 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 1; + ref_frame_config->reference_alt_ref[sl] = 0; + } } } - if (tl == 0) { - ref_frame_config->lst_fb_idx[sl] = sl; - if (sl) - ref_frame_config->gld_fb_idx[sl] = sl - 1; - else - ref_frame_config->gld_fb_idx[sl] = 0; - ref_frame_config->alt_fb_idx[sl] = 0; - } else if (tl == 1) { - ref_frame_config->lst_fb_idx[sl] = sl; - ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1; - ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl; + } +} + +// Example pattern for 2 spatial layers and 2 temporal layers used in the +// bypass/flexible mode, except only 1 spatial layer when temporal_layer_id = 1.
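+// In this pattern even superframes encode both spatial layers at TL0, and
+// odd superframes encode only spatial layer 1 at TL1 (see the
+// example_pattern == 1 branch in main()).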
+static void set_frame_flags_bypass_mode_ex1( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + int sl; + for (sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + if (tl == 0) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[1] = 0; + ref_frame_config->gld_fb_idx[1] = 1; + } else { + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 0; + } + ref_frame_config->alt_fb_idx[1] = 0; + + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 0; + ref_frame_config->alt_fb_idx[0] = 0; + } + if (tl == 1) { + ref_frame_config->lst_fb_idx[0] = 0; + ref_frame_config->gld_fb_idx[0] = 1; + ref_frame_config->alt_fb_idx[0] = 2; + + ref_frame_config->lst_fb_idx[1] = 1; + ref_frame_config->gld_fb_idx[1] = 2; + ref_frame_config->alt_fb_idx[1] = 3; + } + // Set the reference and update flags. + if (tl == 0) { + // Base spatial and base temporal (sl = 0, tl = 0) + ref_frame_config->reference_last[0] = 1; + ref_frame_config->reference_golden[0] = 0; + ref_frame_config->reference_alt_ref[0] = 0; + ref_frame_config->update_buffer_slot[0] |= + 1 << ref_frame_config->lst_fb_idx[0]; + + if (is_key_frame) { + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->gld_fb_idx[1]; + } else { + // Non-zero spatial layer. + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 1; + ref_frame_config->reference_alt_ref[1] = 1; + ref_frame_config->update_buffer_slot[1] |= + 1 << ref_frame_config->lst_fb_idx[1]; + } + } + if (tl == 1) { + // Top spatial and top temporal (non-reference -- doesn't update any + // reference buffers) + ref_frame_config->reference_last[1] = 1; + ref_frame_config->reference_golden[1] = 0; + ref_frame_config->reference_alt_ref[1] = 0; + } +} + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE +static void test_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, + const int frames_out, int *mismatch_seen) { + vpx_image_t enc_img, dec_img; + struct vp9_ref_frame ref_enc, ref_dec; + if (*mismatch_seen) return; + /* Get the internal reference frame */ + ref_enc.idx = 0; + ref_dec.idx = 0; + vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc); + enc_img = ref_enc.img; + vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec); + dec_img = ref_dec.img; +#if CONFIG_VP9_HIGHBITDEPTH + if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) != + (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) { + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + enc_img.d_w, enc_img.d_h, 16); + vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img); + } + if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH, + dec_img.d_w, dec_img.d_h, 16); + vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img); + } + } +#endif + + if (!compare_img(&enc_img, &dec_img)) { + int y[4], u[4], v[4]; +#if CONFIG_VP9_HIGHBITDEPTH + if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + find_mismatch_high(&enc_img, &dec_img, y, u, v); + } else { + find_mismatch(&enc_img, &dec_img, y, u, v); + } +#else + find_mismatch(&enc_img, &dec_img, y, u, v); +#endif + decoder->err = 1; + printf( + "Encode/decode mismatch on frame %d at" + " Y[%d, %d] {%d/%d}," + " U[%d, %d] {%d/%d}," + " V[%d, %d] {%d/%d}\n", + frames_out, y[0], y[1], y[2], y[3], u[0], u[1],
u[2], u[3], v[0], v[1], + v[2], v[3]); + *mismatch_seen = frames_out; + } + + vpx_img_free(&enc_img); + vpx_img_free(&dec_img); +} +#endif + +#if OUTPUT_RC_STATS +static void svc_output_rc_stats( + vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *enc_cfg, + vpx_svc_layer_id_t *layer_id, const vpx_codec_cx_pkt_t *cx_pkt, + struct RateControlStats *rc, VpxVideoWriter **outfile, + const uint32_t frame_cnt, const double framerate) { + int num_layers_encoded = 0; + unsigned int sl, tl; + uint64_t sizes[8]; + uint64_t sizes_parsed[8]; + int count = 0; + double sum_bitrate = 0.0; + double sum_bitrate2 = 0.0; + vp9_zero(sizes); + vp9_zero(sizes_parsed); + vpx_codec_control(codec, VP9E_GET_SVC_LAYER_ID, layer_id); + parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, + sizes_parsed, &count); + if (enc_cfg->ss_number_layers == 1) sizes[0] = cx_pkt->data.frame.sz; + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sizes[sl] = 0; + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + unsigned int sl2; + uint64_t tot_size = 0; +#if SIMULCAST_MODE + for (sl2 = 0; sl2 < sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + vpx_video_writer_write_frame(outfile[sl], + (uint8_t *)(cx_pkt->data.frame.buf) + tot_size, + (size_t)(sizes[sl]), cx_pkt->data.frame.pts); +#else + for (sl2 = 0; sl2 <= sl; ++sl2) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl2]) tot_size += sizes[sl2]; + } + if (tot_size > 0) + vpx_video_writer_write_frame(outfile[sl], cx_pkt->data.frame.buf, + (size_t)(tot_size), cx_pkt->data.frame.pts); +#endif // SIMULCAST_MODE + } + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) { + for (tl = layer_id->temporal_layer_id; tl < enc_cfg->ts_number_layers; + ++tl) { + const int layer = sl * enc_cfg->ts_number_layers + tl; + ++rc->layer_tot_enc_frames[layer]; + rc->layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; + // Keep count of rate control stats per layer, for non-key + // frames. + if (tl == (unsigned int)layer_id->temporal_layer_id && + !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { + rc->layer_avg_frame_size[layer] += 8.0 * sizes[sl]; + rc->layer_avg_rate_mismatch[layer] += + fabs(8.0 * sizes[sl] - rc->layer_pfb[layer]) / + rc->layer_pfb[layer]; + ++rc->layer_enc_frames[layer]; + } + } + } + } + + // Update for short-time encoding bitrate states, for moving + // window of size rc->window, shifted by rc->window / 2. + // Ignore first window segment, due to key frame. + if (frame_cnt > (unsigned int)rc->window_size) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + if (cx_pkt->data.frame.spatial_layer_encoded[sl]) + sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; + } + if (frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate / rc->window_size) * (sum_bitrate / rc->window_size); + } + } + + // Second shifted window. 
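+ // (Same accumulation as the first window, offset by window_size / 2; the
+ // running sum and sum of squares collected here become the average and
+ // rms-variance reported by printout_rate_control_summary().)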
+ if (frame_cnt > (unsigned int)(rc->window_size + rc->window_size / 2)) { + for (sl = 0; sl < enc_cfg->ss_number_layers; ++sl) { + sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; + } + + if (frame_cnt > (unsigned int)(2 * rc->window_size) && + frame_cnt % rc->window_size == 0) { + rc->window_count += 1; + rc->avg_st_encoding_bitrate += sum_bitrate2 / rc->window_size; + rc->variance_st_encoding_bitrate += + (sum_bitrate2 / rc->window_size) * (sum_bitrate2 / rc->window_size); } } } +#endif int main(int argc, const char **argv) { AppInput app_input; VpxVideoWriter *writer = NULL; VpxVideoInfo info; - vpx_codec_ctx_t codec; + vpx_codec_ctx_t encoder; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; + vpx_svc_frame_drop_t svc_drop_frame; uint32_t i; uint32_t frame_cnt = 0; vpx_image_t raw; vpx_codec_err_t res; int pts = 0; /* PTS starts at 0 */ int frame_duration = 1; /* 1 timebase tick per frame */ - FILE *infile = NULL; int end_of_stream = 0; int frames_received = 0; #if OUTPUT_RC_STATS - VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL }; + VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL }; struct RateControlStats rc; vpx_svc_layer_id_t layer_id; vpx_svc_ref_frame_config_t ref_frame_config; - unsigned int sl, tl; - double sum_bitrate = 0.0; - double sum_bitrate2 = 0.0; + unsigned int sl; double framerate = 30.0; #endif struct vpx_usec_timer timer; int64_t cx_time = 0; +#if CONFIG_INTERNAL_STATS + FILE *f = fopen("opsnr.stt", "a"); +#endif +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + int mismatch_seen = 0; + vpx_codec_ctx_t decoder; +#endif memset(&svc_ctx, 0, sizeof(svc_ctx)); - svc_ctx.log_print = 1; + memset(&app_input, 0, sizeof(AppInput)); + memset(&info, 0, sizeof(VpxVideoInfo)); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&rc, 0, sizeof(struct RateControlStats)); exec_name = argv[0]; + + /* Setup default input stream settings */ + app_input.input_ctx.framerate.numerator = 30; + app_input.input_ctx.framerate.denominator = 1; + app_input.input_ctx.only_i420 = 1; + app_input.input_ctx.bit_depth = 0; + parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg); + // Y4M reader handles its own allocation. + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, - enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, - enc_cfg.g_w, enc_cfg.g_h, 32)) { - die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); - } + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? 
VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, + enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } #else - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { - die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); - } + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32)) { + die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); + } #endif // CONFIG_VP9_HIGHBITDEPTH - - if (!(infile = fopen(app_input.input_filename, "rb"))) - die("Failed to open %s for reading\n", app_input.input_filename); + } // Initialize codec - if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) != + if (vpx_svc_init(&svc_ctx, &encoder, vpx_codec_vp9_cx(), &enc_cfg) != VPX_CODEC_OK) die("Failed to initialize encoder\n"); +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_dec_init( + &decoder, get_vpx_decoder_by_name("vp9")->codec_interface(), NULL, 0)) + die("Failed to initialize decoder\n"); +#endif #if OUTPUT_RC_STATS + rc.window_count = 1; + rc.window_size = 15; // Silence a static analysis warning. + rc.avg_st_encoding_bitrate = 0.0; + rc.variance_st_encoding_bitrate = 0.0; if (svc_ctx.output_rc_stat) { set_rate_control_stats(&rc, &enc_cfg); framerate = enc_cfg.g_timebase.den / enc_cfg.g_timebase.num; @@ -668,6 +996,8 @@ int main(int argc, const char **argv) { #endif info.codec_fourcc = VP9_FOURCC; + info.frame_width = enc_cfg.g_w; + info.frame_height = enc_cfg.g_h; info.time_base.numerator = enc_cfg.g_timebase.num; info.time_base.denominator = enc_cfg.g_timebase.den; @@ -679,43 +1009,65 @@ int main(int argc, const char **argv) { die("Failed to open %s for writing\n", app_input.output_filename); } #if OUTPUT_RC_STATS - // For now, just write temporal layer streams. - // TODO(marpan): do spatial by re-writing superframe. + // Write out spatial layer stream. + // TODO(marpan/jianj): allow for writing each spatial and temporal stream. 
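+ // (With SIMULCAST_MODE off, svc_output_rc_stats() above writes each
+ // _s<sl>.ivf with the superframe bytes up to and including spatial layer
+ // sl, so each per-layer stream opened below is decodable on its own.)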
if (svc_ctx.output_rc_stat) { - for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { char file_name[PATH_MAX]; - snprintf(file_name, sizeof(file_name), "%s_t%d.ivf", - app_input.output_filename, tl); - outfile[tl] = vpx_video_writer_open(file_name, kContainerIVF, &info); - if (!outfile[tl]) die("Failed to open %s for writing", file_name); + snprintf(file_name, sizeof(file_name), "%s_s%d.ivf", + app_input.output_filename, sl); + outfile[sl] = vpx_video_writer_open(file_name, kContainerIVF, &info); + if (!outfile[sl]) die("Failed to open %s for writing", file_name); } } #endif // skip initial frames - for (i = 0; i < app_input.frames_to_skip; ++i) vpx_img_read(&raw, infile); + for (i = 0; i < app_input.frames_to_skip; ++i) + read_frame(&app_input.input_ctx, &raw); if (svc_ctx.speed != -1) - vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); + vpx_codec_control(&encoder, VP8E_SET_CPUUSED, svc_ctx.speed); if (svc_ctx.threads) { - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); + vpx_codec_control(&encoder, VP9E_SET_TILE_COLUMNS, + get_msb(svc_ctx.threads)); if (svc_ctx.threads > 1) - vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 1); else - vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + vpx_codec_control(&encoder, VP9E_SET_ROW_MT, 0); } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) - vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&encoder, VP9E_SET_AQ_MODE, 3); if (svc_ctx.speed >= 5) - vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); - vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + vpx_codec_control(&encoder, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&encoder, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); + + vpx_codec_control(&encoder, VP9E_SET_SVC_INTER_LAYER_PRED, + app_input.inter_layer_pred); + + vpx_codec_control(&encoder, VP9E_SET_NOISE_SENSITIVITY, 0); + + vpx_codec_control(&encoder, VP9E_SET_TUNE_CONTENT, app_input.tune_content); + + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (sl = 0; sl < (unsigned int)svc_ctx.spatial_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = enc_cfg.rc_dropframe_thresh; + svc_drop_frame.max_consec_drop = INT_MAX; + vpx_codec_control(&encoder, VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); // Encode frames while (!end_of_stream) { vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *cx_pkt; - if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) { + // Example patterns for bypass/flexible mode: + // example_pattern = 0: 2 temporal layers, and spatial_layers = 1,2,3; + // matches the fixed SVC patterns exactly. example_pattern = 1: 2 spatial + // and 2 temporal layers, where SL0 has only TL0 and SL1 has both TL0 and + // TL1. This example uses the extended API. + int example_pattern = 0; + if (frame_cnt >= app_input.frames_to_code || + !read_frame(&app_input.input_ctx, &raw)) { // We need one extra vpx_svc_encode call at end of stream to flush // encoder and get remaining data end_of_stream = 1; @@ -723,140 +1075,97 @@ int main(int argc, const char **argv) { // For BYPASS/FLEXIBLE mode, set the frame flags (reference and updates) // and the buffer indices for each spatial layer of the current - // (super)frame to be encoded. The temporal layer_id for the current frame - // also needs to be set. + // (super)frame to be encoded. The spatial and temporal layer_id for the + // current frame also needs to be set.
// TODO(marpan): Should rename the "VP9E_TEMPORAL_LAYERING_MODE_BYPASS" // mode to "VP9E_LAYERING_MODE_BYPASS". if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { layer_id.spatial_layer_id = 0; // Example for 2 temporal layers. - if (frame_cnt % 2 == 0) + if (frame_cnt % 2 == 0) { layer_id.temporal_layer_id = 0; - else + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 0; + } else { layer_id.temporal_layer_id = 1; - // Note that we only set the temporal layer_id, since we are calling - // the encode for the whole superframe. The encoder will internally loop - // over all the spatial layers for the current superframe. - vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); - set_frame_flags_bypass_mode(sl, layer_id.temporal_layer_id, - svc_ctx.spatial_layers, frame_cnt == 0, - &ref_frame_config); - vpx_codec_control(&codec, VP9E_SET_SVC_REF_FRAME_CONFIG, + for (i = 0; i < VPX_SS_MAX_LAYERS; i++) + layer_id.temporal_layer_id_per_spatial[i] = 1; + } + if (example_pattern == 1) { + // example_pattern 1 is hard-coded for 2 spatial and 2 temporal layers. + assert(svc_ctx.spatial_layers == 2); + assert(svc_ctx.temporal_layers == 2); + if (frame_cnt % 2 == 0) { + // Spatial layer 0 and 1 are encoded. + layer_id.temporal_layer_id_per_spatial[0] = 0; + layer_id.temporal_layer_id_per_spatial[1] = 0; + layer_id.spatial_layer_id = 0; + } else { + // Only spatial layer 1 is encoded here. + layer_id.temporal_layer_id_per_spatial[1] = 1; + layer_id.spatial_layer_id = 1; + } + } + vpx_codec_control(&encoder, VP9E_SET_SVC_LAYER_ID, &layer_id); + // TODO(jianj): Fix the parameter passing for "is_key_frame" in + // set_frame_flags_bypass_mode_ex0/_ex1() for the case of periodic key frames. + if (example_pattern == 0) { + set_frame_flags_bypass_mode_ex0(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } else if (example_pattern == 1) { + set_frame_flags_bypass_mode_ex1(layer_id.temporal_layer_id, + svc_ctx.spatial_layers, frame_cnt == 0, + &ref_frame_config); + } + ref_frame_config.duration[0] = frame_duration * 1; + ref_frame_config.duration[1] = frame_duration * 1; + + vpx_codec_control(&encoder, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); // Keep track of input frames, to account for frame drops in rate control // stats/metrics. - for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) { + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id]; } + } else { + // For the fixed pattern SVC, temporal layer is given by superframe count. + unsigned int tl = 0; + if (enc_cfg.ts_number_layers == 2) + tl = (frame_cnt % 2 != 0); + else if (enc_cfg.ts_number_layers == 3) { + if (frame_cnt % 2 != 0) tl = 2; + if ((frame_cnt > 1) && ((frame_cnt - 2) % 4 == 0)) tl = 1; + } + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) + ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + tl]; } vpx_usec_timer_start(&timer); res = vpx_svc_encode( - &svc_ctx, &codec, (end_of_stream ? NULL : &raw), pts, frame_duration, + &svc_ctx, &encoder, (end_of_stream ? NULL : &raw), pts, frame_duration, svc_ctx.speed >= 5 ?
VPX_DL_REALTIME : VPX_DL_GOOD_QUALITY); vpx_usec_timer_mark(&timer); cx_time += vpx_usec_timer_elapsed(&timer); - printf("%s", vpx_svc_get_message(&svc_ctx)); fflush(stdout); if (res != VPX_CODEC_OK) { - die_codec(&codec, "Failed to encode frame"); + die_codec(&encoder, "Failed to encode frame"); } - while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) { + while ((cx_pkt = vpx_codec_get_cx_data(&encoder, &iter)) != NULL) { switch (cx_pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { -#if OUTPUT_RC_STATS - uint64_t sizes[8]; - int count = 0; -#endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { - vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); - parse_superframe_index(cx_pkt->data.frame.buf, - cx_pkt->data.frame.sz, sizes, &count); - if (enc_cfg.ss_number_layers == 1) - sizes[0] = cx_pkt->data.frame.sz; - // Note computing input_layer_frames here won't account for frame - // drops in rate control stats. - // TODO(marpan): Fix this for non-bypass mode so we can get stats - // for dropped frames. - if (svc_ctx.temporal_layering_mode != - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers + - layer_id.temporal_layer_id]; - } - } - for (tl = layer_id.temporal_layer_id; - tl < enc_cfg.ts_number_layers; ++tl) { - vpx_video_writer_write_frame( - outfile[tl], cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, - cx_pkt->data.frame.pts); - } - - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - for (tl = layer_id.temporal_layer_id; - tl < enc_cfg.ts_number_layers; ++tl) { - const int layer = sl * enc_cfg.ts_number_layers + tl; - ++rc.layer_tot_enc_frames[layer]; - rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl]; - // Keep count of rate control stats per layer, for non-key - // frames. - if (tl == (unsigned int)layer_id.temporal_layer_id && - !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) { - rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl]; - rc.layer_avg_rate_mismatch[layer] += - fabs(8.0 * sizes[sl] - rc.layer_pfb[layer]) / - rc.layer_pfb[layer]; - ++rc.layer_enc_frames[layer]; - } - } - } - - // Update for short-time encoding bitrate states, for moving - // window of size rc->window, shifted by rc->window / 2. - // Ignore first window segment, due to key frame. - if (frame_cnt > (unsigned int)rc.window_size) { - tl = layer_id.temporal_layer_id; - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate; - } - if (frame_cnt % rc.window_size == 0) { - rc.window_count += 1; - rc.avg_st_encoding_bitrate += sum_bitrate / rc.window_size; - rc.variance_st_encoding_bitrate += - (sum_bitrate / rc.window_size) * - (sum_bitrate / rc.window_size); - sum_bitrate = 0.0; - } - } - - // Second shifted window. 
- if (frame_cnt > - (unsigned int)(rc.window_size + rc.window_size / 2)) { - tl = layer_id.temporal_layer_id; - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate; - } - - if (frame_cnt > (unsigned int)(2 * rc.window_size) && - frame_cnt % rc.window_size == 0) { - rc.window_count += 1; - rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size; - rc.variance_st_encoding_bitrate += - (sum_bitrate2 / rc.window_size) * - (sum_bitrate2 / rc.window_size); - sum_bitrate2 = 0.0; - } - } + svc_output_rc_stats(&encoder, &enc_cfg, &layer_id, cx_pkt, &rc, + outfile, frame_cnt, framerate); } #endif } @@ -868,6 +1177,11 @@ int main(int argc, const char **argv) { if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1) si->bytes_sum[0] += (int)cx_pkt->data.frame.sz; ++frames_received; +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf, + (unsigned int)cx_pkt->data.frame.sz, NULL, 0)) + die_codec(&decoder, "Failed to decode frame."); +#endif break; } case VPX_CODEC_STATS_PKT: { @@ -877,6 +1191,19 @@ int main(int argc, const char **argv) { } default: { break; } } + +#if CONFIG_VP9_DECODER && !SIMULCAST_MODE + vpx_codec_control(&encoder, VP9E_GET_SVC_LAYER_ID, &layer_id); + // Don't look for mismatch on top spatial and top temporal layers as they + // are non reference frames. + if ((enc_cfg.ss_number_layers > 1 || enc_cfg.ts_number_layers > 1) && + !(layer_id.temporal_layer_id > 0 && + layer_id.temporal_layer_id == (int)enc_cfg.ts_number_layers - 1 && + cx_pkt->data.frame + .spatial_layer_encoded[enc_cfg.ss_number_layers - 1])) { + test_decode(&encoder, &decoder, frame_cnt, &mismatch_seen); + } +#endif } if (!end_of_stream) { @@ -885,41 +1212,45 @@ int main(int argc, const char **argv) { } } - // Compensate for the extra frame count for the bypass mode. 
- if (svc_ctx.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { - const int layer = - sl * enc_cfg.ts_number_layers + layer_id.temporal_layer_id; - --rc.layer_input_frames[layer]; - } - } - printf("Processed %d frames\n", frame_cnt); - fclose(infile); + + close_input_file(&app_input.input_ctx); + #if OUTPUT_RC_STATS if (svc_ctx.output_rc_stat) { printout_rate_control_summary(&rc, &enc_cfg, frame_cnt); printf("\n"); } #endif - if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec"); + if (vpx_codec_destroy(&encoder)) + die_codec(&encoder, "Failed to destroy codec"); if (app_input.passes == 2) stats_close(&app_input.rc_stats, 1); if (writer) { vpx_video_writer_close(writer); } #if OUTPUT_RC_STATS if (svc_ctx.output_rc_stat) { - for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { - vpx_video_writer_close(outfile[tl]); + for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) { + vpx_video_writer_close(outfile[sl]); } } +#endif +#if CONFIG_INTERNAL_STATS + if (mismatch_seen) { + fprintf(f, "First mismatch occurred in frame %d\n", mismatch_seen); + } else { + fprintf(f, "No mismatch detected in recon buffers\n"); + } + fclose(f); #endif printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", frame_cnt, 1000 * (float)cx_time / (double)(frame_cnt * 1000000), 1000000 * (double)frame_cnt / (double)cx_time); - vpx_img_free(&raw); + if (app_input.input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } // display average size, psnr - printf("%s", vpx_svc_dump_statistics(&svc_ctx)); + vpx_svc_dump_statistics(&svc_ctx); vpx_svc_release(&svc_ctx); return EXIT_SUCCESS; } diff --git a/libs/libvpx/examples/vp9cx_set_ref.c b/libs/libvpx/examples/vp9cx_set_ref.c index 3472689db2f..911ad38630c 100644 --- a/libs/libvpx/examples/vp9cx_set_ref.c +++ b/libs/libvpx/examples/vp9cx_set_ref.c @@ -68,128 +68,6 @@ void usage_exit() { exit(EXIT_FAILURE); } -static int compare_img(const vpx_image_t *const img1, - const vpx_image_t *const img2) { - uint32_t l_w = img1->d_w; - uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - uint32_t i; - int match = 1; - - match &= (img1->fmt == img2->fmt); - match &= (img1->d_w == img2->d_w); - match &= (img1->d_h == img2->d_h); - - for (i = 0; i < img1->d_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - l_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - c_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - c_w) == 0); - - return match; -} - -#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) -static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != - *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l); - yloc[3] = *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l) != - *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l); - uloc[3] = *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l); - match = 0; - break; - } - } - } - } - } - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l) != - *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l); - vloc[3] = *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l); - match = 0; - break; - } - } - } - } - } -} - static void testing_decode(vpx_codec_ctx_t *encoder, vpx_codec_ctx_t *decoder, unsigned int frame_out, int *mismatch_seen) { vpx_image_t enc_img, dec_img; diff --git a/libs/libvpx/examples/vpx_dec_fuzzer.cc b/libs/libvpx/examples/vpx_dec_fuzzer.cc new file mode 100644 index 00000000000..d55fe1571be --- /dev/null +++ b/libs/libvpx/examples/vpx_dec_fuzzer.cc @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Fuzzer for libvpx decoders + * ========================== + * Requirements + * -------------- + * Requires Clang 6.0 or above as -fsanitize=fuzzer is used as a linker + * option. + + * Steps to build + * -------------- + * Clone libvpx repository + $git clone https://chromium.googlesource.com/webm/libvpx + + * Create a directory in parallel to libvpx and change directory + $mkdir vpx_dec_fuzzer + $cd vpx_dec_fuzzer/ + + * Enable sanitizers (Supported: address integer memory thread undefined) + $source ../libvpx/tools/set_analyzer_env.sh address + + * Configure libvpx. + * Note --size-limit and VPX_MAX_ALLOCABLE_MEMORY are defined to avoid + * Out of memory errors when running generated fuzzer binary + $../libvpx/configure --disable-unit-tests --size-limit=12288x12288 \ + --extra-cflags="-fsanitize=fuzzer-no-link \ + -DVPX_MAX_ALLOCABLE_MEMORY=1073741824" \ + --disable-webm-io --enable-debug --disable-vp8-encoder \ + --disable-vp9-encoder --disable-examples + + * Build libvpx + $make -j32 + + * Build vp9 fuzzer + $ $CXX $CXXFLAGS -std=c++11 -DDECODER=vp9 \ + -fsanitize=fuzzer -I../libvpx -I. -Wl,--start-group \ + ../libvpx/examples/vpx_dec_fuzzer.cc -o ./vpx_dec_fuzzer_vp9 \ + ./libvpx.a -Wl,--end-group + + * DECODER should be defined as vp9 or vp8 to enable vp9/vp8 + * + * Create a corpus directory and copy some ivf files there. + * Based on which codec (vp8/vp9) is being tested, it is recommended to + * have corresponding ivf files in corpus directory + * An empty corpus directory is also acceptable, though not recommended + $mkdir CORPUS && cp some-files CORPUS + + * Run fuzzing: + $./vpx_dec_fuzzer_vp9 CORPUS + + * References: + * http://llvm.org/docs/LibFuzzer.html + * https://github.com/google/oss-fuzz + */ + +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <algorithm> +#include <memory> + +#include "vpx/vp8dx.h" +#include "vpx/vpx_decoder.h" +#include "vpx_ports/mem_ops.h" + +#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */ +#define IVF_FILE_HDR_SZ 32 + +#define VPXD_INTERFACE(name) VPXD_INTERFACE_(name) +#define VPXD_INTERFACE_(name) vpx_codec_##name##_dx() + +extern "C" void usage_exit(void) { exit(EXIT_FAILURE); } + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { + if (size <= IVF_FILE_HDR_SZ) { + return 0; + } + + vpx_codec_ctx_t codec; + // Set thread count in the range [1, 64].
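+ // (Deriving the thread count from the first byte past the IVF file header
+ // keeps each corpus file deterministic: (byte & 0x3f) is 0..63, so the
+ // result below is 1..64.)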
+ const unsigned int threads = (data[IVF_FILE_HDR_SZ] & 0x3f) + 1; + vpx_codec_dec_cfg_t cfg = { threads, 0, 0 }; + if (vpx_codec_dec_init(&codec, VPXD_INTERFACE(DECODER), &cfg, 0)) { + return 0; + } + + data += IVF_FILE_HDR_SZ; + size -= IVF_FILE_HDR_SZ; + + while (size > IVF_FRAME_HDR_SZ) { + size_t frame_size = mem_get_le32(data); + size -= IVF_FRAME_HDR_SZ; + data += IVF_FRAME_HDR_SZ; + frame_size = std::min(size, frame_size); + + const vpx_codec_err_t err = + vpx_codec_decode(&codec, data, frame_size, nullptr, 0); + static_cast<void>(err); + vpx_codec_iter_t iter = nullptr; + vpx_image_t *img = nullptr; + while ((img = vpx_codec_get_frame(&codec, &iter)) != nullptr) { + } + data += frame_size; + size -= frame_size; + } + vpx_codec_destroy(&codec); + return 0; +} diff --git a/libs/libvpx/examples/vpx_temporal_svc_encoder.c b/libs/libvpx/examples/vpx_temporal_svc_encoder.c index f5736ea45d2..6afbee83d22 100644 --- a/libs/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/libs/libvpx/examples/vpx_temporal_svc_encoder.c @@ -19,14 +19,18 @@ #include <string.h> #include "./vpx_config.h" +#include "./y4minput.h" #include "../vpx_ports/vpx_timer.h" #include "vpx/vp8cx.h" #include "vpx/vpx_encoder.h" +#include "vpx_ports/bitops.h" #include "../tools_common.h" #include "../video_writer.h" -#define VP8_ROI_MAP 0 +#define ROI_MAP 0 + +#define zero(Dest) memset(&(Dest), 0, sizeof(Dest)); static const char *exec_name; @@ -89,19 +93,21 @@ struct RateControlMetrics { // in the stream. static void set_rate_control_metrics(struct RateControlMetrics *rc, vpx_codec_enc_cfg_t *cfg) { - unsigned int i = 0; + int i = 0; // Set the layer (cumulative) framerate and the target layer (non-cumulative) // per-frame-bandwidth, for the rate control encoding stats below. const double framerate = cfg->g_timebase.den / cfg->g_timebase.num; + const int ts_number_layers = cfg->ts_number_layers; rc->layer_framerate[0] = framerate / cfg->ts_rate_decimator[0]; rc->layer_pfb[0] = 1000.0 * rc->layer_target_bitrate[0] / rc->layer_framerate[0]; - for (i = 0; i < cfg->ts_number_layers; ++i) { + for (i = 0; i < ts_number_layers; ++i) { if (i > 0) { rc->layer_framerate[i] = framerate / cfg->ts_rate_decimator[i]; - rc->layer_pfb[i] = 1000.0 * (rc->layer_target_bitrate[i] - - rc->layer_target_bitrate[i - 1]) / - (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); + rc->layer_pfb[i] = + 1000.0 * + (rc->layer_target_bitrate[i] - rc->layer_target_bitrate[i - 1]) / + (rc->layer_framerate[i] - rc->layer_framerate[i - 1]); } rc->layer_input_frames[i] = 0; rc->layer_enc_frames[i] = 0; @@ -114,6 +120,9 @@ static void set_rate_control_metrics(struct RateControlMetrics *rc, rc->window_size = 15; rc->avg_st_encoding_bitrate = 0.0; rc->variance_st_encoding_bitrate = 0.0; + // Target bandwidth for the whole stream. + // Set to layer_target_bitrate for highest layer (total bitrate). + cfg->rc_target_bitrate = rc->layer_target_bitrate[ts_number_layers - 1]; } static void printout_rate_control_summary(struct RateControlMetrics *rc, @@ -164,38 +173,60 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! \n"); } -#if VP8_ROI_MAP -static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { +#if ROI_MAP +static void set_roi_map(const char *enc_name, vpx_codec_enc_cfg_t *cfg, + vpx_roi_map_t *roi) { unsigned int i, j; - memset(roi, 0, sizeof(*roi)); + int block_size = 0; + uint8_t is_vp8 = strncmp(enc_name, "vp8", 3) == 0 ?
1 : 0; + uint8_t is_vp9 = strncmp(enc_name, "vp9", 3) == 0 ? 1 : 0; + if (!is_vp8 && !is_vp9) { + die("unsupported codec."); + } + zero(*roi); + + block_size = is_vp9 && !is_vp8 ? 8 : 16; // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for // segment is 16x16 for vp8, 8x8 for vp9. - roi->rows = (cfg->g_h + 15) / 16; - roi->cols = (cfg->g_w + 15) / 16; + roi->rows = (cfg->g_h + block_size - 1) / block_size; + roi->cols = (cfg->g_w + block_size - 1) / block_size; // Applies delta QP on the segment blocks, varies from -63 to 63. // Setting to negative means lower QP (better quality). // Below we set delta_q to the extreme (-63) to show strong effect. - roi->delta_q[0] = 0; + // VP8 uses the first 4 segments. VP9 uses all 8 segments. + zero(roi->delta_q); roi->delta_q[1] = -63; - roi->delta_q[2] = 0; - roi->delta_q[3] = 0; // Applies delta loopfilter strength on the segment blocks, varies from -63 to - // 63. Setting to positive means stronger loopfilter. - roi->delta_lf[0] = 0; - roi->delta_lf[1] = 0; - roi->delta_lf[2] = 0; - roi->delta_lf[3] = 0; - - // Applies skip encoding threshold on the segment blocks, varies from 0 to - // UINT_MAX. Larger value means more skipping of encoding is possible. - // This skip threshold only applies on delta frames. - roi->static_threshold[0] = 0; - roi->static_threshold[1] = 0; - roi->static_threshold[2] = 0; - roi->static_threshold[3] = 0; + // 63. Setting to positive means stronger loopfilter. VP8 uses the first 4 + // segments. VP9 uses all 8 segments. + zero(roi->delta_lf); + + if (is_vp8) { + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + zero(roi->static_threshold); + } + + if (is_vp9) { + // Apply skip segment. Setting to 1 means this block will be copied from + // previous frame. + zero(roi->skip); + } + + if (is_vp9) { + // Apply ref frame segment. + // -1 : Do not apply this segment. + // 0 : Force using intra. + // 1 : Force using last. + // 2 : Force using golden. + // 3 : Force using altref, but not used in non-rd pickmode for 0 lag. + memset(roi->ref_frame, -1, sizeof(roi->ref_frame)); + roi->ref_frame[1] = 1; + } // Use 2 states: 1 is center square, 0 is the rest. roi->roi_map = @@ -563,12 +594,12 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VP8_ROI_MAP +#if ROI_MAP vpx_roi_map_t roi; #endif - vpx_svc_layer_id_t layer_id = { 0, 0 }; + vpx_svc_layer_id_t layer_id; const VpxInterface *encoder = NULL; - FILE *infile = NULL; + struct VpxInputContext input_ctx; struct RateControlMetrics rc; int64_t cx_time = 0; const int min_args_base = 13; @@ -583,6 +614,15 @@ int main(int argc, char **argv) { double sum_bitrate2 = 0.0; double framerate = 30.0; + zero(rc.layer_target_bitrate); + memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t)); + memset(&input_ctx, 0, sizeof(input_ctx)); + /* Setup default input stream settings */ + input_ctx.framerate.numerator = 30; + input_ctx.framerate.denominator = 1; + input_ctx.only_i420 = 1; + input_ctx.bit_depth = 0; + exec_name = argv[0]; // Check usage and arguments.
if (argc < min_args) { @@ -621,6 +661,9 @@ int main(int argc, char **argv) { die("Invalid number of arguments"); } + input_ctx.filename = argv[1]; + open_input_file(&input_ctx); + #if CONFIG_VP9_HIGHBITDEPTH switch (strtol(argv[argc - 1], NULL, 0)) { case 8: @@ -637,14 +680,22 @@ int main(int argc, char **argv) { break; default: die("Invalid bit depth (8, 10, 12) %s", argv[argc - 1]); } - if (!vpx_img_alloc( - &raw, bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, - width, height, 32)) { - die("Failed to allocate image", width, height); + + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc( + &raw, + bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016, + width, height, 32)) { + die("Failed to allocate image", width, height); + } } #else - if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { - die("Failed to allocate image", width, height); + // Y4M reader has its own allocation. + if (input_ctx.file_type != FILE_TYPE_Y4M) { + if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) { + die("Failed to allocate image", width, height); + } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -675,6 +726,9 @@ int main(int argc, char **argv) { if (speed < 0) { die("Invalid speed setting: must be positive"); } + if (strncmp(encoder->name, "vp9", 3) == 0 && speed > 9) { + warn("Mapping speed %d to speed 9.\n", speed); + } for (i = min_args_base; (int)i < min_args_base + mode_to_num_layers[layering_mode]; ++i) { @@ -722,13 +776,15 @@ int main(int argc, char **argv) { set_rate_control_metrics(&rc, &cfg); - // Target bandwidth for the whole stream. - // Set to layer_target_bitrate for highest layer (total bitrate). - cfg.rc_target_bitrate = rc.layer_target_bitrate[cfg.ts_number_layers - 1]; - - // Open input file. 
- if (!(infile = fopen(argv[1], "rb"))) { - die("Failed to open %s for reading", argv[1]); + if (input_ctx.file_type == FILE_TYPE_Y4M) { + if (input_ctx.width != cfg.g_w || input_ctx.height != cfg.g_h) { + die("Incorrect width or height: %d x %d", cfg.g_w, cfg.g_h); + } + if (input_ctx.framerate.numerator != cfg.g_timebase.den || + input_ctx.framerate.denominator != cfg.g_timebase.num) { + die("Incorrect framerate: numerator %d denominator %d", + cfg.g_timebase.num, cfg.g_timebase.den); + } } framerate = cfg.g_timebase.den / cfg.g_timebase.num; @@ -766,8 +822,8 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); -#if VP8_ROI_MAP - vp8_set_roi_map(&cfg, &roi); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) die_codec(&codec, "Failed to set ROI map"); #endif @@ -783,7 +839,13 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); - vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); + vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads)); +#if ROI_MAP + set_roi_map(encoder->name, &cfg, &roi); + if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); + vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0); +#endif // TODO(marpan/jianj): There is an issue with row-mt for low resolutions at // high speed settings, disable its use for those cases for now. if (cfg.g_threads > 1 && ((cfg.g_w > 320 && cfg.g_h > 240) || speed < 7)) @@ -822,6 +884,7 @@ int main(int argc, char **argv) { layer_id.spatial_layer_id = 0; layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; + layer_id.temporal_layer_id_per_spatial[0] = layer_id.temporal_layer_id; if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP9E_SET_SVC_LAYER_ID, &layer_id); } else if (strncmp(encoder->name, "vp8", 3) == 0) { @@ -830,7 +893,7 @@ int main(int argc, char **argv) { } flags = layer_flags[frame_cnt % flag_periodicity]; if (layering_mode == 0) flags = 0; - frame_avail = vpx_img_read(&raw, infile); + frame_avail = read_frame(&input_ctx, &raw); if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id]; vpx_usec_timer_start(&timer); if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags, @@ -898,7 +961,7 @@ int main(int argc, char **argv) { ++frame_cnt; pts += frame_duration; } - fclose(infile); + close_input_file(&input_ctx); printout_rate_control_summary(&rc, &cfg, frame_cnt); printf("\n"); printf("Frame cnt and encoding time/FPS stats for encoding: %d %f %f \n", @@ -910,6 +973,12 @@ int main(int argc, char **argv) { // Try to rewrite the output file headers with the actual frame count.
for (i = 0; i < cfg.ts_number_layers; ++i) vpx_video_writer_close(outfile[i]); - vpx_img_free(&raw); + if (input_ctx.file_type != FILE_TYPE_Y4M) { + vpx_img_free(&raw); + } + +#if ROI_MAP + free(roi.roi_map); +#endif return EXIT_SUCCESS; } diff --git a/libs/libvpx/ivfdec.c b/libs/libvpx/ivfdec.c index f64e594ab0e..3e179bc6ed2 100644 --- a/libs/libvpx/ivfdec.c +++ b/libs/libvpx/ivfdec.c @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } diff --git a/libs/libvpx/ivfdec.h b/libs/libvpx/ivfdec.h index af725572b48..847cd79f3fe 100644 --- a/libs/libvpx/ivfdec.h +++ b/libs/libvpx/ivfdec.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef IVFDEC_H_ -#define IVFDEC_H_ +#ifndef VPX_IVFDEC_H_ +#define VPX_IVFDEC_H_ #include "./tools_common.h" @@ -25,4 +25,4 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, } /* extern "C" */ #endif -#endif // IVFDEC_H_ +#endif // VPX_IVFDEC_H_ diff --git a/libs/libvpx/ivfenc.h b/libs/libvpx/ivfenc.h index ebdce47be8f..483f2d2c591 100644 --- a/libs/libvpx/ivfenc.h +++ b/libs/libvpx/ivfenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef IVFENC_H_ -#define IVFENC_H_ +#ifndef VPX_IVFENC_H_ +#define VPX_IVFENC_H_ #include "./tools_common.h" @@ -30,4 +30,4 @@ void ivf_write_frame_size(FILE *outfile, size_t frame_size); } /* extern "C" */ #endif -#endif // IVFENC_H_ +#endif // VPX_IVFENC_H_ diff --git a/libs/libvpx/libs.doxy_template b/libs/libvpx/libs.doxy_template index 5a8f847280e..1eacc8fe2db 100644 --- a/libs/libvpx/libs.doxy_template +++ b/libs/libvpx/libs.doxy_template @@ -943,18 +943,6 @@ GENERATE_XML = NO XML_OUTPUT = xml -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. 
Note that diff --git a/libs/libvpx/libs.mk b/libs/libvpx/libs.mk index a3e2f9d0ebd..67d7512abea 100644 --- a/libs/libvpx/libs.mk +++ b/libs/libvpx/libs.mk @@ -88,7 +88,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS)) CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h - INSTALL-LIBS-$(CONFIG_SPATIAL_SVC) += include/vpx/svc_context.h INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/% CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h CODEC_DOC_SECTIONS += vp9 vp9_encoder @@ -113,13 +112,6 @@ ifeq ($(CONFIG_DECODERS),yes) CODEC_DOC_SECTIONS += decoder endif -# Suppress -Wextra warnings in third party code. -$(BUILD_PFX)third_party/googletest/%.cc.o: CXXFLAGS += -Wno-missing-field-initializers -# Suppress -Wextra warnings in first party code pending investigation. -# https://bugs.chromium.org/p/webm/issues/detail?id=1069 -$(BUILD_PFX)vp8/encoder/onyx_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered -$(BUILD_PFX)vp8/decoder/onyxd_if.c.o: CFLAGS += -Wno-unknown-warning-option -Wno-clobbered - ifeq ($(CONFIG_MSVS),yes) CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd) GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd) @@ -153,9 +145,6 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += vpx_dsp/x86/bitdepth_conversion_sse2.asm endif CODEC_EXPORTS-yes += vpx/exports_com CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_enc -ifeq ($(CONFIG_SPATIAL_SVC),yes) -CODEC_EXPORTS-$(CONFIG_ENCODERS) += vpx/exports_spatial_svc -endif CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec INSTALL-LIBS-yes += include/vpx/vpx_codec.h @@ -206,6 +195,8 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def +vpx.$(VCPROJ_SFX): VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) + vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ @@ -218,7 +209,15 @@ vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def --ver=$(CONFIG_VS_VERSION) \ --src-path-bare="$(SRC_PATH_BARE)" \ --out=$@ $(CFLAGS) \ - $(filter-out $(addprefix %, $(ASM_INCLUDES)), $^) \ + $(filter $(SRC_PATH_BARE)/vp8/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp8/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.c, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vp9/%.h, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx/%, $(VCPROJ_SRCS)) \ + $(filter $(SRC_PATH_BARE)/vpx_dsp/%, $(VCPROJ_SRCS)) \ + $(filter-out $(addprefix $(SRC_PATH_BARE)/, \ + vp8/%.c vp8/%.h vp9/%.c vp9/%.h vpx/% vpx_dsp/%), \ + $(VCPROJ_SRCS)) \ --src-path-bare="$(SRC_PATH_BARE)" \ PROJECTS-yes += vpx.$(VCPROJ_SFX) @@ -233,8 +232,8 @@ OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 5 -SO_VERSION_MINOR := 0 +SO_VERSION_MAJOR := 6 +SO_VERSION_MINOR := 1 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib @@ -274,18 +273,6 @@ $(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm $(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR) $(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE) -libvpx.ver: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)echo "{ global:" > $@ - $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done - $(qexec)echo "local: *; };" >> $@ -CLEAN-OBJS += libvpx.ver - -libvpx.syms: $(call enabled,CODEC_EXPORTS) - @echo " [CREATE] $@" - $(qexec)awk '{print "_"$$2}' $^ >$@ -CLEAN-OBJS += 
libvpx.syms - libvpx.def: $(call enabled,CODEC_EXPORTS) @echo " [CREATE] $@" $(qexec)echo LIBRARY $(LIBVPX_SO:.dll=) INITINSTANCE TERMINSTANCE > $@ @@ -345,6 +332,18 @@ INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc CLEAN-OBJS += vpx.pc endif +libvpx.ver: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)echo "{ global:" > $@ + $(qexec)for f in $?; do awk '{print $$2";"}' < $$f >>$@; done + $(qexec)echo "local: *; };" >> $@ +CLEAN-OBJS += libvpx.ver + +libvpx.syms: $(call enabled,CODEC_EXPORTS) + @echo " [CREATE] $@" + $(qexec)awk '{print "_"$$2}' $^ >$@ +CLEAN-OBJS += libvpx.syms + # # Rule to make assembler configuration file from C configuration file # diff --git a/libs/libvpx/mainpage.dox b/libs/libvpx/mainpage.dox index ec202fa4fb5..4b0dff08710 100644 --- a/libs/libvpx/mainpage.dox +++ b/libs/libvpx/mainpage.dox @@ -25,8 +25,10 @@ release. - The \ref readme contains instructions on recompiling the sample applications. - Read the \ref usage "usage" for a narrative on codec usage. + \if samples - Read the \ref samples "sample code" for examples of how to interact with the codec. + \endif - \ref codec reference \if encoder - \ref encoder reference diff --git a/libs/libvpx/md5_utils.c b/libs/libvpx/md5_utils.c index 093798b8339..9ddb104c8a6 100644 --- a/libs/libvpx/md5_utils.c +++ b/libs/libvpx/md5_utils.c @@ -163,7 +163,7 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) { */ VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]) { - register UWORD32 a, b, c, d; + UWORD32 a, b, c, d; a = buf[0]; b = buf[1]; diff --git a/libs/libvpx/md5_utils.h b/libs/libvpx/md5_utils.h index bd4991b3ad9..e0d5a2d1fbd 100644 --- a/libs/libvpx/md5_utils.h +++ b/libs/libvpx/md5_utils.h @@ -20,8 +20,8 @@ * Still in the public domain. */ -#ifndef MD5_UTILS_H_ -#define MD5_UTILS_H_ +#ifndef VPX_MD5_UTILS_H_ +#define VPX_MD5_UTILS_H_ #ifdef __cplusplus extern "C" { @@ -46,4 +46,4 @@ void MD5Transform(UWORD32 buf[4], UWORD32 const in[16]); } // extern "C" #endif -#endif // MD5_UTILS_H_ +#endif // VPX_MD5_UTILS_H_ diff --git a/libs/libvpx/rate_hist.h b/libs/libvpx/rate_hist.h index 00a1676a617..d6a4c685195 100644 --- a/libs/libvpx/rate_hist.h +++ b/libs/libvpx/rate_hist.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef RATE_HIST_H_ -#define RATE_HIST_H_ +#ifndef VPX_RATE_HIST_H_ +#define VPX_RATE_HIST_H_ #include "vpx/vpx_encoder.h" @@ -37,4 +37,4 @@ void show_rate_histogram(struct rate_hist *hist, const vpx_codec_enc_cfg_t *cfg, } // extern "C" #endif -#endif // RATE_HIST_H_ +#endif // VPX_RATE_HIST_H_ diff --git a/libs/libvpx/test/acm_random.h b/libs/libvpx/test/acm_random.h index d915cf91336..ccfa20681aa 100644 --- a/libs/libvpx/test/acm_random.h +++ b/libs/libvpx/test/acm_random.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_ACM_RANDOM_H_ -#define TEST_ACM_RANDOM_H_ +#ifndef VPX_TEST_ACM_RANDOM_H_ +#define VPX_TEST_ACM_RANDOM_H_ #include @@ -34,6 +34,24 @@ class ACMRandom { return (value >> 15) & 0xffff; } + int32_t Rand20Signed(void) { + // Use 20 bits: values between 524287 and -524288. + const uint32_t value = random_.Generate(1048576); + return static_cast(value) - 524288; + } + + int16_t Rand16Signed(void) { + // Use 16 bits: values between 32767 and -32768. + const uint32_t value = random_.Generate(65536); + return static_cast(value) - 32768; + } + + int16_t Rand13Signed(void) { + // Use 13 bits: values between 4095 and -4096. 
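The Rand20Signed/Rand16Signed/Rand13Signed helpers being added here all follow one pattern: draw a uniform n-bit value and shift it down by 2^(n-1) to obtain a symmetric signed range. A minimal stand-alone sketch of that pattern, using std::mt19937 in place of libvpx's internal generator (TinyRandom and RandNSigned are invented names):

#include <cstdint>
#include <random>

class TinyRandom {
 public:
  explicit TinyRandom(uint32_t seed) : rng_(seed) {}
  // Uniform draw in [0, range), mirroring what Generate() provides above.
  uint32_t Generate(uint32_t range) {
    return std::uniform_int_distribution<uint32_t>(0, range - 1)(rng_);
  }
  // n-bit signed draw in [-(2^(n-1)), 2^(n-1) - 1]; with n = 13 this is
  // exactly the [-4096, 4095] range Rand13Signed documents.
  int32_t RandNSigned(int n) {
    return static_cast<int32_t>(Generate(1u << n)) - (1 << (n - 1));
  }

 private:
  std::mt19937 rng_;
};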
+ const uint32_t value = random_.Generate(8192); + return static_cast(value) - 4096; + } + int16_t Rand9Signed(void) { // Use 9 bits: values between 255 (0x0FF) and -256 (0x100). const uint32_t value = random_.Generate(512); @@ -73,4 +91,4 @@ class ACMRandom { } // namespace libvpx_test -#endif // TEST_ACM_RANDOM_H_ +#endif // VPX_TEST_ACM_RANDOM_H_ diff --git a/libs/libvpx/test/active_map_refresh_test.cc b/libs/libvpx/test/active_map_refresh_test.cc index d893635505c..a985ed4f119 100644 --- a/libs/libvpx/test/active_map_refresh_test.cc +++ b/libs/libvpx/test/active_map_refresh_test.cc @@ -74,7 +74,7 @@ class ActiveMapRefreshTest ::libvpx_test::Encoder *encoder) { ::libvpx_test::Y4mVideoSource *y4m_video = static_cast(video); - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, kAqModeCyclicRefresh); } else if (video->frame() >= 2 && video->img()) { diff --git a/libs/libvpx/test/active_map_test.cc b/libs/libvpx/test/active_map_test.cc index 1d24f956f59..03536c81ef4 100644 --- a/libs/libvpx/test/active_map_test.cc +++ b/libs/libvpx/test/active_map_test.cc @@ -35,7 +35,7 @@ class ActiveMapTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); } else if (video->frame() == 3) { vpx_active_map_t map = vpx_active_map_t(); diff --git a/libs/libvpx/test/add_noise_test.cc b/libs/libvpx/test/add_noise_test.cc index eae32c33bb8..0d1893c524a 100644 --- a/libs/libvpx/test/add_noise_test.cc +++ b/libs/libvpx/test/add_noise_test.cc @@ -8,8 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ #include +#include + #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/util.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -25,7 +28,10 @@ typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); -class AddNoiseTest : public ::testing::TestWithParam { +typedef std::tuple AddNoiseTestFPParam; + +class AddNoiseTest : public ::testing::Test, + public ::testing::WithParamInterface { public: virtual void TearDown() { libvpx_test::ClearSystemState(); } virtual ~AddNoiseTest() {} @@ -44,14 +50,14 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { const int height = 64; const int image_size = width * height; int8_t noise[kNoiseSize]; - const int clamp = vpx_setup_noise(4.4, noise, kNoiseSize); + const int clamp = vpx_setup_noise(GET_PARAM(0), noise, kNoiseSize); uint8_t *const s = reinterpret_cast(vpx_calloc(image_size, sizeof(*s))); ASSERT_TRUE(s != NULL); memset(s, 99, image_size * sizeof(*s)); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure we don't end up having either the same or no added // noise either vertically or horizontally. @@ -70,7 +76,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { memset(s, 255, image_size); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure don't roll over. 
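The reworked AddNoiseTest in this hunk shows a common googletest idiom: instead of parameterizing on a bare function pointer, the test takes a std::tuple so every kernel can be exercised at more than one noise level (3.25 and 4.4 below). A minimal sketch of the same tuple-parameter idiom, with invented names (UnaryFn, Twice, TupleParamTest):

#include <tuple>

#include "third_party/googletest/src/include/gtest/gtest.h"

typedef int (*UnaryFn)(int);
int Twice(int x) { return 2 * x; }

typedef std::tuple<int, UnaryFn> TupleParam;

class TupleParamTest : public ::testing::TestWithParam<TupleParam> {};

TEST_P(TupleParamTest, AppliesFunction) {
  const int input = std::get<0>(GetParam());
  UnaryFn fn = std::get<1>(GetParam());
  EXPECT_EQ(2 * input, fn(input));
}

// Each make_tuple() entry becomes one instantiation of the test.
INSTANTIATE_TEST_CASE_P(C, TupleParamTest,
                        ::testing::Values(std::make_tuple(3, &Twice),
                                          std::make_tuple(4, &Twice)));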
for (int i = 0; i < image_size; ++i) { @@ -81,7 +87,7 @@ TEST_P(AddNoiseTest, CheckNoiseAdded) { memset(s, 0, image_size); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); // Check to make sure don't roll under. for (int i = 0; i < image_size; ++i) { @@ -108,7 +114,7 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { srand(0); ASM_REGISTER_STATE_CHECK( - GetParam()(s, noise, clamp, clamp, width, height, width)); + GET_PARAM(1)(s, noise, clamp, clamp, width, height, width)); srand(0); ASM_REGISTER_STATE_CHECK( vpx_plane_add_noise_c(d, noise, clamp, clamp, width, height, width)); @@ -121,16 +127,24 @@ TEST_P(AddNoiseTest, CheckCvsAssembly) { vpx_free(s); } -INSTANTIATE_TEST_CASE_P(C, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_c)); +using std::make_tuple; + +INSTANTIATE_TEST_CASE_P( + C, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_c), + make_tuple(4.4, vpx_plane_add_noise_c))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_sse2)); +INSTANTIATE_TEST_CASE_P( + SSE2, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_sse2), + make_tuple(4.4, vpx_plane_add_noise_sse2))); #endif #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, AddNoiseTest, - ::testing::Values(vpx_plane_add_noise_msa)); +INSTANTIATE_TEST_CASE_P( + MSA, AddNoiseTest, + ::testing::Values(make_tuple(3.25, vpx_plane_add_noise_msa), + make_tuple(4.4, vpx_plane_add_noise_msa))); #endif } // namespace diff --git a/libs/libvpx/test/alt_ref_aq_segment_test.cc b/libs/libvpx/test/alt_ref_aq_segment_test.cc index 64a3011eb99..6e03a478525 100644 --- a/libs/libvpx/test/alt_ref_aq_segment_test.cc +++ b/libs/libvpx/test/alt_ref_aq_segment_test.cc @@ -32,7 +32,7 @@ class AltRefAqSegmentTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); diff --git a/libs/libvpx/test/altref_test.cc b/libs/libvpx/test/altref_test.cc index f9308c2717a..0119be4da0a 100644 --- a/libs/libvpx/test/altref_test.cc +++ b/libs/libvpx/test/altref_test.cc @@ -35,7 +35,7 @@ class AltRefTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_CPUUSED, 3); } diff --git a/libs/libvpx/test/android/README b/libs/libvpx/test/android/README index 4a1adcf7f48..ee21f9b652c 100644 --- a/libs/libvpx/test/android/README +++ b/libs/libvpx/test/android/README @@ -3,12 +3,12 @@ Android.mk will build vpx unittests on android. ./libvpx/configure --target=armv7-android-gcc --enable-external-build \ --enable-postproc --disable-install-srcs --enable-multi-res-encoding \ --enable-temporal-denoising --disable-unit-tests --disable-install-docs \ - --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK + --disable-examples --disable-runtime-cpu-detect 2) From the parent directory, invoke ndk-build: NDK_PROJECT_PATH=. 
ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \ APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \ - APP_STL=gnustl_static + APP_STL=c++_static Note: Both adb and ndk-build are available prebuilt at: https://chromium.googlesource.com/android_tools diff --git a/libs/libvpx/test/aq_segment_test.cc b/libs/libvpx/test/aq_segment_test.cc index 1c2147fbb2c..3c4053be7f3 100644 --- a/libs/libvpx/test/aq_segment_test.cc +++ b/libs/libvpx/test/aq_segment_test.cc @@ -31,7 +31,7 @@ class AqSegmentTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 100); diff --git a/libs/libvpx/test/avg_test.cc b/libs/libvpx/test/avg_test.cc index ad21198e4b4..3d24f1cdb6d 100644 --- a/libs/libvpx/test/avg_test.cc +++ b/libs/libvpx/test/avg_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -22,40 +23,43 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; namespace { + +template class AverageTestBase : public ::testing::Test { public: - AverageTestBase(int width, int height) : width_(width), height_(height) {} - - static void SetUpTestCase() { - source_data_ = reinterpret_cast( - vpx_memalign(kDataAlignment, kDataBlockSize)); - } + AverageTestBase(int width, int height) + : width_(width), height_(height), source_data_(NULL), source_stride_(0), + bit_depth_(8) {} - static void TearDownTestCase() { + virtual void TearDown() { vpx_free(source_data_); source_data_ = NULL; + libvpx_test::ClearSystemState(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 static const int kDataAlignment = 16; static const int kDataBlockSize = 64 * 128; virtual void SetUp() { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); source_stride_ = (width_ + 31) & ~31; + bit_depth_ = 8; rnd_.Reset(ACMRandom::DeterministicSeed()); } // Sum Pixels - static unsigned int ReferenceAverage8x8(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage8x8(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 8; ++h) { for (int w = 0; w < 8; ++w) average += source[h * pitch + w]; @@ -63,7 +67,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 32) >> 6); } - static unsigned int ReferenceAverage4x4(const uint8_t *source, int pitch) { + static unsigned int ReferenceAverage4x4(const Pixel *source, int pitch) { unsigned int average = 0; for (int h = 0; h < 4; ++h) { for (int w = 0; w < 4; ++w) average += source[h * pitch + w]; @@ -71,7 +75,7 @@ class AverageTestBase : public ::testing::Test { return ((average + 8) >> 4); } - void FillConstant(uint8_t fill_constant) { + void FillConstant(Pixel fill_constant) { for (int i = 0; i < width_ * height_; ++i) { source_data_[i] = fill_constant; } @@ -79,21 +83,22 @@ class AverageTestBase : public ::testing::Test { void FillRandom() { for (int i = 0; i < width_ * height_; ++i) { - source_data_[i] = rnd_.Rand8(); + source_data_[i] = rnd_.Rand16() & ((1 << 
bit_depth_) - 1); } } int width_, height_; - static uint8_t *source_data_; + Pixel *source_data_; int source_stride_; + int bit_depth_; ACMRandom rnd_; }; typedef unsigned int (*AverageFunction)(const uint8_t *s, int pitch); -typedef std::tr1::tuple AvgFunc; +typedef std::tuple AvgFunc; -class AverageTest : public AverageTestBase, +class AverageTest : public AverageTestBase, public ::testing::WithParamInterface { public: AverageTest() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} @@ -119,12 +124,40 @@ class AverageTest : public AverageTestBase, } }; +#if CONFIG_VP9_HIGHBITDEPTH +class AverageTestHBD : public AverageTestBase, + public ::testing::WithParamInterface { + public: + AverageTestHBD() : AverageTestBase(GET_PARAM(0), GET_PARAM(1)) {} + + protected: + void CheckAverages() { + const int block_size = GET_PARAM(3); + unsigned int expected = 0; + if (block_size == 8) { + expected = + ReferenceAverage8x8(source_data_ + GET_PARAM(2), source_stride_); + } else if (block_size == 4) { + expected = + ReferenceAverage4x4(source_data_ + GET_PARAM(2), source_stride_); + } + + ASM_REGISTER_STATE_CHECK(GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_)); + unsigned int actual = GET_PARAM(4)( + CONVERT_TO_BYTEPTR(source_data_ + GET_PARAM(2)), source_stride_); + + EXPECT_EQ(expected, actual); + } +}; +#endif // CONFIG_VP9_HIGHBITDEPTH + typedef void (*IntProRowFunc)(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height); -typedef std::tr1::tuple IntProRowParam; +typedef std::tuple IntProRowParam; -class IntProRowTest : public AverageTestBase, +class IntProRowTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProRowTest() @@ -135,6 +168,10 @@ class IntProRowTest : public AverageTestBase, protected: virtual void SetUp() { + source_data_ = reinterpret_cast( + vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0]))); + ASSERT_TRUE(source_data_ != NULL); + hbuf_asm_ = reinterpret_cast( vpx_memalign(kDataAlignment, sizeof(*hbuf_asm_) * 16)); hbuf_c_ = reinterpret_cast( @@ -142,6 +179,8 @@ class IntProRowTest : public AverageTestBase, } virtual void TearDown() { + vpx_free(source_data_); + source_data_ = NULL; vpx_free(hbuf_c_); hbuf_c_ = NULL; vpx_free(hbuf_asm_); @@ -164,9 +203,9 @@ class IntProRowTest : public AverageTestBase, typedef int16_t (*IntProColFunc)(uint8_t const *ref, const int width); -typedef std::tr1::tuple IntProColParam; +typedef std::tuple IntProColParam; -class IntProColTest : public AverageTestBase, +class IntProColTest : public AverageTestBase, public ::testing::WithParamInterface { public: IntProColTest() : AverageTestBase(GET_PARAM(0), 1), sum_asm_(0), sum_c_(0) { @@ -189,7 +228,7 @@ class IntProColTest : public AverageTestBase, }; typedef int (*SatdFunc)(const tran_low_t *coeffs, int length); -typedef std::tr1::tuple SatdTestParam; +typedef std::tuple SatdTestParam; class SatdTest : public ::testing::Test, public ::testing::WithParamInterface { @@ -212,12 +251,7 @@ class SatdTest : public ::testing::Test, for (int i = 0; i < satd_size_; ++i) src_[i] = val; } - void FillRandom() { - for (int i = 0; i < satd_size_; ++i) { - const int16_t tmp = rnd_.Rand16(); - src_[i] = (tran_low_t)tmp; - } - } + virtual void FillRandom() = 0; void Check(const int expected) { int total; @@ -225,17 +259,29 @@ class SatdTest : public ::testing::Test, EXPECT_EQ(expected, total); } + tran_low_t *GetCoeff() const { return src_; } + int satd_size_; + ACMRandom rnd_; + tran_low_t *src_; private: - 
tran_low_t *src_; SatdFunc satd_func_; - ACMRandom rnd_; +}; + +class SatdLowbdTest : public SatdTest { + protected: + virtual void FillRandom() { + for (int i = 0; i < satd_size_; ++i) { + const int16_t tmp = rnd_.Rand16Signed(); + src_[i] = (tran_low_t)tmp; + } + } }; typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -typedef std::tr1::tuple BlockErrorTestFPParam; +typedef std::tuple BlockErrorTestFPParam; class BlockErrorTestFP : public ::testing::Test, @@ -279,6 +325,10 @@ class BlockErrorTestFP EXPECT_EQ(expected, total); } + tran_low_t *GetCoeff() const { return coeff_; } + + tran_low_t *GetDQCoeff() const { return dqcoeff_; } + int txfm_size_; private: @@ -288,8 +338,6 @@ class BlockErrorTestFP ACMRandom rnd_; }; -uint8_t *AverageTestBase::source_data_ = NULL; - TEST_P(AverageTest, MinValue) { FillConstant(0); CheckAverages(); @@ -308,6 +356,27 @@ TEST_P(AverageTest, Random) { CheckAverages(); } } +#if CONFIG_VP9_HIGHBITDEPTH +TEST_P(AverageTestHBD, MinValue) { + FillConstant(0); + CheckAverages(); +} + +TEST_P(AverageTestHBD, MaxValue) { + FillConstant((1 << VPX_BITS_12) - 1); + CheckAverages(); +} + +TEST_P(AverageTestHBD, Random) { + bit_depth_ = VPX_BITS_12; + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + for (int i = 0; i < 1000; i++) { + FillRandom(); + CheckAverages(); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH TEST_P(IntProRowTest, MinValue) { FillConstant(0); @@ -339,27 +408,27 @@ TEST_P(IntProColTest, Random) { RunComparison(); } -TEST_P(SatdTest, MinValue) { +TEST_P(SatdLowbdTest, MinValue) { const int kMin = -32640; const int expected = -kMin * satd_size_; FillConstant(kMin); Check(expected); } -TEST_P(SatdTest, MaxValue) { +TEST_P(SatdLowbdTest, MaxValue) { const int kMax = 32640; const int expected = kMax * satd_size_; FillConstant(kMax); Check(expected); } -TEST_P(SatdTest, Random) { +TEST_P(SatdLowbdTest, Random) { int expected; switch (satd_size_) { - case 16: expected = 205298; break; - case 64: expected = 1113950; break; - case 256: expected = 4268415; break; - case 1024: expected = 16954082; break; + case 16: expected = 263252; break; + case 64: expected = 1105420; break; + case 256: expected = 4252250; break; + case 1024: expected = 16876840; break; default: FAIL() << "Invalid satd size (" << satd_size_ << ") valid: 16/64/256/1024"; @@ -368,11 +437,12 @@ TEST_P(SatdTest, Random) { Check(expected); } -TEST_P(SatdTest, DISABLED_Speed) { +TEST_P(SatdLowbdTest, DISABLED_Speed) { const int kCountSpeedTestBlock = 20000; vpx_usec_timer timer; - DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { @@ -383,6 +453,62 @@ TEST_P(SatdTest, DISABLED_Speed) { printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); } +#if CONFIG_VP9_HIGHBITDEPTH +class SatdHighbdTest : public SatdTest { + protected: + virtual void FillRandom() { + for (int i = 0; i < satd_size_; ++i) { + src_[i] = rnd_.Rand20Signed(); + } + } +}; + +TEST_P(SatdHighbdTest, MinValue) { + const int kMin = -524280; + const int expected = -kMin * satd_size_; + FillConstant(kMin); + Check(expected); +} + +TEST_P(SatdHighbdTest, MaxValue) { + const int kMax = 524280; + const int expected = kMax * satd_size_; + FillConstant(kMax); + Check(expected); +} + +TEST_P(SatdHighbdTest, Random) { + int expected; + switch (satd_size_) { + case 16: 
expected = 5249712; break; + case 64: expected = 18362120; break; + case 256: expected = 66100520; break; + case 1024: expected = 266094734; break; + default: + FAIL() << "Invalid satd size (" << satd_size_ + << ") valid: 16/64/256/1024"; + } + FillRandom(); + Check(expected); +} + +TEST_P(SatdHighbdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + TEST_P(BlockErrorTestFP, MinValue) { const int64_t kMin = -32640; const int64_t expected = kMin * kMin * txfm_size_; @@ -415,9 +541,10 @@ TEST_P(BlockErrorTestFP, Random) { TEST_P(BlockErrorTestFP, DISABLED_Speed) { const int kCountSpeedTestBlock = 20000; vpx_usec_timer timer; - DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]); const int blocksize = GET_PARAM(0); + FillRandom(); + tran_low_t *coeff = GetCoeff(); + tran_low_t *dqcoeff = GetDQCoeff(); vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { @@ -428,14 +555,34 @@ TEST_P(BlockErrorTestFP, DISABLED_Speed) { printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); } -using std::tr1::make_tuple; +using std::make_tuple; INSTANTIATE_TEST_CASE_P( C, AverageTest, ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_avg_8x8_c), make_tuple(16, 16, 1, 4, &vpx_avg_4x4_c))); -INSTANTIATE_TEST_CASE_P(C, SatdTest, +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_c), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_c))); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, AverageTestHBD, + ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_sse2), + make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2))); +#endif // HAVE_SSE2 + +INSTANTIATE_TEST_CASE_P(C, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_satd_c), + make_tuple(64, &vpx_satd_c), + make_tuple(256, &vpx_satd_c), + make_tuple(1024, &vpx_satd_c))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +INSTANTIATE_TEST_CASE_P(C, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_c), make_tuple(64, &vpx_satd_c), make_tuple(256, &vpx_satd_c), @@ -472,7 +619,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(64, &vpx_int_pro_col_sse2, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(SSE2, SatdTest, +INSTANTIATE_TEST_CASE_P(SSE2, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_sse2), make_tuple(64, &vpx_satd_sse2), make_tuple(256, &vpx_satd_sse2), @@ -487,12 +634,21 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, +INSTANTIATE_TEST_CASE_P(AVX2, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_avx2), make_tuple(64, &vpx_satd_avx2), make_tuple(256, &vpx_satd_avx2), make_tuple(1024, &vpx_satd_avx2))); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX2, SatdHighbdTest, + ::testing::Values(make_tuple(16, &vpx_highbd_satd_avx2), + make_tuple(64, &vpx_highbd_satd_avx2), + make_tuple(256, &vpx_highbd_satd_avx2), + make_tuple(1024, &vpx_highbd_satd_avx2))); +#endif // CONFIG_VP9_HIGHBITDEPTH + INSTANTIATE_TEST_CASE_P( AVX2, BlockErrorTestFP, 
::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), @@ -525,7 +681,7 @@ INSTANTIATE_TEST_CASE_P( make_tuple(64, &vpx_int_pro_col_neon, &vpx_int_pro_col_c))); -INSTANTIATE_TEST_CASE_P(NEON, SatdTest, +INSTANTIATE_TEST_CASE_P(NEON, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_neon), make_tuple(64, &vpx_satd_neon), make_tuple(256, &vpx_satd_neon), @@ -570,7 +726,7 @@ INSTANTIATE_TEST_CASE_P( // TODO(jingning): Remove the highbitdepth flag once the SIMD functions are // in place. #if !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P(MSA, SatdTest, +INSTANTIATE_TEST_CASE_P(MSA, SatdLowbdTest, ::testing::Values(make_tuple(16, &vpx_satd_msa), make_tuple(64, &vpx_satd_msa), make_tuple(256, &vpx_satd_msa), diff --git a/libs/libvpx/test/bench.cc b/libs/libvpx/test/bench.cc new file mode 100644 index 00000000000..4b883d8250e --- /dev/null +++ b/libs/libvpx/test/bench.cc @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include + +#include "test/bench.h" +#include "vpx_ports/vpx_timer.h" + +void AbstractBench::RunNTimes(int n) { + for (int r = 0; r < VPX_BENCH_ROBUST_ITER; r++) { + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < n; ++j) { + Run(); + } + vpx_usec_timer_mark(&timer); + times_[r] = static_cast(vpx_usec_timer_elapsed(&timer)); + } +} + +void AbstractBench::PrintMedian(const char *title) { + std::sort(times_, times_ + VPX_BENCH_ROBUST_ITER); + const int med = times_[VPX_BENCH_ROBUST_ITER >> 1]; + int sad = 0; + for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++) { + sad += abs(times_[t] - med); + } + printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0, + sad / (VPX_BENCH_ROBUST_ITER * 1000.0)); +} diff --git a/libs/libvpx/test/bench.h b/libs/libvpx/test/bench.h new file mode 100644 index 00000000000..57ca9118bac --- /dev/null +++ b/libs/libvpx/test/bench.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_BENCH_H_ +#define VPX_TEST_BENCH_H_ + +// Number of iterations used to compute median run time. +#define VPX_BENCH_ROBUST_ITER 15 + +class AbstractBench { + public: + void RunNTimes(int n); + void PrintMedian(const char *title); + + protected: + // Implement this method and put the code to benchmark in it. 
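The new AbstractBench helper is meant to be subclassed: the kernel under test goes in Run(), RunNTimes() times VPX_BENCH_ROBUST_ITER batches of calls, and PrintMedian() reports the median batch time with its mean absolute deviation. A hypothetical subclass to illustrate the intended use (MemsetBench is an invented example, not part of the patch):

#include <cstring>

#include "test/bench.h"

class MemsetBench : public AbstractBench {
 public:
  explicit MemsetBench(int size) : size_(size), buf_(new char[size]) {}
  virtual ~MemsetBench() { delete[] buf_; }

 protected:
  // The code being benchmarked.
  virtual void Run() { memset(buf_, 0, size_); }

 private:
  int size_;
  char *buf_;
};

// Usage:
//   MemsetBench bench(1 << 20);
//   bench.RunNTimes(100);             // 100 Run() calls per timed batch
//   bench.PrintMedian("memset 1MB");  // median over all timed batches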
+ virtual void Run() = 0; + + private: + int times_[VPX_BENCH_ROBUST_ITER]; +}; + +#endif // VPX_TEST_BENCH_H_ diff --git a/libs/libvpx/test/blockiness_test.cc b/libs/libvpx/test/blockiness_test.cc index 2fa10192f15..ced6e66c620 100644 --- a/libs/libvpx/test/blockiness_test.cc +++ b/libs/libvpx/test/blockiness_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -25,10 +26,7 @@ #include "test/util.h" #include "vpx_mem/vpx_mem.h" - -extern "C" double vp9_get_blockiness(const unsigned char *img1, int img1_pitch, - const unsigned char *img2, int img2_pitch, - int width, int height); +#include "vp9/encoder/vp9_blockiness.h" using libvpx_test::ACMRandom; @@ -141,7 +139,7 @@ class BlockinessTestBase : public ::testing::Test { }; #if CONFIG_VP9_ENCODER -typedef std::tr1::tuple BlockinessParam; +typedef std::tuple BlockinessParam; class BlockinessVP9Test : public BlockinessTestBase, public ::testing::WithParamInterface { @@ -208,15 +206,15 @@ TEST_P(BlockinessVP9Test, WorstCaseBlockiness) { } #endif // CONFIG_VP9_ENCODER -using std::tr1::make_tuple; +using std::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP9_ENCODER -const BlockinessParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), -}; +const BlockinessParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, BlockinessVP9Test, ::testing::ValuesIn(c_vp9_tests)); #endif diff --git a/libs/libvpx/test/borders_test.cc b/libs/libvpx/test/borders_test.cc index e66ff02e25e..b91a15b8003 100644 --- a/libs/libvpx/test/borders_test.cc +++ b/libs/libvpx/test/borders_test.cc @@ -31,7 +31,7 @@ class BordersTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 1); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); diff --git a/libs/libvpx/test/buffer.h b/libs/libvpx/test/buffer.h index 2175dad9d90..b003d2f0d02 100644 --- a/libs/libvpx/test/buffer.h +++ b/libs/libvpx/test/buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_BUFFER_H_ -#define TEST_BUFFER_H_ +#ifndef VPX_TEST_BUFFER_H_ +#define VPX_TEST_BUFFER_H_ #include @@ -379,4 +379,4 @@ bool Buffer::BufferSizesMatch(const Buffer &a) const { return true; } } // namespace libvpx_test -#endif // TEST_BUFFER_H_ +#endif // VPX_TEST_BUFFER_H_ diff --git a/libs/libvpx/test/byte_alignment_test.cc b/libs/libvpx/test/byte_alignment_test.cc index 5a058b27561..0ef6c4c5192 100644 --- a/libs/libvpx/test/byte_alignment_test.cc +++ b/libs/libvpx/test/byte_alignment_test.cc @@ -171,8 +171,9 @@ TEST_F(ByteAlignmentTest, SwitchByteAlignment) { TEST_P(ByteAlignmentTest, TestAlignment) { const ByteAlignmentTestParam t = GetParam(); SetByteAlignment(t.byte_alignment, t.expected_value); - if (t.decode_remaining) + if (t.decode_remaining) { ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames(t.byte_alignment)); + } } INSTANTIATE_TEST_CASE_P(Alignments, ByteAlignmentTest, diff --git a/libs/libvpx/test/clear_system_state.h b/libs/libvpx/test/clear_system_state.h index 044a5c75834..ba3c0b386a1 100644 --- a/libs/libvpx/test/clear_system_state.h +++ b/libs/libvpx/test/clear_system_state.h @@ -7,23 +7,17 @@ * in the file PATENTS. 
All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_CLEAR_SYSTEM_STATE_H_ -#define TEST_CLEAR_SYSTEM_STATE_H_ +#ifndef VPX_TEST_CLEAR_SYSTEM_STATE_H_ +#define VPX_TEST_CLEAR_SYSTEM_STATE_H_ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif +#include "vpx_ports/system_state.h" namespace libvpx_test { // Reset system to a known state. This function should be used for all non-API // test cases. -inline void ClearSystemState() { -#if ARCH_X86 || ARCH_X86_64 - vpx_reset_mmx_state(); -#endif -} +inline void ClearSystemState() { vpx_clear_system_state(); } } // namespace libvpx_test -#endif // TEST_CLEAR_SYSTEM_STATE_H_ +#endif // VPX_TEST_CLEAR_SYSTEM_STATE_H_ diff --git a/libs/libvpx/test/codec_factory.h b/libs/libvpx/test/codec_factory.h index d5882ed9c8b..17c9512ca8b 100644 --- a/libs/libvpx/test/codec_factory.h +++ b/libs/libvpx/test/codec_factory.h @@ -7,8 +7,10 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_CODEC_FACTORY_H_ -#define TEST_CODEC_FACTORY_H_ +#ifndef VPX_TEST_CODEC_FACTORY_H_ +#define VPX_TEST_CODEC_FACTORY_H_ + +#include #include "./vpx_config.h" #include "vpx/vpx_decoder.h" @@ -53,23 +55,22 @@ class CodecFactory { template class CodecTestWithParam : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith2Params : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith3Params : public ::testing::TestWithParam< - std::tr1::tuple > {}; + std::tuple > {}; template class CodecTestWith4Params : public ::testing::TestWithParam< - std::tr1::tuple > { -}; + std::tuple > {}; /* * VP8 Codec Definitions @@ -264,4 +265,4 @@ const libvpx_test::VP9CodecFactory kVP9; #endif // CONFIG_VP9 } // namespace libvpx_test -#endif // TEST_CODEC_FACTORY_H_ +#endif // VPX_TEST_CODEC_FACTORY_H_ diff --git a/libs/libvpx/test/comp_avg_pred_test.cc b/libs/libvpx/test/comp_avg_pred_test.cc index 110e0658365..56e701e09cb 100644 --- a/libs/libvpx/test/comp_avg_pred_test.cc +++ b/libs/libvpx/test/comp_avg_pred_test.cc @@ -29,6 +29,10 @@ uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } void reference_pred(const Buffer &pred, const Buffer &ref, int width, int height, Buffer *avg) { + ASSERT_TRUE(avg->TopLeftPixel() != NULL); + ASSERT_TRUE(pred.TopLeftPixel() != NULL); + ASSERT_TRUE(ref.TopLeftPixel() != NULL); + for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { avg->TopLeftPixel()[y * avg->stride() + x] = diff --git a/libs/libvpx/test/consistency_test.cc b/libs/libvpx/test/consistency_test.cc index 37b4a45e541..875b06f4aa1 100644 --- a/libs/libvpx/test/consistency_test.cc +++ b/libs/libvpx/test/consistency_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -127,7 +128,7 @@ class ConsistencyTestBase : public ::testing::Test { }; #if CONFIG_VP9_ENCODER -typedef std::tr1::tuple ConsistencyParam; +typedef std::tuple ConsistencyParam; class ConsistencyVP9Test : public ConsistencyTestBase, public ::testing::WithParamInterface { @@ -198,15 +199,15 @@ TEST_P(ConsistencyVP9Test, ConsistencyIsZero) { } #endif // CONFIG_VP9_ENCODER -using std::tr1::make_tuple; +using std::make_tuple; //------------------------------------------------------------------------------ // C functions #if CONFIG_VP9_ENCODER -const 
ConsistencyParam c_vp9_tests[] = { - make_tuple(320, 240), make_tuple(318, 242), make_tuple(318, 238), -}; +const ConsistencyParam c_vp9_tests[] = { make_tuple(320, 240), + make_tuple(318, 242), + make_tuple(318, 238) }; INSTANTIATE_TEST_CASE_P(C, ConsistencyVP9Test, ::testing::ValuesIn(c_vp9_tests)); #endif diff --git a/libs/libvpx/test/convolve_test.cc b/libs/libvpx/test/convolve_test.cc index 70f0b11a770..47589a9f2e0 100644 --- a/libs/libvpx/test/convolve_test.cc +++ b/libs/libvpx/test/convolve_test.cc @@ -9,6 +9,7 @@ */ #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -77,7 +78,7 @@ struct ConvolveFunctions { int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth. }; -typedef std::tr1::tuple ConvolveParam; +typedef std::tuple ConvolveParam; #define ALL_SIZES(convolve_fn) \ make_tuple(4, 4, &convolve_fn), make_tuple(8, 4, &convolve_fn), \ @@ -114,6 +115,7 @@ void filter_block2d_8_c(const uint8_t *src_ptr, const unsigned int src_stride, // and filter_max_width = 16 // uint8_t intermediate_buffer[71 * kMaxDimension]; + vp9_zero(intermediate_buffer); const int intermediate_next_stride = 1 - static_cast(intermediate_height * output_width); @@ -213,6 +215,8 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr, const int intermediate_next_stride = 1 - static_cast(intermediate_height * output_width); + vp9_zero(intermediate_buffer); + // Horizontal pass (src -> transposed intermediate). { uint16_t *output_ptr = intermediate_buffer; @@ -412,8 +416,14 @@ class ConvolveTest : public ::testing::TestWithParam { for (int i = 0; i < kOutputBufferSize; ++i) { if (IsIndexInBorder(i)) { output_[i] = 255; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = mask_; +#endif } else { output_[i] = 0; +#if CONFIG_VP9_HIGHBITDEPTH + output16_[i] = 0; +#endif } } @@ -450,7 +460,9 @@ class ConvolveTest : public ::testing::TestWithParam { void CheckGuardBlocks() { for (int i = 0; i < kOutputBufferSize; ++i) { - if (IsIndexInBorder(i)) EXPECT_EQ(255, output_[i]); + if (IsIndexInBorder(i)) { + EXPECT_EQ(255, output_[i]); + } } } @@ -672,6 +684,74 @@ TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); } +TEST_P(ConvolveTest, DISABLED_4Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_4Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const fourtap = vp9_filter_kernels[FOURTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, fourtap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("convolve4_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { const uint8_t *const in = input(); uint8_t *const out = output(); @@ -787,7 +867,7 @@ TEST_P(ConvolveTest, Copy2D) { } } -const int kNumFilterBanks = 4; +const int kNumFilterBanks = 5; const int kNumFilters = 16; TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { @@ -1040,7 +1120,7 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { } #endif -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH #define WRAP(func, bd) \ @@ -1183,9 +1263,9 @@ const ConvolveFunctions convolve12_c( wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12, 12); -const ConvolveParam kArrayConvolve_c[] = { - ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c) -}; +const ConvolveParam kArrayConvolve_c[] = { ALL_SIZES(convolve8_c), + ALL_SIZES(convolve10_c), + ALL_SIZES(convolve12_c) }; #else const ConvolveFunctions convolve8_c( @@ -1377,4 +1457,16 @@ const ConvolveParam kArrayConvolve_vsx[] = { ALL_SIZES(convolve8_vsx) }; INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, ::testing::ValuesIn(kArrayConvolve_vsx)); #endif // HAVE_VSX + +#if HAVE_MMI +const ConvolveFunctions convolve8_mmi( + vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi, + vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi, + vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi, + vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, + vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); +const ConvolveParam kArrayConvolve_mmi[] = { ALL_SIZES(convolve8_mmi) }; +INSTANTIATE_TEST_CASE_P(MMI, ConvolveTest, + ::testing::ValuesIn(kArrayConvolve_mmi)); +#endif // HAVE_MMI } // namespace diff --git a/libs/libvpx/test/cpu_speed_test.cc b/libs/libvpx/test/cpu_speed_test.cc index 404b5b44f4f..2fb5c10eae1 100644 --- a/libs/libvpx/test/cpu_speed_test.cc +++ b/libs/libvpx/test/cpu_speed_test.cc @@ -44,7 +44,7 @@ class CpuSpeedTest 
virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -152,5 +152,5 @@ VP9_INSTANTIATE_TEST_CASE(CpuSpeedTest, ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood, ::libvpx_test::kRealTime), - ::testing::Range(0, 9)); + ::testing::Range(0, 10)); } // namespace diff --git a/libs/libvpx/test/cq_test.cc b/libs/libvpx/test/cq_test.cc index 20e1f0f3def..474b9d0fa2e 100644 --- a/libs/libvpx/test/cq_test.cc +++ b/libs/libvpx/test/cq_test.cc @@ -65,7 +65,7 @@ class CQTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { if (cfg_.rc_end_usage == VPX_CQ) { encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_); } diff --git a/libs/libvpx/test/datarate_test.cc b/libs/libvpx/test/datarate_test.cc deleted file mode 100644 index 31a8523d219..00000000000 --- a/libs/libvpx/test/datarate_test.cc +++ /dev/null @@ -1,1876 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "./vpx_config.h" -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "test/codec_factory.h" -#include "test/encode_test_driver.h" -#include "test/i420_video_source.h" -#include "test/util.h" -#include "test/y4m_video_source.h" -#include "vpx/vpx_codec.h" - -namespace { - -class DatarateTestLarge - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - - virtual ~DatarateTestLarge() {} - - protected: - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - set_cpu_used_ = GET_PARAM(2); - ResetModel(); - } - - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; - duration_ = 0.0; - denoiser_offon_test_ = 0; - denoiser_offon_period_ = -1; - gf_boost_ = 0; - use_roi_ = 0; - } - - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) { - encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); - encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); - encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); - } - -#if CONFIG_VP8_ENCODER - if (use_roi_ == 1) { - encoder->Control(VP8E_SET_ROI_MAP, &roi_); - } -#endif - - if (denoiser_offon_test_) { - ASSERT_GT(denoiser_offon_period_, 0) - << "denoiser_offon_period_ is not positive."; - if ((video->frame() + 1) % denoiser_offon_period_ == 0) { - // Flip denoiser_on_ periodically - denoiser_on_ ^= 1; - } - encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); - } - - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - // Time since last timestamp = duration. 
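For context on the buffer arithmetic in the hook being deleted here: the datarate tests maintain a CBR leaky-bucket model, crediting the buffer with the bits a constant-bitrate server would deliver over a frame's duration and debiting each encoded frame's bits; a negative buffer is an underrun and fails the test. A compact sketch of that model (LeakyBucket and its members are illustrative names, not libvpx API):

#include <cstdint>

struct LeakyBucket {
  LeakyBucket() : bits(0) {}
  int64_t bits;  // bits currently available in the buffer

  // Credit: duration is in timestamp ticks, timebase in seconds per tick,
  // bitrate in kbit/s, so the product is the bits a CBR server delivers.
  void AddServerBits(int64_t duration, double timebase, int bitrate_kbps) {
    bits += static_cast<int64_t>(duration * timebase * bitrate_kbps * 1000);
  }
  // Debit: a played-back frame drains its own size from the buffer.
  void PlayFrame(size_t frame_size_bytes) {
    bits -= static_cast<int64_t>(frame_size_bytes) * 8;
  }
  bool Underrun() const { return bits < 0; }
};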
- vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - - // TODO(jimbankoski): Remove these lines when the issue: - // http://code.google.com/p/webm/issues/detail?id=496 is fixed. - // For now the codec assumes buffer starts at starting buffer rate - // plus one frame's time. - if (last_pts_ == 0) duration = 1; - - // Add to the buffer the bits we'd expect from a constant bitrate server. - bits_in_buffer_model_ += static_cast( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); - - /* Test the buffer model here before subtracting the frame. Do so because - * the way the leaky bucket model works in libvpx is to allow the buffer to - * empty - and then stop showing frames until we've got enough bits to - * show one. As noted in comment below (issue 495), this does not currently - * apply to key frames. For now exclude key frames in condition below. */ - const bool key_frame = - (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) - << "Buffer Underrun at frame " << pkt->data.frame.pts; - } - - const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; - - // Subtract from the buffer the bits associated with a played back frame. - bits_in_buffer_model_ -= frame_size_in_bits; - - // Update the running total of bits for end of test datarate checks. - bits_total_ += frame_size_in_bits; - - // If first drop not set and we have a drop set it to this time. - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - - // Update the most recent pts. - last_pts_ = pkt->data.frame.pts; - - // We update this so that we can calculate the datarate minus the last - // frame encoded in the file. - bits_in_last_frame_ = frame_size_in_bits; - - ++frame_number_; - } - - virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - - duration_ = (last_pts_ + 1) * timebase_; - - // Effective file datarate includes the time spent prebuffering. - effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 / - (cfg_.rc_buf_initial_sz / 1000.0 + duration_); - - file_datarate_ = file_size_in_kb / duration_; - } - } - - vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; - double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; - double duration_; - double file_datarate_; - double effective_datarate_; - int64_t bits_in_last_frame_; - int denoiser_on_; - int denoiser_offon_test_; - int denoiser_offon_period_; - int set_cpu_used_; - int gf_boost_; - int use_roi_; - vpx_roi_map_t roi_; -}; - -#if CONFIG_TEMPORAL_DENOISING -// Check basic datarate targeting, for a single bitrate, but loop over the -// various denoiser settings. -TEST_P(DatarateTestLarge, DenoiserLevels) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int j = 1; j < 5; ++j) { - // Run over the denoiser levels. - // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j - // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, - // denoiserOnAggressive, and denoiserOnAdaptive. 
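All of these removed tests share one acceptance band: the effective datarate may overshoot the target by at most about 5% (target >= effective * 0.95), and the file datarate may undershoot it by at most about 40% (target <= file * 1.4). Restated as a plain predicate (the function name is invented):

// Returns true when a run would pass the removed tests' datarate asserts.
bool DatarateWithinBand(int target_kbps, double effective_kbps,
                        double file_kbps) {
  // ASSERT_GE(target, effective * 0.95): limits overshoot to ~5%.
  // ASSERT_LE(target, file * 1.4): limits undershoot (file >= target / 1.4).
  return target_kbps >= effective_kbps * 0.95 &&
         target_kbps <= file_kbps * 1.4;
}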
- denoiser_on_ = j; - cfg_.rc_target_bitrate = 300; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestLarge, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // CONFIG_TEMPORAL_DENOISING - -TEST_P(DatarateTestLarge, BasicBufferModel) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // 2 pass cbr datarate control has a bug hidden by the small # of - // frames selected in this encode. The problem is that even if the buffer is - // negative we produce a keyframe on a cutscene. Ignoring datarate - // constraints - // TODO(jimbankoski): ( Fix when issue - // http://code.google.com/p/webm/issues/detail?id=495 is addressed. ) - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // There is an issue for low bitrates in real-time mode, where the - // effective_datarate slightly overshoots the target bitrate. - // This is same the issue as noted about (#495). - // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), - // when the issue is resolved. - for (int i = 100; i < 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_max_quantizer = 36; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.kf_mode = VPX_KF_DISABLED; - - const int frame_count = 40; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, frame_count); - - // Here we check that the first dropped frame gets earlier and earlier - // as the drop frame threshold is increased. 
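The ChangingDropFrameThresh tests encode the clip repeatedly with a rising rc_dropframe_thresh and assert that the first dropped frame never moves later, i.e. the sequence of first-drop timestamps is non-increasing. The invariant in isolation (a sketch with invented names):

#include <cassert>
#include <vector>

// first_drop_per_thresh[k] is the first dropped frame observed at the k-th
// (increasing) drop threshold; frame_count is the "no drop yet" sentinel.
void CheckFirstDropMonotonic(const std::vector<int> &first_drop_per_thresh,
                             int frame_count) {
  int last_drop = frame_count;
  for (size_t k = 0; k < first_drop_per_thresh.size(); ++k) {
    assert(first_drop_per_thresh[k] <= last_drop);
    last_drop = first_drop_per_thresh[k];
  }
}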
- - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = frame_count; - for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - } -} - -TEST_P(DatarateTestLarge, DropFramesMultiThreads) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -class DatarateTestRealTime : public DatarateTestLarge { - public: - virtual ~DatarateTestRealTime() {} -}; - -#if CONFIG_TEMPORAL_DENOISING -// Check basic datarate targeting, for a single bitrate, but loop over the -// various denoiser settings. -TEST_P(DatarateTestRealTime, DenoiserLevels) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - for (int j = 1; j < 5; ++j) { - // Run over the denoiser levels. - // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j - // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, - // denoiserOnAggressive, and denoiserOnAdaptive. - denoiser_on_ = j; - cfg_.rc_target_bitrate = 300; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestRealTime, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. - denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} -#endif // CONFIG_TEMPORAL_DENOISING - -TEST_P(DatarateTestRealTime, BasicBufferModel) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // 2 pass cbr datarate control has a bug hidden by the small # of - // frames selected in this encode. 
The problem is that even if the buffer is - // negative we produce a keyframe on a cutscene, ignoring datarate - // constraints - // TODO(jimbankoski): Fix when issue - // http://bugs.chromium.org/p/webm/issues/detail?id=495 is addressed. - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // There is an issue for low bitrates in real-time mode, where the - // effective_datarate slightly overshoots the target bitrate. - // This is same the issue as noted above (#495). - // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), - // when the issue is resolved. - for (int i = 100; i <= 700; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - } -} - -TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_max_quantizer = 36; - cfg_.rc_end_usage = VPX_CBR; - cfg_.rc_target_bitrate = 200; - cfg_.kf_mode = VPX_KF_DISABLED; - - const int frame_count = 40; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, frame_count); - - // Check that the first dropped frame gets earlier and earlier - // as the drop frame threshold is increased. - - const int kDropFrameThreshTestStep = 30; - vpx_codec_pts_t last_drop = frame_count; - for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { - cfg_.rc_dropframe_thresh = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_LE(first_drop_, last_drop) - << " The first dropped frame for drop_thresh " << i - << " > first dropped frame for drop_thresh " - << i - kDropFrameThreshTestStep; - last_drop = first_drop_; - } -} - -TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 30; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // Encode using multiple threads. - cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - cfg_.rc_target_bitrate = 200; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -TEST_P(DatarateTestRealTime, RegionOfInterest) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 0; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - // Encode using multiple threads. 
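The knobs these tests toggle (threads, rate control mode, drop threshold) are all plain fields of vpx_codec_enc_cfg_t. A minimal sketch of setting up the same kind of multi-threaded CBR encode through the public API, with error handling elided (the helper name is invented):

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

void ConfigureCbrTwoThreads(vpx_codec_ctx_t *ctx, unsigned int w,
                            unsigned int h) {
  vpx_codec_enc_cfg_t cfg;
  vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0);
  cfg.g_w = w;
  cfg.g_h = h;
  cfg.g_threads = 2;             // encode using multiple threads
  cfg.rc_end_usage = VPX_CBR;    // constant-bitrate rate control
  cfg.rc_target_bitrate = 200;   // kbit/s
  cfg.rc_dropframe_thresh = 30;  // permit frame dropping under pressure
  vpx_codec_enc_init(ctx, vpx_codec_vp8_cx(), &cfg, 0);
}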
- cfg_.g_threads = 2; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - cfg_.rc_target_bitrate = 450; - cfg_.g_w = 352; - cfg_.g_h = 288; - - ResetModel(); - - // Set ROI parameters - use_roi_ = 1; - memset(&roi_, 0, sizeof(roi_)); - - roi_.rows = (cfg_.g_h + 15) / 16; - roi_.cols = (cfg_.g_w + 15) / 16; - - roi_.delta_q[0] = 0; - roi_.delta_q[1] = -20; - roi_.delta_q[2] = 0; - roi_.delta_q[3] = 0; - - roi_.delta_lf[0] = 0; - roi_.delta_lf[1] = -20; - roi_.delta_lf[2] = 0; - roi_.delta_lf[3] = 0; - - roi_.static_threshold[0] = 0; - roi_.static_threshold[1] = 1000; - roi_.static_threshold[2] = 0; - roi_.static_threshold[3] = 0; - - // Use 2 states: 1 is center square, 0 is the rest. - roi_.roi_map = - (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); - for (unsigned int i = 0; i < roi_.rows; ++i) { - for (unsigned int j = 0; j < roi_.cols; ++j) { - if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && - j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { - roi_.roi_map[i * roi_.cols + j] = 1; - } - } - } - - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; - - free(roi_.roi_map); -} - -TEST_P(DatarateTestRealTime, GFBoost) { - denoiser_on_ = 0; - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_dropframe_thresh = 0; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_error_resilient = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - cfg_.rc_target_bitrate = 300; - ResetModel(); - // Apply a gf boost. - gf_boost_ = 50; - - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) - << " The datarate for the file exceeds the target!"; - - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) - << " The datarate for the file missed the target!"; -} - -class DatarateTestVP9Large - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateTestVP9Large() : EncoderTest(GET_PARAM(0)) {} - - protected: - virtual ~DatarateTestVP9Large() {} - - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - set_cpu_used_ = GET_PARAM(2); - ResetModel(); - } - - virtual void ResetModel() { - last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - tot_frame_number_ = 0; - first_drop_ = 0; - num_drops_ = 0; - // Denoiser is off by default. - denoiser_on_ = 0; - // For testing up to 3 layers. - for (int i = 0; i < 3; ++i) { - bits_total_[i] = 0; - } - denoiser_offon_test_ = 0; - denoiser_offon_period_ = -1; - frame_parallel_decoding_mode_ = 1; - } - - // - // Frame flags and layer id for temporal layers. - // - - // For two layers, test pattern is: - // 1 3 - // 0 2 ..... - // For three layers, test pattern is: - // 1 3 5 7 - // 2 6 - // 0 4 .... - // LAST is always update on base/layer 0, GOLDEN is updated on layer 1. - // For this 3 layer example, the 2nd enhancement layer (layer 2) updates - // the altref frame. - int SetFrameFlags(int frame_num, int num_temp_layers) { - int frame_flags = 0; - if (num_temp_layers == 2) { - if (frame_num % 2 == 0) { - // Layer 0: predict from L and ARF, update L. 
-        frame_flags =
-            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
-      } else {
-        // Layer 1: predict from L, G and ARF, and update G.
-        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
-                      VP8_EFLAG_NO_UPD_ENTROPY;
-      }
-    } else if (num_temp_layers == 3) {
-      if (frame_num % 4 == 0) {
-        // Layer 0: predict from L and ARF; update L.
-        frame_flags =
-            VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
-      } else if ((frame_num - 2) % 4 == 0) {
-        // Layer 1: predict from L, G, ARF; update G.
-        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST;
-      } else if ((frame_num - 1) % 2 == 0) {
-        // Layer 2: predict from L, G, ARF; update ARF.
-        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
-      }
-    }
-    return frame_flags;
-  }
-
-  int SetLayerId(int frame_num, int num_temp_layers) {
-    int layer_id = 0;
-    if (num_temp_layers == 2) {
-      if (frame_num % 2 == 0) {
-        layer_id = 0;
-      } else {
-        layer_id = 1;
-      }
-    } else if (num_temp_layers == 3) {
-      if (frame_num % 4 == 0) {
-        layer_id = 0;
-      } else if ((frame_num - 2) % 4 == 0) {
-        layer_id = 1;
-      } else if ((frame_num - 1) % 2 == 0) {
-        layer_id = 2;
-      }
-    }
-    return layer_id;
-  }
-
-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
-
-    if (denoiser_offon_test_) {
-      ASSERT_GT(denoiser_offon_period_, 0)
-          << "denoiser_offon_period_ is not positive.";
-      if ((video->frame() + 1) % denoiser_offon_period_ == 0) {
-        // Flip denoiser_on_ periodically.
-        denoiser_on_ ^= 1;
-      }
-    }
-
-    encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
-    encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1));
-    encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING,
-                     frame_parallel_decoding_mode_);
-
-    if (cfg_.ts_number_layers > 1) {
-      if (video->frame() == 0) {
-        encoder->Control(VP9E_SET_SVC, 1);
-      }
-      vpx_svc_layer_id_t layer_id;
-      layer_id.spatial_layer_id = 0;
-      frame_flags_ = SetFrameFlags(video->frame(), cfg_.ts_number_layers);
-      layer_id.temporal_layer_id =
-          SetLayerId(video->frame(), cfg_.ts_number_layers);
-      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
-    }
-    const vpx_rational_t tb = video->timebase();
-    timebase_ = static_cast<double>(tb.num) / tb.den;
-    duration_ = 0;
-  }
-
-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
-    // Time since last timestamp = duration.
-    vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_;
-
-    if (duration > 1) {
-      // If first drop not set and we have a drop set it to this time.
-      if (!first_drop_) first_drop_ = last_pts_ + 1;
-      // Update the number of frame drops.
-      num_drops_ += static_cast<int>(duration - 1);
-      // Update counter for total number of frames (#frames input to encoder).
-      // Needed for setting the proper layer_id below.
-      tot_frame_number_ += static_cast<int>(duration - 1);
-    }
-
-    int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers);
-
-    // Add to the buffer the bits we'd expect from a constant bitrate server.
-    bits_in_buffer_model_ += static_cast<int64_t>(
-        duration * timebase_ * cfg_.rc_target_bitrate * 1000);
-
-    // Buffer should not go negative.
-    ASSERT_GE(bits_in_buffer_model_, 0)
-        << "Buffer Underrun at frame " << pkt->data.frame.pts;
-
-    const size_t frame_size_in_bits = pkt->data.frame.sz * 8;
-
-    // Update the total encoded bits. For temporal layers, update the cumulative
-    // encoded bits per layer.
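The buffer model in this hook is a plain leaky bucket: each elapsed frame interval deposits one interval's worth of the target bitrate, each encoded frame withdraws its actual size in bits, and the balance must never go negative. A self-contained sketch of that arithmetic, with illustrative names rather than the harness's own:

#include <cstdint>

// Leaky-bucket CBR model mirroring FramePktHook: deposits are
// duration * timebase * bitrate(kbps) * 1000 bits; withdrawals are
// encoded frame sizes. A negative balance is a buffer underrun.
struct CbrBufferModel {
  int64_t bits;      // current bucket level
  double timebase;   // seconds per pts tick, e.g. 1.0 / 30
  int target_kbps;

  // Returns false on underrun. pts_duration > 1 also signals that
  // pts_duration - 1 input frames were dropped by the encoder.
  bool OnFrame(int64_t pts_duration, size_t frame_bytes) {
    bits += static_cast<int64_t>(pts_duration * timebase * target_kbps * 1000);
    bits -= static_cast<int64_t>(frame_bytes * 8);
    return bits >= 0;
  }
};

// Usage: CbrBufferModel m{0, 1.0 / 30, 200}; m.OnFrame(1, 830);

A pts gap larger than one tick is exactly how the hook above infers drops: duration - 1 frames went into the encoder without producing packets.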
- for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { - bits_total_[i] += frame_size_in_bits; - } - - // Update the most recent pts. - last_pts_ = pkt->data.frame.pts; - ++frame_number_; - ++tot_frame_number_; - } - - virtual void EndPassHook(void) { - for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); - ++layer) { - duration_ = (last_pts_ + 1) * timebase_; - if (bits_total_[layer]) { - // Effective file datarate: - effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_; - } - } - } - - vpx_codec_pts_t last_pts_; - double timebase_; - int frame_number_; // Counter for number of non-dropped/encoded frames. - int tot_frame_number_; // Counter for total number of input frames. - int64_t bits_total_[3]; - double duration_; - double effective_datarate_[3]; - int set_cpu_used_; - int64_t bits_in_buffer_model_; - vpx_codec_pts_t first_drop_; - int num_drops_; - int denoiser_on_; - int denoiser_offon_test_; - int denoiser_offon_period_; - int frame_parallel_decoding_mode_; -}; - -// Check basic rate targeting for VBR mode with 0 lag. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for VBR mode with non-zero lag. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - // For non-zero lag, rate control will work (be within bounds) for - // real-time mode. - if (deadline_ == VPX_DL_REALTIME) { - cfg_.g_lag_in_frames = 15; - } else { - cfg_.g_lag_in_frames = 0; - } - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 300); - for (int i = 400; i <= 800; i += 400) { - cfg_.rc_target_bitrate = i; - ResetModel(); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) - << " The datarate for the file is greater than target by too much!"; - } -} - -// Check basic rate targeting for VBR mode with non-zero lag, with -// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs -// since error_resilience is off. -TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.g_error_resilient = 0; - cfg_.rc_end_usage = VPX_VBR; - // For non-zero lag, rate control will work (be within bounds) for - // real-time mode. 
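EndPassHook above reduces everything to kilobits per second: the duration is (last_pts_ + 1) ticks times the timebase, and the per-layer rate is cumulative bits / 1000 / duration. Worked numbers, as a small sketch (the helper name is hypothetical):

#include <cstdint>

// Effective datarate as computed in EndPassHook: kbps = (bits / 1000) / secs.
// e.g. 140 frames at 30 fps -> duration = 140 * (1/30) ~= 4.67 s;
// 1,400,000 encoded bits -> 1400 / 4.67 ~= 300 kbps.
static double EffectiveKbps(int64_t total_bits, int64_t last_pts,
                            double timebase) {
  const double duration = (last_pts + 1) * timebase;
  return (total_bits / 1000.0) / duration;
}

The deadline check that follows picks the lag accordingly, since VBR with lookahead is the configuration whose rate control is expected to stay within bounds under a real-time deadline.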
-  if (deadline_ == VPX_DL_REALTIME) {
-    cfg_.g_lag_in_frames = 15;
-  } else {
-    cfg_.g_lag_in_frames = 0;
-  }
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 300);
-  for (int i = 400; i <= 800; i += 400) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    frame_parallel_decoding_mode_ = 0;
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30)
-        << " The datarate for the file is greater than target by too much!";
-  }
-}
-
-// Check basic rate targeting for CBR mode.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int i = 150; i < 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-        << " The datarate for the file is greater than target by too much!";
-  }
-}
-
-// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode
-// off (and error_resilience off).
-TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.g_error_resilient = 0;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  for (int i = 150; i < 800; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    frame_parallel_decoding_mode_ = 0;
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-        << " The datarate for the file is lower than target by too much!";
-    ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-        << " The datarate for the file is greater than target by too much!";
-  }
-}
-
-// Check basic rate targeting for CBR mode, with 2 threads and dropped frames.
-TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 30;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  // Encode using multiple threads.
-  cfg_.g_threads = 2;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-  cfg_.rc_target_bitrate = 200;
-  ResetModel();
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-      << " The datarate for the file is lower than target by too much!";
-  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
-      << " The datarate for the file is greater than target by too much!";
-}
-
-// Check basic rate targeting for CBR.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting444) {
-  ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
-
-  cfg_.g_profile = 1;
-  cfg_.g_timebase = video.timebase();
-
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-
-  for (int i = 250; i < 900; i += 200) {
-    cfg_.rc_target_bitrate = i;
-    ResetModel();
-    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
-              effective_datarate_[0] * 0.80)
-        << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
-              effective_datarate_[0] * 1.15)
-        << " The datarate for the file missed the target!"
-        << cfg_.rc_target_bitrate << " " << effective_datarate_;
-  }
-}
-
-// Check that (1) the first dropped frame gets earlier and earlier
-// as the drop frame threshold is increased, and (2) that the total number of
-// frame drops does not decrease as we increase frame drop threshold.
-// Use a lower qp-max to force some frame drops.
-TEST_P(DatarateTestVP9Large, ChangingDropFrameThresh) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_undershoot_pct = 20;
-  cfg_.rc_dropframe_thresh = 10;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 50;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.rc_target_bitrate = 200;
-  cfg_.g_lag_in_frames = 0;
-  // TODO(marpan): Investigate datarate target failures with a smaller keyframe
-  // interval (128).
-  cfg_.kf_max_dist = 9999;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                       30, 1, 0, 140);
-
-  const int kDropFrameThreshTestStep = 30;
-  for (int j = 50; j <= 150; j += 100) {
-    cfg_.rc_target_bitrate = j;
-    vpx_codec_pts_t last_drop = 140;
-    int last_num_drops = 0;
-    for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
-      cfg_.rc_dropframe_thresh = i;
-      ResetModel();
-      ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-      ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
-          << " The datarate for the file is lower than target by too much!";
-      ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
-          << " The datarate for the file is greater than target by too much!";
-      ASSERT_LE(first_drop_, last_drop)
-          << " The first dropped frame for drop_thresh " << i
-          << " > first dropped frame for drop_thresh "
-          << i - kDropFrameThreshTestStep;
-      ASSERT_GE(num_drops_, last_num_drops * 0.85)
-          << " The number of dropped frames for drop_thresh " << i
-          << " < number of dropped frames for drop_thresh "
-          << i - kDropFrameThreshTestStep;
-      last_drop = first_drop_;
-      last_num_drops = num_drops_;
-    }
-  }
-}
-
-// Check basic rate targeting for 2 temporal layers.
-TEST_P(DatarateTestVP9Large, BasicRateTargeting2TemporalLayers) {
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_dropframe_thresh = 1;
-  cfg_.rc_min_quantizer = 0;
-  cfg_.rc_max_quantizer = 63;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-
-  // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1).
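The decimator array reads per layer: a layer whose decimator is d receives every d-th source frame, so the framerate delivered by that layer and the ones below it is the source rate divided by d. A small sketch of that mapping, assuming the 30 fps sources used throughout these tests:

// Cumulative framerates implied by ts_rate_decimator, as configured below:
//   (2, 1)    at 30 fps -> TL0 = 15 fps, TL0+TL1 = 30 fps
//   (4, 2, 1) at 30 fps -> TL0 = 7.5 fps, +TL1 = 15 fps, +TL2 = 30 fps
static double LayerFps(double source_fps, int decimator) {
  return source_fps / decimator;
}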
- cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 2; - cfg_.ts_rate_decimator[0] = 2; - cfg_.ts_rate_decimator[1] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - if (deadline_ == VPX_DL_REALTIME) cfg_.g_error_resilient = 1; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - // 60-40 bitrate allocation for 2 temporal layers. - cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - } - } -} - -// Check basic rate targeting for 3 temporal layers. -TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayers) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). - cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - // 40-20-40 bitrate allocation for 3 temporal layers. - cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - // TODO(yaowu): Work out more stable rc control strategy and - // Adjust the thresholds to be tighter than .75. - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - // TODO(yaowu): Work out more stable rc control strategy and - // Adjust the thresholds to be tighter than 1.25. - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - } - } -} - -// Check basic rate targeting for 3 temporal layers, with frame dropping. -// Only for one (low) bitrate with lower max_quantizer, and somewhat higher -// frame drop threshold, to force frame dropping. -TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - // Set frame drop threshold and rc_max_quantizer to force some frame drops. - cfg_.rc_dropframe_thresh = 20; - cfg_.rc_max_quantizer = 45; - cfg_.rc_min_quantizer = 0; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). 
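Note that layer_target_bitrate[] is cumulative, which is why the "60-40" and "40-20-40" comments above do not match the literal assignments: entry j holds the total rate for temporal layers 0 through j. A sketch of the 3-layer split used above (helper name hypothetical):

// Cumulative layer targets for the 40-20-40 split, e.g. 400 kbps total:
//   cumulative[0] = 160 (TL0 alone)
//   cumulative[1] = 240 (TL0+TL1, so TL1 itself adds 80)
//   cumulative[2] = 400 (TL0+TL1+TL2, so TL2 adds 160)
static void Split402040(int total_kbps, int cumulative[3]) {
  cumulative[0] = 40 * total_kbps / 100;
  cumulative[1] = 60 * total_kbps / 100;
  cumulative[2] = total_kbps;
}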
- cfg_.ss_number_layers = 1; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - - cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); - cfg_.rc_target_bitrate = 200; - ResetModel(); - // 40-20-40 bitrate allocation for 3 temporal layers. - cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; - cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { - ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) - << " The datarate for the file is lower than target by too much, " - "for layer: " - << j; - ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) - << " The datarate for the file is greater than target by too much, " - "for layer: " - << j; - // Expect some frame drops in this test: for this 200 frames test, - // expect at least 10% and not more than 60% drops. - ASSERT_GE(num_drops_, 20); - ASSERT_LE(num_drops_, 130); - } -} - -#if CONFIG_VP9_TEMPORAL_DENOISING -class DatarateTestVP9LargeDenoiser : public DatarateTestVP9Large { - public: - virtual ~DatarateTestVP9LargeDenoiser() {} -}; - -// Check basic datarate targeting, for a single bitrate, when denoiser is on. -TEST_P(DatarateTestVP9LargeDenoiser, LowNoise) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 140); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 300; - ResetModel(); - // Turn on the denoiser. - denoiser_on_ = 1; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is on, -// for clip with high noise level. Use 2 threads. -TEST_P(DatarateTestVP9LargeDenoiser, HighNoise) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.g_threads = 2; - - ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 1000; - ResetModel(); - // Turn on the denoiser. 
- denoiser_on_ = 1; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is on, -// for 1280x720 clip with 4 threads. -TEST_P(DatarateTestVP9LargeDenoiser, 4threads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.g_threads = 4; - - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 1000; - ResetModel(); - // Turn on the denoiser. - denoiser_on_ = 1; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) - << " The datarate for the file is greater than target by too much!"; -} - -// Check basic datarate targeting, for a single bitrate, when denoiser is off -// and on. -TEST_P(DatarateTestVP9LargeDenoiser, DenoiserOffOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_dropframe_thresh = 1; - cfg_.rc_min_quantizer = 2; - cfg_.rc_max_quantizer = 56; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 299); - - // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), - // there is only one denoiser mode: denoiserYonly(which is 1), - // but may add more modes in the future. - cfg_.rc_target_bitrate = 300; - ResetModel(); - // The denoiser is off by default. - denoiser_on_ = 0; - // Set the offon test flag. 
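The off/on schedule driven by these two fields lives in PreEncodeFrameHook: denoiser_on_ is XOR-flipped whenever (frame + 1) is a multiple of the period, so with a period of 100 on this 299-frame clip the denoiser is off for frames 0-98, on for 99-198, and off again for 199-298. A compact sketch of that schedule (hypothetical helper):

// State in effect while encoding frame |frame| (0-based), given the
// PreEncodeFrameHook rule: flip when (frame + 1) % period == 0.
static int DenoiserStateAt(int frame, int period, int initial_state) {
  const int flips = (frame + 1) / period;  // flips applied so far
  return (flips % 2) ? (initial_state ^ 1) : initial_state;
}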
- denoiser_offon_test_ = 1; - denoiser_offon_period_ = 100; - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) - << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) - << " The datarate for the file is greater than target by too much!"; -} -#endif // CONFIG_VP9_TEMPORAL_DENOISING - -class DatarateOnePassCbrSvc - : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWith2Params { - public: - DatarateOnePassCbrSvc() : EncoderTest(GET_PARAM(0)) { - memset(&svc_params_, 0, sizeof(svc_params_)); - } - virtual ~DatarateOnePassCbrSvc() {} - - protected: - virtual void SetUp() { - InitializeConfig(); - SetMode(GET_PARAM(1)); - speed_setting_ = GET_PARAM(2); - ResetModel(); - } - virtual void ResetModel() { - last_pts_ = 0; - duration_ = 0.0; - mismatch_psnr_ = 0.0; - mismatch_nframes_ = 0; - denoiser_on_ = 0; - tune_content_ = 0; - base_speed_setting_ = 5; - spatial_layer_id_ = 0; - temporal_layer_id_ = 0; - memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); - memset(bits_total_, 0, sizeof(bits_total_)); - memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); - } - virtual void BeginPassHook(unsigned int /*pass*/) {} - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { - if (video->frame() == 0) { - int i; - for (i = 0; i < VPX_MAX_LAYERS; ++i) { - svc_params_.max_quantizers[i] = 63; - svc_params_.min_quantizers[i] = 0; - } - svc_params_.speed_per_layer[0] = base_speed_setting_; - for (i = 1; i < VPX_SS_MAX_LAYERS; ++i) { - svc_params_.speed_per_layer[i] = speed_setting_; - } - - encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); - encoder->Control(VP9E_SET_SVC, 1); - encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); - encoder->Control(VP8E_SET_CPUUSED, speed_setting_); - encoder->Control(VP9E_SET_TILE_COLUMNS, 0); - encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); - encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); - encoder->Control(VP9E_SET_ROW_MT, 1); - encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); - encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); - } - const vpx_rational_t tb = video->timebase(); - timebase_ = static_cast(tb.num) / tb.den; - duration_ = 0; - } - - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { - vpx_svc_layer_id_t layer_id; - encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); - spatial_layer_id_ = layer_id.spatial_layer_id; - temporal_layer_id_ = layer_id.temporal_layer_id; - // Update buffer with per-layer target frame bandwidth, this is done - // for every frame passed to the encoder (encoded or dropped). - // For temporal layers, update the cumulative buffer level. 
- for (int sl = 0; sl < number_spatial_layers_; ++sl) { - for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { - const int layer = sl * number_temporal_layers_ + tl; - bits_in_buffer_model_[layer] += - static_cast(layer_target_avg_bandwidth_[layer]); - } - } - } - - vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { - uint8_t marker; - marker = *(data + data_sz - 1); - *count = 0; - if ((marker & 0xe0) == 0xc0) { - const uint32_t frames = (marker & 0x7) + 1; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const size_t index_sz = 2 + mag * frames; - // This chunk is marked as having a superframe index but doesn't have - // enough data for it, thus it's an invalid superframe index. - if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; - { - const uint8_t marker2 = *(data + data_sz - index_sz); - // This chunk is marked as having a superframe index but doesn't have - // the matching marker byte at the front of the index therefore it's an - // invalid chunk. - if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; - } - { - uint32_t i, j; - const uint8_t *x = &data[data_sz - index_sz + 1]; - for (i = 0; i < frames; ++i) { - uint32_t this_sz = 0; - - for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); - sizes[i] = this_sz; - } - *count = frames; - } - } - return VPX_CODEC_OK; - } - - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - uint32_t sizes[8] = { 0 }; - int count = 0; - last_pts_ = pkt->data.frame.pts; - const bool key_frame = - (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - parse_superframe_index(static_cast(pkt->data.frame.buf), - pkt->data.frame.sz, sizes, &count); - ASSERT_EQ(count, number_spatial_layers_); - for (int sl = 0; sl < number_spatial_layers_; ++sl) { - sizes[sl] = sizes[sl] << 3; - // Update the total encoded bits per layer. - // For temporal layers, update the cumulative encoded bits per layer. - for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { - const int layer = sl * number_temporal_layers_ + tl; - bits_total_[layer] += static_cast(sizes[sl]); - // Update the per-layer buffer level with the encoded frame size. - bits_in_buffer_model_[layer] -= static_cast(sizes[sl]); - // There should be no buffer underrun, except on the base - // temporal layer, since there may be key frames there. 
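Stepping back to parse_superframe_index above: it walks the standard VP9 superframe layout, a trailing marker byte 110xxyyy (yyy = frame count minus one, xx = bytes per size minus one), preceded by little-endian frame sizes, with the same marker duplicated at the front of the index. A minimal writer for that layout, as a sketch of the inverse operation with little error checking:

#include <cstdint>
#include <vector>

// Appends a VP9 superframe index for |sizes| to |buf|, the inverse of
// parse_superframe_index above: [marker][mag-byte LE sizes...][marker],
// giving index_sz = 2 + mag * frames.
static void AppendSuperframeIndex(std::vector<uint8_t> *buf,
                                  const std::vector<uint32_t> &sizes,
                                  uint32_t mag /* 1..4 bytes per size */) {
  const uint8_t marker = static_cast<uint8_t>(
      0xc0 | (sizes.size() - 1) | ((mag - 1) << 3));
  buf->push_back(marker);  // marker2, verified at the front of the index
  for (uint32_t sz : sizes) {
    for (uint32_t j = 0; j < mag; ++j) {
      buf->push_back(static_cast<uint8_t>(sz >> (j * 8)));  // little-endian
    }
  }
  buf->push_back(marker);  // trailing marker the parser reads first
}

The underrun assertion that follows exempts the base temporal layer because key frames legitimately overdraw it.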
- if (!key_frame && tl > 0) { - ASSERT_GE(bits_in_buffer_model_[layer], 0) - << "Buffer Underrun at frame " << pkt->data.frame.pts; - } - } - } - } - - virtual void EndPassHook(void) { - for (int sl = 0; sl < number_spatial_layers_; ++sl) { - for (int tl = 0; tl < number_temporal_layers_; ++tl) { - const int layer = sl * number_temporal_layers_ + tl; - const double file_size_in_kb = bits_total_[layer] / 1000.; - duration_ = (last_pts_ + 1) * timebase_; - file_datarate_[layer] = file_size_in_kb / duration_; - } - } - } - - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { - double mismatch_psnr = compute_psnr(img1, img2); - mismatch_psnr_ += mismatch_psnr; - ++mismatch_nframes_; - } - - unsigned int GetMismatchFrames() { return mismatch_nframes_; } - - vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; - double timebase_; - int64_t bits_total_[VPX_MAX_LAYERS]; - double duration_; - double file_datarate_[VPX_MAX_LAYERS]; - size_t bits_in_last_frame_; - vpx_svc_extra_cfg_t svc_params_; - int speed_setting_; - double mismatch_psnr_; - int mismatch_nframes_; - int denoiser_on_; - int tune_content_; - int base_speed_setting_; - int spatial_layer_id_; - int temporal_layer_id_; - int number_spatial_layers_; - int number_temporal_layers_; - int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; -}; -static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, - const vpx_svc_extra_cfg_t *svc_params, - int spatial_layers, int temporal_layers, - int temporal_layering_mode, - int *layer_target_avg_bandwidth, - int64_t *bits_in_buffer_model) { - int sl, spatial_layer_target; - float total = 0; - float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; - float framerate = 30.0; - for (sl = 0; sl < spatial_layers; ++sl) { - if (svc_params->scaling_factor_den[sl] > 0) { - alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / - svc_params->scaling_factor_den[sl]); - total += alloc_ratio[sl]; - } - } - for (sl = 0; sl < spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl] = spatial_layer_target = - (unsigned int)(enc_cfg->rc_target_bitrate * alloc_ratio[sl] / total); - const int index = sl * temporal_layers; - if (temporal_layering_mode == 3) { - enc_cfg->layer_target_bitrate[index] = spatial_layer_target >> 1; - enc_cfg->layer_target_bitrate[index + 1] = - (spatial_layer_target >> 1) + (spatial_layer_target >> 2); - enc_cfg->layer_target_bitrate[index + 2] = spatial_layer_target; - } else if (temporal_layering_mode == 2) { - enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; - enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; - } else if (temporal_layering_mode <= 1) { - enc_cfg->layer_target_bitrate[index] = spatial_layer_target; - } - } - for (sl = 0; sl < spatial_layers; ++sl) { - for (int tl = 0; tl < temporal_layers; ++tl) { - const int layer = sl * temporal_layers + tl; - float layer_framerate = framerate; - if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; - if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; - if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; - layer_target_avg_bandwidth[layer] = static_cast( - enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate); - bits_in_buffer_model[layer] = - enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz; - } - } -} - -static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg, - int number_spatial_layers, - int number_temporal_layers, - double *file_datarate, - double 
thresh_overshoot, - double thresh_undershoot) { - for (int sl = 0; sl < number_spatial_layers; ++sl) - for (int tl = 0; tl < number_temporal_layers; ++tl) { - const int layer = sl * number_temporal_layers + tl; - ASSERT_GE(cfg->layer_target_bitrate[layer], - file_datarate[layer] * thresh_overshoot) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg->layer_target_bitrate[layer], - file_datarate[layer] * thresh_undershoot) - << " The datarate for the file is lower than the target by too much!"; - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 -// temporal layer, with screen content mode on and same speed setting for all -// layers. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 1; - cfg_.ts_rate_decimator[0] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 0; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.kf_max_dist = 9999; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); - cfg_.rc_target_bitrate = 500; - ResetModel(); - tune_content_ = 1; - base_speed_setting_ = speed_setting_; - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, 1.15); - EXPECT_EQ(static_cast(0), GetMismatchFrames()); -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and -// 3 temporal layers. Run CIF clip with 1 thread. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 0; - cfg_.kf_max_dist = 9999; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, - 0, 400); - // TODO(marpan): Check that effective_datarate for each layer hits the - // layer target_bitrate. 
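Since the helper writes each comparison with the target on the left, the thresholds invert: thresh_overshoot = 0.78 tolerates a measured rate up to target / 0.78 (roughly 28% over), and thresh_undershoot = 1.15 tolerates down to target / 1.15 (roughly 13% under). In sketch form:

// CheckLayerRateTargeting(cfg, ..., 0.78, 1.15) accepts, per layer:
//   target / 1.15 <= file_datarate <= target / 0.78
static bool WithinSvcBand(double target_kbps, double file_kbps) {
  return target_kbps >= file_kbps * 0.78 && target_kbps <= file_kbps * 1.15;
}

The loop below then sweeps the combined target from 200 to 800 kbps under that band.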
- for (int i = 200; i <= 800; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, - 1.15); -#if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expect 200 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast(200), GetMismatchFrames()); -#endif - } -} - -// Check basic rate targeting for 1 pass CBR SVC with denoising. -// 2 spatial layers and 3 temporal layer. Run HD clip with 2 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 2; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 0; - cfg_.kf_max_dist = 9999; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, - 0, 400); - // TODO(marpan): Check that effective_datarate for each layer hits the - // layer target_bitrate. - // For SVC, noise_sen = 1 means denoising only the top spatial layer - // noise_sen = 2 means denoising the two top spatial layers. - for (int noise_sen = 1; noise_sen <= 2; noise_sen++) { - for (int i = 600; i <= 1000; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - denoiser_on_ = noise_sen; - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, - 1.15); -#if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC - // pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expect 200 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast(200), GetMismatchFrames()); -#endif - } - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 -// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. 
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.rc_target_bitrate = 400; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, - 0, 400); - // For this 3 temporal layer case, pattern repeats every 4 frames, so choose - // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). - for (int j = 64; j <= 67; j++) { - cfg_.kf_max_dist = j; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, - 1.15); - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and -// 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 4; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 144; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 288; - svc_params_.scaling_factor_den[1] = 288; - cfg_.rc_dropframe_thresh = 0; - cfg_.kf_max_dist = 9999; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); - cfg_.rc_target_bitrate = 800; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, 1.15); -#if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expect 30 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast(30), GetMismatchFrames()); -#endif -} - -// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and -// 3 temporal layers. Run CIF clip with 1 thread. 
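For the scaling factors this test sets (72/288, 144/288, 288/288), assign_layer_bitrates splits the total in the ratio 0.25 : 0.5 : 1.0, and temporal_layering_mode 3 then carves each spatial share into cumulative 50%/75%/100% temporal targets via shifts. Worked numbers for the 800 kbps case, as a sketch:

// assign_layer_bitrates at 800 kbps, scaling factors {72,144,288}/288:
//   alloc ratios 0.25 : 0.5 : 1.0 (sum 1.75)
//   spatial targets ~ 114, 228, 457 kbps (integer truncation)
// temporal_layering_mode == 3 then splits each cumulatively:
//   457 -> {228, 342, 457}; 228 -> {114, 171, 228}; 114 -> {57, 85, 114}
static void SplitMode3(int spatial_kbps, int cumulative[3]) {
  cumulative[0] = spatial_kbps >> 1;
  cumulative[1] = (spatial_kbps >> 1) + (spatial_kbps >> 2);
  cumulative[2] = spatial_kbps;
}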
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 0; - cfg_.kf_max_dist = 9999; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, - 0, 400); - cfg_.rc_target_bitrate = 800; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, 1.15); -#if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expect 200 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast(200), GetMismatchFrames()); -#endif -} - -// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 -// temporal layers. Run CIF clip with 1 thread, and few short key frame periods. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 1; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 10; - cfg_.rc_target_bitrate = 800; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, - 0, 400); - // For this 3 temporal layer case, pattern repeats every 4 frames, so choose - // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
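That "0-2-1-2" falls straight out of the layer-id cadence: a key frame forced every j frames lands on frame j, whose temporal layer in the 4-frame pattern is 0 when j % 4 == 0, 1 when (j - 2) % 4 == 0, and 2 for odd j. Checking the four periods swept below:

// Temporal layer hit by a key frame at frame number n (3-layer cadence),
// matching SetLayerId earlier in this file:
static int KfTemporalLayer(int n) {
  if (n % 4 == 0) return 0;
  if ((n - 2) % 4 == 0) return 1;
  return 2;  // odd frames
}
// kf_max_dist 32, 33, 34, 35 -> layers 0, 2, 1, 2: the "0-2-1-2" above.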
- for (int j = 32; j <= 35; j++) { - cfg_.kf_max_dist = j; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, - 1.15); - } -} - -// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and -// 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 3; - cfg_.ts_number_layers = 3; - cfg_.ts_rate_decimator[0] = 4; - cfg_.ts_rate_decimator[1] = 2; - cfg_.ts_rate_decimator[2] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 4; - cfg_.temporal_layering_mode = 3; - svc_params_.scaling_factor_num[0] = 72; - svc_params_.scaling_factor_den[0] = 288; - svc_params_.scaling_factor_num[1] = 144; - svc_params_.scaling_factor_den[1] = 288; - svc_params_.scaling_factor_num[2] = 288; - svc_params_.scaling_factor_den[2] = 288; - cfg_.rc_dropframe_thresh = 0; - cfg_.kf_max_dist = 9999; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); - cfg_.rc_target_bitrate = 800; - ResetModel(); - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode, - layer_target_avg_bandwidth_, bits_in_buffer_model_); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, 1.15); -#if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expect 30 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast(30), GetMismatchFrames()); -#endif -} - -// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial -// downscale 5x5. 
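"5x5" here is per dimension: the test below sets scaling_factor 256/1280 = 1/5 for the base layer, so a 1280x720 input yields a 256x144 base layer under the usual layer_dim = src_dim * num / den scaling (ignoring libvpx's exact rounding). As a sketch:

// Spatial layer dimensions from SVC scaling factors, for the test below:
//   1280 * 256 / 1280 = 256,  720 * 256 / 1280 = 144   -> base layer 256x144
//   1280 * 1280 / 1280 = 1280, 720 * 1280 / 1280 = 720 -> top layer 1280x720
static int ScaledDim(int src_dim, int num, int den) {
  return src_dim * num / den;
}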
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { - cfg_.rc_buf_initial_sz = 500; - cfg_.rc_buf_optimal_sz = 500; - cfg_.rc_buf_sz = 1000; - cfg_.rc_min_quantizer = 0; - cfg_.rc_max_quantizer = 63; - cfg_.rc_end_usage = VPX_CBR; - cfg_.g_lag_in_frames = 0; - cfg_.ss_number_layers = 2; - cfg_.ts_number_layers = 1; - cfg_.ts_rate_decimator[0] = 1; - cfg_.g_error_resilient = 1; - cfg_.g_threads = 3; - cfg_.temporal_layering_mode = 0; - svc_params_.scaling_factor_num[0] = 256; - svc_params_.scaling_factor_den[0] = 1280; - svc_params_.scaling_factor_num[1] = 1280; - svc_params_.scaling_factor_den[1] = 1280; - cfg_.rc_dropframe_thresh = 10; - cfg_.kf_max_dist = 999999; - cfg_.kf_min_dist = 0; - cfg_.ss_target_bitrate[0] = 300; - cfg_.ss_target_bitrate[1] = 1400; - cfg_.layer_target_bitrate[0] = 300; - cfg_.layer_target_bitrate[1] = 1400; - cfg_.rc_target_bitrate = 1700; - number_spatial_layers_ = cfg_.ss_number_layers; - number_temporal_layers_ = cfg_.ts_number_layers; - ResetModel(); - layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; - bits_in_buffer_model_[0] = - cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; - layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; - bits_in_buffer_model_[1] = - cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - CheckLayerRateTargeting(&cfg_, number_spatial_layers_, - number_temporal_layers_, file_datarate_, 0.78, 1.15); - EXPECT_EQ(static_cast(0), GetMismatchFrames()); -} - -VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES, - ::testing::Values(0)); -VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Values(-6, -12)); -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9Large, - ::testing::Values(::libvpx_test::kOnePassGood, - ::libvpx_test::kRealTime), - ::testing::Range(2, 9)); -#if CONFIG_VP9_TEMPORAL_DENOISING -VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeDenoiser, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 9)); -#endif -VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvc, - ::testing::Values(::libvpx_test::kRealTime), - ::testing::Range(5, 9)); -} // namespace diff --git a/libs/libvpx/test/dct16x16_test.cc b/libs/libvpx/test/dct16x16_test.cc index ce0bd37b3df..9ccf2b84f11 100644 --- a/libs/libvpx/test/dct16x16_test.cc +++ b/libs/libvpx/test/dct16x16_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -229,10 +230,9 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type); -typedef std::tr1::tuple Dct16x16Param; -typedef std::tr1::tuple Ht16x16Param; -typedef std::tr1::tuple - Idct16x16Param; +typedef std::tuple Dct16x16Param; +typedef std::tuple Ht16x16Param; +typedef std::tuple Idct16x16Param; void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride, int /*tx_type*/) { @@ -744,7 +744,7 @@ TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( diff --git a/libs/libvpx/test/dct32x32_test.cc b/libs/libvpx/test/dct32x32_test.cc index a95ff973287..94d6b37fa91 100644 --- a/libs/libvpx/test/dct32x32_test.cc +++ b/libs/libvpx/test/dct32x32_test.cc @@ -11,6 +11,7 @@ #include 
#include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -18,6 +19,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -66,7 +68,7 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs], typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride); typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef std::tr1::tuple +typedef std::tuple Trans32x32Param; #if CONFIG_VP9_HIGHBITDEPTH @@ -79,7 +81,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) { } #endif // CONFIG_VP9_HIGHBITDEPTH -class Trans32x32Test : public ::testing::TestWithParam { +class Trans32x32Test : public AbstractBench, + public ::testing::TestWithParam { public: virtual ~Trans32x32Test() {} virtual void SetUp() { @@ -99,8 +102,14 @@ class Trans32x32Test : public ::testing::TestWithParam { int mask_; FwdTxfmFunc fwd_txfm_; InvTxfmFunc inv_txfm_; + + int16_t *bench_in_; + tran_low_t *bench_out_; + virtual void Run(); }; +void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); } + TEST_P(Trans32x32Test, AccuracyCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); uint32_t max_error = 0; @@ -237,6 +246,19 @@ TEST_P(Trans32x32Test, MemCheck) { } } +TEST_P(Trans32x32Test, DISABLED_Speed) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); + DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); + + bench_in_ = input_extreme_block; + bench_out_ = output_block; + + RunNTimes(INT16_MAX); + PrintMedian("32x32"); +} + TEST_P(Trans32x32Test, InverseAccuracy) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; @@ -292,7 +314,7 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( @@ -371,7 +393,7 @@ INSTANTIATE_TEST_CASE_P( VSX, Trans32x32Test, ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, 0, VPX_BITS_8), - make_tuple(&vpx_fdct32x32_rd_c, + make_tuple(&vpx_fdct32x32_rd_vsx, &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/dct_partial_test.cc b/libs/libvpx/test/dct_partial_test.cc index 4d145f58915..c889e92d709 100644 --- a/libs/libvpx/test/dct_partial_test.cc +++ b/libs/libvpx/test/dct_partial_test.cc @@ -11,8 +11,8 @@ #include #include #include - #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -28,8 +28,8 @@ using libvpx_test::ACMRandom; using libvpx_test::Buffer; -using std::tr1::tuple; -using std::tr1::make_tuple; +using std::make_tuple; +using std::tuple; namespace { typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); @@ -39,10 +39,14 @@ typedef tuple tran_low_t partial_fdct_ref(const Buffer &in, int size) { int64_t sum = 0; - for (int y = 0; y < size; ++y) { - for (int x = 0; x < size; ++x) { - sum += in.TopLeftPixel()[y * in.stride() + x]; + if (in.TopLeftPixel() != NULL) { + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } } + } else { + assert(0); } switch (size) { @@ -77,21 +81,25 @@ class PartialFdctTest : public ::testing::TestWithParam { Buffer output_block = 
Buffer(size_, size_, 0, 16); ASSERT_TRUE(output_block.Init()); - for (int i = 0; i < 100; ++i) { - if (i == 0) { - input_block.Set(maxvalue); - } else if (i == 1) { - input_block.Set(minvalue); - } else { - input_block.Set(&rnd, minvalue, maxvalue); + if (output_block.TopLeftPixel() != NULL) { + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); } - - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), - output_block.TopLeftPixel(), - input_block.stride())); - - EXPECT_EQ(partial_fdct_ref(input_block, size_), - output_block.TopLeftPixel()[0]); + } else { + assert(0); } } diff --git a/libs/libvpx/test/dct_test.cc b/libs/libvpx/test/dct_test.cc index addbdfb463f..6053aee5420 100644 --- a/libs/libvpx/test/dct_test.cc +++ b/libs/libvpx/test/dct_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -28,8 +29,8 @@ using libvpx_test::ACMRandom; using libvpx_test::Buffer; -using std::tr1::tuple; -using std::tr1::make_tuple; +using std::make_tuple; +using std::tuple; namespace { typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); @@ -40,10 +41,60 @@ typedef void (*FhtFuncRef)(const Buffer &in, Buffer *out, int size, int tx_type); typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type); +typedef void (*IhtWithBdFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd); + +template +void fdct_wrapper(const int16_t *in, tran_low_t *out, int stride, int tx_type) { + (void)tx_type; + fn(in, out, stride); +} + +template +void idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)tx_type; + (void)bd; + fn(in, out, stride); +} + +template +void iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, int tx_type, + int bd) { + (void)bd; + fn(in, out, stride, tx_type); +} + +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*HighbdIdctFunc)(const tran_low_t *in, uint16_t *out, int stride, + int bd); + +typedef void (*HighbdIhtFunc)(const tran_low_t *in, uint16_t *out, int stride, + int tx_type, int bd); + +template +void highbd_idct_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + (void)tx_type; + fn(in, CAST_TO_SHORTPTR(out), stride, bd); +} + +template +void highbd_iht_wrapper(const tran_low_t *in, uint8_t *out, int stride, + int tx_type, int bd) { + fn(in, CAST_TO_SHORTPTR(out), stride, tx_type, bd); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +struct FuncInfo { + FhtFunc ft_func; + IhtWithBdFunc it_func; + int size; + int pixel_size; +}; /* forward transform, inverse transform, size, transform type, bit depth */ -typedef tuple DctParam; -typedef tuple HtParam; +typedef tuple DctParam; void fdct_ref(const Buffer &in, Buffer *out, int size, int /*tx_type*/) { @@ -81,128 +132,123 @@ void fwht_ref(const Buffer &in, Buffer *out, int size, vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); } -#if CONFIG_VP9_HIGHBITDEPTH -#define idctNxN(n, coeffs, bitdepth) \ - void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ - int stride) { \ - vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ - stride, 
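The fdct_wrapper/idct_wrapper/iht_wrapper templates introduced above all use the same trick: a non-type template parameter bakes a concrete kernel into a wrapper that exposes one wide, common signature and discards the arguments (tx_type, bd) that the kernel does not take, so a single test class can drive kernels with mismatched prototypes. A compilable sketch with stand-in types:

    #include <cstdio>

    // Narrow: what a plain DCT kernel looks like (no tx_type argument).
    typedef void (*NarrowFn)(const short *in, int *out, int stride);
    // Wide: the one signature the shared test class wants to call.
    typedef void (*WideFn)(const short *in, int *out, int stride, int tx_type);

    void fdct_stub(const short *in, int *out, int stride) {
      (void)in;
      (void)stride;
      out[0] = 4;  // stand-in for a real transform kernel
    }

    template <NarrowFn fn>
    void fdct_wrapper(const short *in, int *out, int stride, int tx_type) {
      (void)tx_type;  // DCT kernels ignore the transform type
      fn(in, out, stride);
    }

    int main() {
      const WideFn f = &fdct_wrapper<&fdct_stub>;
      short in[16] = { 0 };
      int out[16] = { 0 };
      f(in, out, 4, /*tx_type=*/0);
      std::printf("%d\n", out[0]);  // prints 4
      return 0;
    }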
bitdepth); \ - } +class TransTestBase : public ::testing::TestWithParam { + public: + virtual void SetUp() { + rnd_.Reset(ACMRandom::DeterministicSeed()); + const int idx = GET_PARAM(0); + const FuncInfo *func_info = &(GET_PARAM(1)[idx]); + tx_type_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + fwd_txfm_ = func_info->ft_func; + inv_txfm_ = func_info->it_func; + size_ = func_info->size; + pixel_size_ = func_info->pixel_size; + max_pixel_value_ = (1 << bit_depth_) - 1; -idctNxN(4, 16, 10); -idctNxN(4, 16, 12); -idctNxN(8, 64, 10); -idctNxN(8, 64, 12); -idctNxN(16, 256, 10); -idctNxN(16, 256, 12); -idctNxN(32, 1024, 10); -idctNxN(32, 1024, 12); - -#define ihtNxN(n, coeffs, bitdepth) \ - void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ - int stride, int tx_type) { \ - vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ - stride, tx_type, bitdepth); \ - } + // Randomize stride_ to a value less than or equal to 1024 + stride_ = rnd_(1024) + 1; + if (stride_ < size_) { + stride_ = size_; + } + // Align stride_ to 16 if it's bigger than 16. + if (stride_ > 16) { + stride_ &= ~15; + } -ihtNxN(4, 16, 10); -ihtNxN(4, 16, 12); -ihtNxN(8, 64, 10); -ihtNxN(8, 64, 12); -ihtNxN(16, 256, 10); -// ihtNxN(16, 256, 12); + block_size_ = size_ * stride_; -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); -} + src_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_TRUE(src_ != NULL); + dst_ = reinterpret_cast( + vpx_memalign(16, pixel_size_ * block_size_)); + ASSERT_TRUE(dst_ != NULL); + } -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); -} -#endif // CONFIG_VP9_HIGHBITDEPTH + virtual void TearDown() { + vpx_free(src_); + src_ = NULL; + vpx_free(dst_); + dst_ = NULL; + libvpx_test::ClearSystemState(); + } -class TransTestBase { - public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void InitMem() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; + if (pixel_size_ == 1) { + for (int j = 0; j < block_size_; ++j) { + src_[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst_[j] = rnd_.Rand16() & max_pixel_value_; + } + } else { + ASSERT_EQ(pixel_size_, 2); + uint16_t *const src = reinterpret_cast(src_); + uint16_t *const dst = reinterpret_cast(dst_); + for (int j = 0; j < block_size_; ++j) { + src[j] = rnd_.Rand16() & max_pixel_value_; + } + for (int j = 0; j < block_size_; ++j) { + dst[j] = rnd_.Rand16() & max_pixel_value_; + } + } + } - protected: - virtual void RunFwdTxfm(const Buffer &in, - Buffer *out) = 0; + void RunFwdTxfm(const Buffer &in, Buffer *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } - virtual void RunInvTxfm(const Buffer &in, uint8_t *out) = 0; + void RunInvTxfm(const Buffer &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, stride_, tx_type_, bit_depth_); + } + protected: void RunAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); Buffer test_input_block = Buffer(size_, size_, 8, size_ == 4 ? 
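The stride randomization in the new TransTestBase::SetUp() deserves a note: it draws a stride in [1, 1024], clamps it up to the block size so each row holds a full block row, and rounds any stride larger than 16 down to a multiple of 16, since the SIMD kernels under test generally assume 16-sample row alignment. The same computation in isolation (rand() stands in for the test's ACMRandom):

    #include <cstdio>
    #include <cstdlib>

    int pick_stride(int size) {
      int stride = std::rand() % 1024 + 1;  // rnd_(1024) + 1 in the test
      if (stride < size) stride = size;     // rows must fit the block
      if (stride > 16) stride &= ~15;       // round down to a multiple of 16
      return stride;
    }

    int main() {
      std::srand(0);
      for (int i = 0; i < 4; ++i) std::printf("%d\n", pick_stride(32));
      return 0;
    }

Because size_ is a power of two no larger than 32, the final mask cannot push the stride back below the block size.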
0 : 16); ASSERT_TRUE(test_input_block.Init()); + ASSERT_TRUE(test_input_block.TopLeftPixel() != NULL); Buffer test_temp_block = Buffer(size_, size_, 0, 16); ASSERT_TRUE(test_temp_block.Init()); - Buffer dst = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(dst.Init()); - Buffer src = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(src.Init()); -#if CONFIG_VP9_HIGHBITDEPTH - Buffer dst16 = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(dst16.Init()); - Buffer src16 = Buffer(size_, size_, 0, 16); - ASSERT_TRUE(src16.Init()); -#endif // CONFIG_VP9_HIGHBITDEPTH uint32_t max_error = 0; int64_t total_error = 0; const int count_test_block = 10000; for (int i = 0; i < count_test_block; ++i) { - if (bit_depth_ == 8) { - src.Set(&rnd, &ACMRandom::Rand8); - dst.Set(&rnd, &ACMRandom::Rand8); - // Initialize a test block with input range [-255, 255]. - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { + InitMem(); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = - src.TopLeftPixel()[h * src.stride() + w] - - dst.TopLeftPixel()[h * dst.stride() + w]; - } - } -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16.Set(&rnd, 0, max_pixel_value_); - dst16.Set(&rnd, 0, max_pixel_value_); - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = - src16.TopLeftPixel()[h * src16.stride() + w] - - dst16.TopLeftPixel()[h * dst16.stride() + w]; + src[h * stride_ + w] - dst[h * stride_ + w]; } } -#endif // CONFIG_VP9_HIGHBITDEPTH } ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, dst.TopLeftPixel())); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); -#endif // CONFIG_VP9_HIGHBITDEPTH - } + ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst_)); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { int diff; -#if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth_ != 8) { - diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - - src16.TopLeftPixel()[h * src16.stride() + w]; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; } else { -#endif // CONFIG_VP9_HIGHBITDEPTH - diff = dst.TopLeftPixel()[h * dst.stride() + w] - - src.TopLeftPixel()[h * src.stride() + w]; -#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; } -#endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; if (max_error < error) max_error = error; total_error += error; @@ -211,14 +257,18 @@ class TransTestBase { } EXPECT_GE(static_cast(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; + << "Error: " << size_ << "x" << size_ + << " transform/inverse transform has an individual round trip error > " + << limit; EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " per block"; + << "Error: " << size_ << "x" << size_ + << " 
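RunAccuracyCheck() no longer duplicates its whole loop body under CONFIG_VP9_HIGHBITDEPTH: storage is uniformly uint8_t*, and when pixel_size_ is 2 the same bytes are viewed as uint16_t samples through a reinterpret_cast. A standalone sketch of that access pattern:

    #include <cstdint>
    #include <cstdio>

    // src/dst are always allocated as bytes; for 2-byte pixels the same
    // storage is read through a uint16_t view, exactly as in the test.
    int sample_diff(const uint8_t *src, const uint8_t *dst, int pixel_size,
                    int index) {
      if (pixel_size == 1) return dst[index] - src[index];
      const uint16_t *const src16 = reinterpret_cast<const uint16_t *>(src);
      const uint16_t *const dst16 = reinterpret_cast<const uint16_t *>(dst);
      return dst16[index] - src16[index];
    }

    int main() {
      uint16_t src16[2] = { 1000, 2000 };
      uint16_t dst16[2] = { 1001, 1999 };
      std::printf("%d\n", sample_diff(reinterpret_cast<uint8_t *>(src16),
                                      reinterpret_cast<uint8_t *>(dst16),
                                      /*pixel_size=*/2, /*index=*/1));  // -1
      return 0;
    }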
transform/inverse transform has average round trip error > " + << limit << " per block"; } void RunCoeffCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 5000; Buffer input_block = @@ -248,6 +298,7 @@ class TransTestBase { } void RunMemCheck() { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 5000; Buffer input_extreme_block = @@ -265,6 +316,7 @@ class TransTestBase { } else if (i == 1) { input_extreme_block.Set(-max_pixel_value_); } else { + ASSERT_TRUE(input_extreme_block.TopLeftPixel() != NULL); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { input_extreme_block @@ -279,13 +331,14 @@ class TransTestBase { // The minimum quant value is 4. EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + ASSERT_TRUE(output_block.TopLeftPixel() != NULL); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { EXPECT_GE( 4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) - << "Error: 4x4 FDCT has coefficient larger than " - "4*DCT_MAX_VALUE" + << "Error: " << size_ << "x" << size_ + << " transform has coefficient larger than 4*DCT_MAX_VALUE" << " at " << w << "," << h; if (::testing::Test::HasFailure()) { printf("Size: %d Transform type: %d\n", size_, tx_type_); @@ -298,6 +351,7 @@ class TransTestBase { } void RunInvAccuracyCheck(int limit) { + if (pixel_size_ == 1 && bit_depth_ > VPX_BITS_8) return; ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = 1000; Buffer in = Buffer(size_, size_, 4); @@ -314,100 +368,85 @@ class TransTestBase { ASSERT_TRUE(src16.Init()); for (int i = 0; i < count_test_block; ++i) { + InitMem(); + ASSERT_TRUE(in.TopLeftPixel() != NULL); // Initialize a test block with input range [-max_pixel_value_, // max_pixel_value_]. 
- if (bit_depth_ == VPX_BITS_8) { - src.Set(&rnd, &ACMRandom::Rand8); - dst.Set(&rnd, &ACMRandom::Rand8); - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + if (pixel_size_ == 1) { in.TopLeftPixel()[h * in.stride() + w] = - src.TopLeftPixel()[h * src.stride() + w] - - dst.TopLeftPixel()[h * dst.stride() + w]; - } - } -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16.Set(&rnd, 0, max_pixel_value_); - dst16.Set(&rnd, 0, max_pixel_value_); - for (int h = 0; h < size_; ++h) { - for (int w = 0; w < size_; ++w) { + src_[h * stride_ + w] - dst_[h * stride_ + w]; + } else { + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); in.TopLeftPixel()[h * in.stride() + w] = - src16.TopLeftPixel()[h * src16.stride() + w] - - dst16.TopLeftPixel()[h * dst16.stride() + w]; + src[h * stride_ + w] - dst[h * stride_ + w]; } } -#endif // CONFIG_VP9_HIGHBITDEPTH } fwd_txfm_ref(in, &coeff, size_, tx_type_); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel())); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); -#endif // CONFIG_VP9_HIGHBITDEPTH - } + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst_)); for (int h = 0; h < size_; ++h) { for (int w = 0; w < size_; ++w) { int diff; -#if CONFIG_VP9_HIGHBITDEPTH - if (bit_depth_ != 8) { - diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - - src16.TopLeftPixel()[h * src16.stride() + w]; + if (pixel_size_ == 1) { + diff = dst_[h * stride_ + w] - src_[h * stride_ + w]; } else { -#endif // CONFIG_VP9_HIGHBITDEPTH - diff = dst.TopLeftPixel()[h * dst.stride() + w] - - src.TopLeftPixel()[h * src.stride() + w]; -#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(pixel_size_, 2); + const uint16_t *const src = reinterpret_cast(src_); + const uint16_t *const dst = reinterpret_cast(dst_); + diff = dst[h * stride_ + w] - src[h * stride_ + w]; } -#endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; EXPECT_GE(static_cast(limit), error) - << "Error: " << size_ << "x" << size_ << " IDCT has error " - << error << " at " << w << "," << h; + << "Error: " << size_ << "x" << size_ + << " inverse transform has error " << error << " at " << w << "," + << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + return; + } } } } } + FhtFunc fwd_txfm_; FhtFuncRef fwd_txfm_ref; + IhtWithBdFunc inv_txfm_; + ACMRandom rnd_; + uint8_t *src_; + uint8_t *dst_; vpx_bit_depth_t bit_depth_; int tx_type_; int max_pixel_value_; int size_; + int stride_; + int pixel_size_; + int block_size_; }; -class TransDCT : public TransTestBase, - public ::testing::TestWithParam { - public: - TransDCT() { - fwd_txfm_ref = fdct_ref; - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - size_ = GET_PARAM(2); - tx_type_ = GET_PARAM(3); - bit_depth_ = GET_PARAM(4); - max_pixel_value_ = (1 << bit_depth_) - 1; - } +/* -------------------------------------------------------------------------- */ - protected: - void RunFwdTxfm(const Buffer &in, Buffer *out) { - fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); - } - - void RunInvTxfm(const Buffer &in, uint8_t *out) { - inv_txfm_(in.TopLeftPixel(), out, in.stride()); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; +class TransDCT : public TransTestBase { + public: + TransDCT() { fwd_txfm_ref = fdct_ref; } }; 
-TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); } +TEST_P(TransDCT, AccuracyCheck) { + int t = 1; + if (size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 2; + } else if (size_ == 32 && bit_depth_ > 10 && pixel_size_ == 2) { + t = 7; + } + RunAccuracyCheck(t); +} TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } @@ -415,177 +454,150 @@ TEST_P(TransDCT, MemCheck) { RunMemCheck(); } TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } +static const FuncInfo dct_c_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 }, + { &fdct_wrapper, &idct_wrapper, 8, 1 }, + { &fdct_wrapper, &idct_wrapper, 16, + 1 }, + { &fdct_wrapper, &idct_wrapper, 32, + 1 } +}; + INSTANTIATE_TEST_CASE_P( C, TransDCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_10), - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_10), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_10), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, TransDCT, - ::testing::Values( - make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_c_func_info) / + sizeof(dct_c_func_info[0]))), + ::testing::Values(dct_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); -#if HAVE_SSE2 #if !CONFIG_EMULATE_HARDWARE + +#if HAVE_SSE2 +static const FuncInfo dct_sse2_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12), -*/ -INSTANTIATE_TEST_CASE_P( - SSE2, TransDCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0, - VPX_BITS_10), - make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0, - VPX_BITS_10), - make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12), - make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0, - VPX_BITS_8), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10), - 
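The new instantiation style above parameterizes each test on an index into a static FuncInfo table plus the table pointer itself, and lets ::testing::Combine cross those with tx_type and bit depth; invalid pairings (a 1-byte-pixel kernel at 10 or 12 bits) are filtered by the early returns inside the test body rather than by the parameter generator. A self-contained sketch using only stock googletest APIs; the gtest/gtest.h include path and linking against gtest_main are assumptions, and the sizeof-based Range bound is the same idiom the patch uses, so growing the table widens the range automatically:

    #include <tuple>

    #include "gtest/gtest.h"

    struct FuncInfoSketch {
      int (*fn)(int);
      int size;
    };

    int twice(int x) { return 2 * x; }
    int thrice(int x) { return 3 * x; }

    static const FuncInfoSketch kTable[] = { { &twice, 4 }, { &thrice, 8 } };

    class TableTest : public ::testing::TestWithParam<
                          std::tuple<int, const FuncInfoSketch *, int> > {};

    TEST_P(TableTest, Dispatch) {
      const int idx = std::get<0>(GetParam());
      const FuncInfoSketch &info = std::get<1>(GetParam())[idx];
      EXPECT_EQ((idx + 2) * info.size, info.fn(info.size));
    }

    INSTANTIATE_TEST_CASE_P(
        C, TableTest,
        ::testing::Combine(
            ::testing::Range(0, static_cast<int>(sizeof(kTable) /
                                                 sizeof(kTable[0]))),
            ::testing::Values(kTable), ::testing::Values(0)));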
make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0, - VPX_BITS_8))); -#else + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 8, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 16, 2 }, + { &fdct_wrapper, + &highbd_idct_wrapper, 32, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; + INSTANTIATE_TEST_CASE_P( SSE2, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, - &vpx_idct32x32_1024_add_sse2, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_sse2, - &vpx_idct16x16_256_add_sse2, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, - 0, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // !CONFIG_EMULATE_HARDWARE + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(dct_sse2_func_info) / + sizeof(dct_sse2_func_info[0]))), + ::testing::Values(dct_sse2_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); #endif // HAVE_SSE2 -#if !CONFIG_VP9_HIGHBITDEPTH -#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE -#if !ARCH_X86_64 -// TODO(johannkoenig): high bit depth fdct8x8. -INSTANTIATE_TEST_CASE_P( - SSSE3, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, - 32, 0, VPX_BITS_8), - make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, - VPX_BITS_8))); -#else +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 // vpx_fdct8x8_ssse3 is only available in 64 bit builds. -INSTANTIATE_TEST_CASE_P( - SSSE3, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, - 32, 0, VPX_BITS_8), - make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, - 8, 0, VPX_BITS_8))); -#endif // !ARCH_X86_64 -#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE -#endif // !CONFIG_VP9_HIGHBITDEPTH - -#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE -// TODO(johannkoenig): high bit depth fdct32x32. -INSTANTIATE_TEST_CASE_P( - AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, - &vpx_idct32x32_1024_add_sse2, - 32, 0, VPX_BITS_8))); +static const FuncInfo dct_ssse3_func_info = { + &fdct_wrapper, &idct_wrapper, 8, 1 +}; + +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_CASE_P(SSSE3, TransDCT, + ::testing::Values(make_tuple(0, &dct_ssse3_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 + +#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_avx2_func_info = { + &fdct_wrapper, &idct_wrapper, + 32, 1 +}; -#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE +// TODO(johannkoenig): high bit depth fdct32x32. 
+INSTANTIATE_TEST_CASE_P(AVX2, TransDCT, + ::testing::Values(make_tuple(0, &dct_avx2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON -#if !CONFIG_EMULATE_HARDWARE +static const FuncInfo dct_neon_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, + &idct_wrapper, 16, 1 }, + { &fdct_wrapper, + &idct_wrapper, 32, 1 } +}; + INSTANTIATE_TEST_CASE_P( NEON, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct32x32_neon, - &vpx_idct32x32_1024_add_neon, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_neon, - &vpx_idct16x16_256_add_neon, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4, - 0, VPX_BITS_8))); -#endif // !CONFIG_EMULATE_HARDWARE + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_neon_func_info), + ::testing::Values(0), ::testing::Values(VPX_BITS_8))); #endif // HAVE_NEON -#if HAVE_MSA -#if !CONFIG_VP9_HIGHBITDEPTH -#if !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - MSA, TransDCT, - ::testing::Values( - make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0, - VPX_BITS_8), - make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8), - make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0, - VPX_BITS_8))); -#endif // !CONFIG_EMULATE_HARDWARE -#endif // !CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_MSA +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_msa_func_info[4] = { + { &fdct_wrapper, &idct_wrapper, 4, + 1 }, + { &fdct_wrapper, &idct_wrapper, 8, + 1 }, + { &fdct_wrapper, &idct_wrapper, + 16, 1 }, + { &fdct_wrapper, &idct_wrapper, + 32, 1 } +}; -#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(VSX, TransDCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_vsx, 4, - 0, VPX_BITS_8))); -#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(MSA, TransDCT, + ::testing::Combine(::testing::Range(0, 4), + ::testing::Values(dct_msa_func_info), + ::testing::Values(0), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH -class TransHT : public TransTestBase, public ::testing::TestWithParam { - public: - TransHT() { - fwd_txfm_ref = fht_ref; - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - size_ = GET_PARAM(2); - tx_type_ = GET_PARAM(3); - bit_depth_ = GET_PARAM(4); - max_pixel_value_ = (1 << bit_depth_) - 1; - } +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo dct_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; - protected: - void RunFwdTxfm(const Buffer &in, Buffer *out) { - fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); - } +INSTANTIATE_TEST_CASE_P(VSX, TransDCT, + ::testing::Values(make_tuple(0, &dct_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && - void RunInvTxfm(const Buffer &in, uint8_t *out) { - inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_); - } +#endif // !CONFIG_EMULATE_HARDWARE - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; +/* -------------------------------------------------------------------------- */ + +class TransHT : public TransTestBase { + public: + TransHT() { fwd_txfm_ref = fht_ref; } }; -TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } 
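The per-ISA DCT instantiations above also consolidate the CONFIG_EMULATE_HARDWARE handling: instead of each SIMD section repeating !CONFIG_EMULATE_HARDWARE in its own guard, one outer block excludes all of them when hardware emulation is configured, and each inner section only checks its own HAVE_xxx and bit-depth conditions. The resulting shape:

    #if !CONFIG_EMULATE_HARDWARE

    #if HAVE_SSE2
    /* SSE2 instantiations: only HAVE_SSE2 and bit-depth conditions here. */
    #endif  // HAVE_SSE2

    #if HAVE_NEON
    /* NEON instantiations. */
    #endif  // HAVE_NEON

    #endif  // !CONFIG_EMULATE_HARDWARE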
+TEST_P(TransHT, AccuracyCheck) { + RunAccuracyCheck(size_ == 16 && bit_depth_ > 10 && pixel_size_ == 2 ? 2 : 1); +} TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } @@ -593,117 +605,109 @@ TEST_P(TransHT, MemCheck) { RunMemCheck(); } TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } -/* TODO:(johannkoenig) Determine why these fail AccuracyCheck - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12), - */ +static const FuncInfo ht_c_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper, + 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + INSTANTIATE_TEST_CASE_P( C, TransHT, - ::testing::Values( - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); -#else + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_c_func_info) / + sizeof(ht_c_func_info[0]))), + ::testing::Values(ht_c_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + 
+#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON + +static const FuncInfo ht_neon_func_info[] = { +#if CONFIG_VP9_HIGHBITDEPTH + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, + 2 }, + { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, + 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 }, +#endif + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; + INSTANTIATE_TEST_CASE_P( - C, TransHT, - ::testing::Values( - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), - make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), - - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), - make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), - - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH + NEON, TransHT, + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(ht_neon_func_info) / + sizeof(ht_neon_func_info[0]))), + ::testing::Values(ht_neon_func_info), ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); +#endif // HAVE_NEON #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P( - SSE2, TransHT, - ::testing::Values( - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0, - VPX_BITS_8), - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1, - VPX_BITS_8), - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2, - VPX_BITS_8), - make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3, - VPX_BITS_8), - - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8), - make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8), - - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3, - VPX_BITS_8))); + +static const FuncInfo ht_sse2_func_info[3] = { + { &vp9_fht4x4_sse2, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_sse2, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_sse2, &iht_wrapper, 16, 1 } +}; + +INSTANTIATE_TEST_CASE_P(SSE2, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse2_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); #endif // HAVE_SSE2 -class TransWHT : public TransTestBase, - public ::testing::TestWithParam { - public: - TransWHT() { - fwd_txfm_ref = fwht_ref; - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - size_ = GET_PARAM(2); - tx_type_ = GET_PARAM(3); - bit_depth_ = GET_PARAM(4); - max_pixel_value_ = (1 << bit_depth_) - 1; - } +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_sse4_1_func_info[3] = { + { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, + 4, 2 }, + { 
vp9_highbd_fht8x8_c, &highbd_iht_wrapper, + 8, 2 }, + { &vp9_highbd_fht16x16_c, + &highbd_iht_wrapper, 16, 2 } +}; - protected: - void RunFwdTxfm(const Buffer &in, Buffer *out) { - fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); - } +INSTANTIATE_TEST_CASE_P( + SSE4_1, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_sse4_1_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, + VPX_BITS_12))); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo ht_vsx_func_info[3] = { + { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, + { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, + { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } +}; - void RunInvTxfm(const Buffer &in, uint8_t *out) { - inv_txfm_(in.TopLeftPixel(), out, in.stride()); - } +INSTANTIATE_TEST_CASE_P(VSX, TransHT, + ::testing::Combine(::testing::Range(0, 3), + ::testing::Values(ht_vsx_func_info), + ::testing::Range(0, 4), + ::testing::Values(VPX_BITS_8))); +#endif // HAVE_VSX +#endif // !CONFIG_EMULATE_HARDWARE + +/* -------------------------------------------------------------------------- */ - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; +class TransWHT : public TransTestBase { + public: + TransWHT() { fwd_txfm_ref = fwht_ref; } }; TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } @@ -714,24 +718,39 @@ TEST_P(TransWHT, MemCheck) { RunMemCheck(); } TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } +static const FuncInfo wht_c_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH + { &fdct_wrapper, + &highbd_idct_wrapper, 4, 2 }, +#endif + { &fdct_wrapper, &idct_wrapper, 4, 1 } +}; + INSTANTIATE_TEST_CASE_P( C, TransWHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, TransWHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 4, - 0, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH + ::testing::Combine( + ::testing::Range(0, static_cast(sizeof(wht_c_func_info) / + sizeof(wht_c_func_info[0]))), + ::testing::Values(wht_c_func_info), ::testing::Values(0), + ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12))); + +#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +static const FuncInfo wht_sse2_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; -#if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_sse2, - &vpx_iwht4x4_16_add_sse2, - 4, 0, VPX_BITS_8))); -#endif // HAVE_SSE2 + ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper, &idct_wrapper, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libs/libvpx/test/decode_api_test.cc b/libs/libvpx/test/decode_api_test.cc index 4167cf3e0f0..d4b67ccdb8e 100644 --- a/libs/libvpx/test/decode_api_test.cc +++ b/libs/libvpx/test/decode_api_test.cc @@ -138,8 +138,30 @@ TEST(DecodeAPI, Vp9InvalidDecode) { EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); } -TEST(DecodeAPI, Vp9PeekSI) { +void TestPeekInfo(const uint8_t *const data, 
uint32_t data_sz, + uint32_t peek_size) { const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get + // to decoder_peek_si_internal on frames of size < 8. + if (data_sz >= 8) { + vpx_codec_ctx_t dec; + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0)); + EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM + : VPX_CODEC_CORRUPT_FRAME, + vpx_codec_decode(&dec, data, data_sz, NULL, 0)); + vpx_codec_iter_t iter = NULL; + EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter)); + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); + } + + // Verify behavior of vpx_codec_peek_stream_info. + vpx_codec_stream_info_t si; + si.sz = sizeof(si); + EXPECT_EQ((data_sz < peek_size) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK, + vpx_codec_peek_stream_info(codec, data, data_sz, &si)); +} + +TEST(DecodeAPI, Vp9PeekStreamInfo) { // The first 9 bytes are valid and the rest of the bytes are made up. Until // size 10, this should return VPX_CODEC_UNSUP_BITSTREAM and after that it // should return VPX_CODEC_CORRUPT_FRAME. @@ -150,24 +172,18 @@ TEST(DecodeAPI, Vp9PeekSI) { }; for (uint32_t data_sz = 1; data_sz <= 32; ++data_sz) { - // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get - // to decoder_peek_si_internal on frames of size < 8. - if (data_sz >= 8) { - vpx_codec_ctx_t dec; - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_dec_init(&dec, codec, NULL, 0)); - EXPECT_EQ( - (data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_CORRUPT_FRAME, - vpx_codec_decode(&dec, data, data_sz, NULL, 0)); - vpx_codec_iter_t iter = NULL; - EXPECT_EQ(NULL, vpx_codec_get_frame(&dec, &iter)); - EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&dec)); - } + TestPeekInfo(data, data_sz, 10); + } +} + +TEST(DecodeAPI, Vp9PeekStreamInfoTruncated) { + // This profile 1 header requires 10.25 bytes, ensure + // vpx_codec_peek_stream_info doesn't over read. + const uint8_t profile1_data[10] = { 0xa4, 0xe9, 0x30, 0x68, 0x53, + 0xe9, 0x30, 0x68, 0x53, 0x04 }; - // Verify behavior of vpx_codec_peek_stream_info. - vpx_codec_stream_info_t si; - si.sz = sizeof(si); - EXPECT_EQ((data_sz < 10) ? VPX_CODEC_UNSUP_BITSTREAM : VPX_CODEC_OK, - vpx_codec_peek_stream_info(codec, data, data_sz, &si)); + for (uint32_t data_sz = 1; data_sz <= 10; ++data_sz) { + TestPeekInfo(profile1_data, data_sz, 11); } } #endif // CONFIG_VP9_DECODER diff --git a/libs/libvpx/test/decode_corrupted.cc b/libs/libvpx/test/decode_corrupted.cc new file mode 100644 index 00000000000..b1495ce89ff --- /dev/null +++ b/libs/libvpx/test/decode_corrupted.cc @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/i420_video_source.h" +#include "vpx_mem/vpx_mem.h" + +namespace { + +class DecodeCorruptedFrameTest + : public ::libvpx_test::EncoderTest, + public ::testing::TestWithParam< + std::tuple > { + public: + DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} + + protected: + virtual ~DecodeCorruptedFrameTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_buf_sz = 1000; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 600; + + // Set small key frame distance such that we insert more key frames. + cfg_.kf_max_dist = 3; + dec_cfg_.threads = 1; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7); + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) {} + + virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) { + // Don't edit frame packet on key frame. + if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt; + if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; + + memcpy(&modified_pkt_, pkt, sizeof(*pkt)); + + // Halve the size so it's corrupted to decoder. + modified_pkt_.data.frame.sz = modified_pkt_.data.frame.sz / 2; + + return &modified_pkt_; + } + + virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) { + EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError(); + return VPX_CODEC_MEM_ERROR != res_dec; + } + + vpx_codec_cx_pkt_t modified_pkt_; +}; + +TEST_P(DecodeCorruptedFrameTest, DecodeCorruptedFrame) { + cfg_.rc_target_bitrate = 200; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +#if CONFIG_VP9 +INSTANTIATE_TEST_CASE_P( + VP9, DecodeCorruptedFrameTest, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); +#endif // CONFIG_VP9 + +#if CONFIG_VP8 +INSTANTIATE_TEST_CASE_P( + VP8, DecodeCorruptedFrameTest, + ::testing::Values( + static_cast(&libvpx_test::kVP8))); +#endif // CONFIG_VP8 + +} // namespace diff --git a/libs/libvpx/test/decode_perf_test.cc b/libs/libvpx/test/decode_perf_test.cc index ee26c3c0464..aecdd3e9993 100644 --- a/libs/libvpx/test/decode_perf_test.cc +++ b/libs/libvpx/test/decode_perf_test.cc @@ -9,6 +9,8 @@ */ #include +#include + #include "test/codec_factory.h" #include "test/decode_test_driver.h" #include "test/encode_test_driver.h" @@ -21,7 +23,7 @@ #include "./ivfenc.h" #include "./vpx_version.h" -using std::tr1::make_tuple; +using std::make_tuple; namespace { @@ -34,7 +36,7 @@ const char kNewEncodeOutputFile[] = "new_encode.ivf"; /* DecodePerfTest takes a tuple of filename + number of threads to decode with */ -typedef std::tr1::tuple DecodePerfParam; +typedef std::tuple DecodePerfParam; const DecodePerfParam kVP9DecodePerfVectors[] = { make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1), @@ -137,7 +139,7 @@ class VP9NewEncodeDecodePerfTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { 
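The corruption strategy in decode_corrupted.cc above is worth spelling out: MutateEncoderOutputHook() copies each compressed-frame packet and halves its size, but leaves keyframes intact, so the decoder keeps seeing recovery points (kf_max_dist is set to 3) while every inter frame arrives truncated; the test then only requires that decoding never fails with VPX_CODEC_MEM_ERROR. A standalone sketch of the mutation, with FramePkt as a stand-in for vpx_codec_cx_pkt_t:

    #include <cstddef>
    #include <cstring>

    // Stand-in for vpx_codec_cx_pkt_t's frame payload fields.
    struct FramePkt {
      const void *buf;
      size_t sz;
      bool is_key;
    };

    FramePkt corrupt_copy(const FramePkt &pkt) {
      FramePkt modified;
      std::memcpy(&modified, &pkt, sizeof(pkt));
      // Keyframes pass through untouched so decoding can resynchronize.
      if (!pkt.is_key) modified.sz /= 2;  // truncated payload reads as corrupt
      return modified;
    }

    int main() {
      const unsigned char payload[8] = { 0 };
      const FramePkt pkt = { payload, sizeof(payload), false };
      return corrupt_copy(pkt).sz == 4 ? 0 : 1;
    }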
      encoder->Control(VP8E_SET_CPUUSED, speed_);
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
diff --git a/libs/libvpx/test/decode_svc_test.cc b/libs/libvpx/test/decode_svc_test.cc
index 69f62f13bde..c6f0873f897 100644
--- a/libs/libvpx/test/decode_svc_test.cc
+++ b/libs/libvpx/test/decode_svc_test.cc
@@ -8,6 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
 #include <string>
 
 #include "test/codec_factory.h"
@@ -53,7 +54,7 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest,
 // number of frames decoded. This results in 1/4x1/4 resolution (320x180).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
@@ -70,7 +71,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer0) {
 // number of frames decoded. This results in 1/2x1/2 resolution (640x360).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
@@ -87,7 +88,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer1) {
 // number of frames decoded. This results in the full resolution (1280x720).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
@@ -105,7 +106,7 @@ TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer2) {
 // the decoding should result in the full resolution (1280x720).
 TEST_P(DecodeSvcTest, DecodeSvcTestUpToSpatialLayer10) {
   const std::string filename = GET_PARAM(1);
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   video.reset(new libvpx_test::IVFVideoSource(filename));
   ASSERT_TRUE(video.get() != NULL);
   video->Init();
diff --git a/libs/libvpx/test/decode_test_driver.cc b/libs/libvpx/test/decode_test_driver.cc
index 48680eb8e97..ae23587759d 100644
--- a/libs/libvpx/test/decode_test_driver.cc
+++ b/libs/libvpx/test/decode_test_driver.cc
@@ -52,9 +52,10 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder,
     /* Vp8's implementation of PeekStream returns an error if the frame you
      * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first
      * frame, which must be a keyframe. */
-    if (video->frame_number() == 0)
+    if (video->frame_number() == 0) {
       ASSERT_EQ(VPX_CODEC_OK, res_peek)
           << "Peek return failed: " << vpx_codec_err_to_string(res_peek);
+    }
   } else {
     /* The Vp9 implementation of PeekStream returns an error only if the
      * data passed to it isn't a valid Vp9 chunk. */
@@ -97,7 +98,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video,
       const vpx_image_t *img = NULL;
 
       // Get decompressed data
-      while ((img = dec_iter.Next())) {
+      while (!::testing::Test::HasFailure() && (img = dec_iter.Next())) {
         DecompressedFrameHook(*img, video->frame_number());
       }
     }
diff --git a/libs/libvpx/test/decode_test_driver.h b/libs/libvpx/test/decode_test_driver.h
index 644fc9e90dc..04876cdd7cb 100644
--- a/libs/libvpx/test/decode_test_driver.h
+++ b/libs/libvpx/test/decode_test_driver.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
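testing::internal::scoped_ptr, used here until now, was googletest's private pre-C++11 substitute for a unique-ownership smart pointer; with C++11 available, the patch switches to std::unique_ptr from <memory>, which supports the same reset()/get() usage. A minimal sketch with stand-in types:

    #include <cstdio>
    #include <memory>

    struct VideoSourceSketch {
      virtual ~VideoSourceSketch() {}
      virtual const char *name() const { return "base"; }
    };

    struct IvfSourceSketch : VideoSourceSketch {
      const char *name() const { return "ivf"; }
    };

    int main() {
      std::unique_ptr<VideoSourceSketch> video;  // was scoped_ptr
      video.reset(new IvfSourceSketch());
      std::printf("%s\n", video->name());  // prints "ivf"
      return 0;
    }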
*/ -#ifndef TEST_DECODE_TEST_DRIVER_H_ -#define TEST_DECODE_TEST_DRIVER_H_ +#ifndef VPX_TEST_DECODE_TEST_DRIVER_H_ +#define VPX_TEST_DECODE_TEST_DRIVER_H_ #include #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" @@ -159,4 +159,4 @@ class DecoderTest { } // namespace libvpx_test -#endif // TEST_DECODE_TEST_DRIVER_H_ +#endif // VPX_TEST_DECODE_TEST_DRIVER_H_ diff --git a/libs/libvpx/test/encode_perf_test.cc b/libs/libvpx/test/encode_perf_test.cc index 0bb435502b3..142d9e2da8e 100644 --- a/libs/libvpx/test/encode_perf_test.cc +++ b/libs/libvpx/test/encode_perf_test.cc @@ -48,7 +48,7 @@ const EncodePerfTestVideo kVP9EncodePerfTestVectors[] = { EncodePerfTestVideo("niklas_1280_720_30.yuv", 1280, 720, 600, 470), }; -const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8 }; +const int kEncodePerfTestSpeeds[] = { 5, 6, 7, 8, 9 }; const int kEncodePerfTestThreads[] = { 1, 2, 4 }; #define NELEMENTS(x) (sizeof((x)) / sizeof((x)[0])) diff --git a/libs/libvpx/test/encode_test_driver.cc b/libs/libvpx/test/encode_test_driver.cc index b2cbc3f05bd..8fdbdb62ae7 100644 --- a/libs/libvpx/test/encode_test_driver.cc +++ b/libs/libvpx/test/encode_test_driver.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -128,6 +129,8 @@ static bool compare_img(const vpx_image_t *img1, const vpx_image_t *img2) { bool match = (img1->fmt == img2->fmt) && (img1->cs == img2->cs) && (img1->d_w == img2->d_w) && (img1->d_h == img2->d_h); + if (!match) return false; + const unsigned int width_y = img1->d_w; const unsigned int height_y = img1->d_h; unsigned int i; @@ -177,7 +180,7 @@ void EncoderTest::RunLoop(VideoSource *video) { } BeginPassHook(pass); - testing::internal::scoped_ptr encoder( + std::unique_ptr encoder( codec_->CreateEncoder(cfg_, deadline_, init_flags_, &stats_)); ASSERT_TRUE(encoder.get() != NULL); @@ -191,7 +194,7 @@ void EncoderTest::RunLoop(VideoSource *video) { if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION) { dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS; } - testing::internal::scoped_ptr decoder( + std::unique_ptr decoder( codec_->CreateDecoder(dec_cfg, dec_init_flags)); bool again; for (again = true; again; video->Next()) { @@ -214,6 +217,7 @@ void EncoderTest::RunLoop(VideoSource *video) { case VPX_CODEC_CX_FRAME_PKT: has_cxdata = true; if (decoder.get() != NULL && DoDecode()) { + PreDecodeFrameHook(video, decoder.get()); vpx_codec_err_t res_dec = decoder->DecodeFrame( (const uint8_t *)pkt->data.frame.buf, pkt->data.frame.sz); diff --git a/libs/libvpx/test/encode_test_driver.h b/libs/libvpx/test/encode_test_driver.h index 89a3b1767e2..3edba4b926a 100644 --- a/libs/libvpx/test/encode_test_driver.h +++ b/libs/libvpx/test/encode_test_driver.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
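The compare_img() change above is a small but real bug fix: the function previously fell through to the per-plane comparison loops even when formats or dimensions differed, indexing one image with the other's geometry; returning early on mismatch avoids that out-of-bounds access. The shape of the fix, with ImgSketch as a stand-in for the handful of vpx_image_t fields involved:

    #include <cstdint>

    struct ImgSketch {
      unsigned int w, h;
      const uint8_t *plane;
      int stride;
    };

    bool images_match(const ImgSketch &a, const ImgSketch &b) {
      const bool match = (a.w == b.w) && (a.h == b.h);
      if (!match) return false;  // the loops below assume identical geometry
      for (unsigned int y = 0; y < a.h; ++y) {
        for (unsigned int x = 0; x < a.w; ++x) {
          if (a.plane[y * a.stride + x] != b.plane[y * b.stride + x])
            return false;
        }
      }
      return true;
    }

    int main() {
      const uint8_t p[4] = { 1, 2, 3, 4 };
      const ImgSketch a = { 2, 2, p, 2 };
      const ImgSketch b = { 2, 1, p, 2 };
      return images_match(a, b) ? 1 : 0;  // geometry differs: no plane reads
    }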
*/ -#ifndef TEST_ENCODE_TEST_DRIVER_H_ -#define TEST_ENCODE_TEST_DRIVER_H_ +#ifndef VPX_TEST_ENCODE_TEST_DRIVER_H_ +#define VPX_TEST_ENCODE_TEST_DRIVER_H_ #include #include @@ -128,24 +128,37 @@ class Encoder { ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + void Control(int ctrl_id, struct vpx_svc_ref_frame_config *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + void Control(int ctrl_id, struct vpx_svc_parameters *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + + void Control(int ctrl_id, struct vpx_svc_frame_drop *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + + void Control(int ctrl_id, struct vpx_svc_spatial_layer_sync *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } + #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER void Control(int ctrl_id, vpx_active_map_t *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } -#endif -#if CONFIG_VP8_ENCODER void Control(int ctrl_id, vpx_roi_map_t *arg) { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } #endif - void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -219,6 +232,9 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} + virtual void PreDecodeFrameHook(VideoSource * /*video*/, + Decoder * /*decoder*/) {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} // Hook to be called on every compressed data packet. @@ -273,4 +289,4 @@ class EncoderTest { } // namespace libvpx_test -#endif // TEST_ENCODE_TEST_DRIVER_H_ +#endif // VPX_TEST_ENCODE_TEST_DRIVER_H_ diff --git a/libs/libvpx/test/external_frame_buffer_test.cc b/libs/libvpx/test/external_frame_buffer_test.cc index dbf2971198c..438eeb3ecdf 100644 --- a/libs/libvpx/test/external_frame_buffer_test.cc +++ b/libs/libvpx/test/external_frame_buffer_test.cc @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include #include #include "./vpx_config.h" @@ -113,9 +114,9 @@ class ExternalFrameBufferList { return 0; } - // Checks that the ximage data is contained within the external frame buffer - // private data passed back in the ximage. - void CheckXImageFrameBuffer(const vpx_image_t *img) { + // Checks that the vpx_image_t data is contained within the external frame + // buffer private data passed back in the vpx_image_t. + void CheckImageFrameBuffer(const vpx_image_t *img) { if (img->fb_priv != NULL) { const struct ExternalFrameBuffer *const ext_fb = reinterpret_cast(img->fb_priv); @@ -335,14 +336,13 @@ class ExternalFrameBufferTest : public ::testing::Test { return VPX_CODEC_OK; } - protected: void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); const vpx_image_t *img = NULL; // Get decompressed data while ((img = dec_iter.Next()) != NULL) { - fb_list_.CheckXImageFrameBuffer(img); + fb_list_.CheckImageFrameBuffer(img); } } @@ -393,7 +393,7 @@ TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) { #endif // Open compressed video file. 
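The include-guard renames in this patch (TEST_..._H_ to VPX_TEST_..._H_) all follow one convention: prefix every guard with the project name, so that vendoring libvpx next to another tree whose test headers also use TEST_..._H_ cannot silently disable an include. The convention, shown on a hypothetical header:

    #ifndef VPX_TEST_SOME_HEADER_H_  // <project>_<path>_<file>_H_
    #define VPX_TEST_SOME_HEADER_H_

    /* declarations */

    #endif  // VPX_TEST_SOME_HEADER_H_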
- testing::internal::scoped_ptr video; + std::unique_ptr video; if (filename.substr(filename.length() - 3, 3) == "ivf") { video.reset(new libvpx_test::IVFVideoSource(filename)); } else { diff --git a/libs/libvpx/test/fdct8x8_test.cc b/libs/libvpx/test/fdct8x8_test.cc index 5021dda9b38..244b9740b03 100644 --- a/libs/libvpx/test/fdct8x8_test.cc +++ b/libs/libvpx/test/fdct8x8_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -43,9 +44,9 @@ typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, int tx_type); -typedef std::tr1::tuple Dct8x8Param; -typedef std::tr1::tuple Ht8x8Param; -typedef std::tr1::tuple Idct8x8Param; +typedef std::tuple Dct8x8Param; +typedef std::tuple Ht8x8Param; +typedef std::tuple Idct8x8Param; void reference_8x8_dct_1d(const double in[8], double out[8]) { const double kInvSqrt2 = 0.707106781186547524400844362104; @@ -628,7 +629,7 @@ TEST_P(InvTrans8x8DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -using std::tr1::make_tuple; +using std::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( @@ -675,6 +676,7 @@ INSTANTIATE_TEST_CASE_P(NEON, FwdTrans8x8DCT, ::testing::Values(make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 0, VPX_BITS_8))); + #if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8HT, diff --git a/libs/libvpx/test/frame_size_tests.cc b/libs/libvpx/test/frame_size_tests.cc index 5a9b166e5b1..f66972b4a19 100644 --- a/libs/libvpx/test/frame_size_tests.cc +++ b/libs/libvpx/test/frame_size_tests.cc @@ -34,7 +34,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); diff --git a/libs/libvpx/test/hadamard_test.cc b/libs/libvpx/test/hadamard_test.cc index 3b7cfeddcfe..b194ace6747 100644 --- a/libs/libvpx/test/hadamard_test.cc +++ b/libs/libvpx/test/hadamard_test.cc @@ -25,13 +25,13 @@ using ::libvpx_test::ACMRandom; typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride, tran_low_t *b); -void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { - int16_t b[8]; +void hadamard_loop(const tran_low_t *a, tran_low_t *out) { + tran_low_t b[8]; for (int i = 0; i < 8; i += 2) { - b[i + 0] = a[i * a_stride] + a[(i + 1) * a_stride]; - b[i + 1] = a[i * a_stride] - a[(i + 1) * a_stride]; + b[i + 0] = a[i * 8] + a[(i + 1) * 8]; + b[i + 1] = a[i * 8] - a[(i + 1) * 8]; } - int16_t c[8]; + tran_low_t c[8]; for (int i = 0; i < 8; i += 4) { c[i + 0] = b[i + 0] + b[i + 2]; c[i + 1] = b[i + 1] + b[i + 3]; @@ -49,12 +49,15 @@ void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { } void reference_hadamard8x8(const int16_t *a, int a_stride, tran_low_t *b) { - int16_t buf[64]; - int16_t buf2[64]; - for (int i = 0; i < 8; ++i) hadamard_loop(a + i, a_stride, buf + i * 8); - for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, 8, buf2 + i * 8); - - for (int i = 0; i < 64; ++i) b[i] = (tran_low_t)buf2[i]; + tran_low_t input[64]; + tran_low_t buf[64]; + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + input[i * 8 + j] = static_cast(a[i * a_stride + j]); + } + } + for (int i = 0; i < 8; ++i) hadamard_loop(input + i, buf + i * 8); 
+ for (int i = 0; i < 8; ++i) hadamard_loop(buf + i, b + i * 8); } void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) { @@ -89,205 +92,229 @@ void reference_hadamard16x16(const int16_t *a, int a_stride, tran_low_t *b) { } } -class HadamardTestBase : public ::testing::TestWithParam { - public: - virtual void SetUp() { - h_func_ = GetParam(); - rnd_.Reset(ACMRandom::DeterministicSeed()); - } +void reference_hadamard32x32(const int16_t *a, int a_stride, tran_low_t *b) { + reference_hadamard16x16(a + 0 + 0 * a_stride, a_stride, b + 0); + reference_hadamard16x16(a + 16 + 0 * a_stride, a_stride, b + 256); + reference_hadamard16x16(a + 0 + 16 * a_stride, a_stride, b + 512); + reference_hadamard16x16(a + 16 + 16 * a_stride, a_stride, b + 768); - protected: - HadamardFunc h_func_; - ACMRandom rnd_; -}; + for (int i = 0; i < 256; ++i) { + const tran_low_t a0 = b[0]; + const tran_low_t a1 = b[256]; + const tran_low_t a2 = b[512]; + const tran_low_t a3 = b[768]; -void HadamardSpeedTest(const char *name, HadamardFunc const func, - const int16_t *input, int stride, tran_low_t *output, - int times) { - int i; - vpx_usec_timer timer; + const tran_low_t b0 = (a0 + a1) >> 2; + const tran_low_t b1 = (a0 - a1) >> 2; + const tran_low_t b2 = (a2 + a3) >> 2; + const tran_low_t b3 = (a2 - a3) >> 2; - vpx_usec_timer_start(&timer); - for (i = 0; i < times; ++i) { - func(input, stride, output); - } - vpx_usec_timer_mark(&timer); + b[0] = b0 + b2; + b[256] = b1 + b3; + b[512] = b0 - b2; + b[768] = b1 - b3; - const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); - printf("%s[%12d runs]: %d us\n", name, times, elapsed_time); + ++b; + } } -class Hadamard8x8Test : public HadamardTestBase {}; +struct HadamardFuncWithSize { + HadamardFuncWithSize(HadamardFunc f, int s) : func(f), block_size(s) {} + HadamardFunc func; + int block_size; +}; -void HadamardSpeedTest8x8(HadamardFunc const func, int times) { - DECLARE_ALIGNED(16, int16_t, input[64]); - DECLARE_ALIGNED(16, tran_low_t, output[64]); - memset(input, 1, sizeof(input)); - HadamardSpeedTest("Hadamard8x8", func, input, 8, output, times); +std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) { + return os << "block size: " << hfs.block_size; } -TEST_P(Hadamard8x8Test, CompareReferenceRandom) { - DECLARE_ALIGNED(16, int16_t, a[64]); - DECLARE_ALIGNED(16, tran_low_t, b[64]); - tran_low_t b_ref[64]; - for (int i = 0; i < 64; ++i) { - a[i] = rnd_.Rand9Signed(); +class HadamardTestBase : public ::testing::TestWithParam { + public: + virtual void SetUp() { + h_func_ = GetParam().func; + bwh_ = GetParam().block_size; + block_size_ = bwh_ * bwh_; + rnd_.Reset(ACMRandom::DeterministicSeed()); } - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); - - reference_hadamard8x8(a, 8, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, 8, b)); - // The order of the output is not important. Sort before checking. 
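The reference Hadamard rework above widens all intermediates from int16_t to tran_low_t (and first copies the int16_t input into a tran_low_t working array), so the same reference code can serve high-bit-depth inputs whose butterfly sums exceed the 16-bit range; it also adds reference_hadamard32x32, built from four 16x16 transforms plus one more butterfly stage with a >> 2 normalization. A compilable sketch of one widened 8-point pass follows; the final outputs are written in plain butterfly order, which differs from the in-tree permutation but is equivalent for these tests because both outputs are sorted before comparison:

    #include <cstdint>

    typedef int32_t tran_low_t;  // defined locally; wide, as in the patch

    // One 8-point pass down the columns of an 8x8 block (fixed stride of 8),
    // mirroring the b/c butterfly stages of the reworked hadamard_loop().
    void hadamard8_sketch(const tran_low_t *a, tran_low_t *out) {
      tran_low_t b[8];
      for (int i = 0; i < 8; i += 2) {
        b[i + 0] = a[i * 8] + a[(i + 1) * 8];
        b[i + 1] = a[i * 8] - a[(i + 1) * 8];
      }
      tran_low_t c[8];
      for (int i = 0; i < 8; i += 4) {
        c[i + 0] = b[i + 0] + b[i + 2];
        c[i + 1] = b[i + 1] + b[i + 3];
        c[i + 2] = b[i + 0] - b[i + 2];
        c[i + 3] = b[i + 1] - b[i + 3];
      }
      for (int i = 0; i < 4; ++i) {  // plain order; in-tree order is permuted
        out[i + 0] = c[i] + c[i + 4];
        out[i + 4] = c[i] - c[i + 4];
      }
    }

    int main() {
      tran_low_t col[64] = { 0 };
      tran_low_t out[8];
      col[0] = 255;  // a single 9-bit-range impulse
      hadamard8_sketch(col, out);
      return out[0] == 255 ? 0 : 1;  // the impulse spreads to every output
    }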
- std::sort(b, b + 64); - std::sort(b_ref, b_ref + 64); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); -} + virtual int16_t Rand() = 0; -TEST_P(Hadamard8x8Test, VaryStride) { - DECLARE_ALIGNED(16, int16_t, a[64 * 8]); - DECLARE_ALIGNED(16, tran_low_t, b[64]); - tran_low_t b_ref[64]; - for (int i = 0; i < 64 * 8; ++i) { - a[i] = rnd_.Rand9Signed(); + void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, + int bwh) { + if (bwh == 32) + reference_hadamard32x32(a, a_stride, b); + else if (bwh == 16) + reference_hadamard16x16(a, a_stride, b); + else + reference_hadamard8x8(a, a_stride, b); } - for (int i = 8; i < 64; i += 8) { + void CompareReferenceRandom() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(a, 0, sizeof(a)); memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; memset(b_ref, 0, sizeof(b_ref)); - reference_hadamard8x8(a, i, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); + for (int i = 0; i < block_size_; ++i) a[i] = Rand(); + + ReferenceHadamard(a, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(a, bwh_, b)); // The order of the output is not important. Sort before checking. - std::sort(b, b + 64); - std::sort(b_ref, b_ref + 64); + std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); } -} -TEST_P(Hadamard8x8Test, DISABLED_Speed) { - HadamardSpeedTest8x8(h_func_, 10); - HadamardSpeedTest8x8(h_func_, 10000); - HadamardSpeedTest8x8(h_func_, 10000000); + void VaryStride() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(a, 0, sizeof(a)); + for (int i = 0; i < block_size_ * 8; ++i) a[i] = Rand(); + + tran_low_t b_ref[kMaxBlockSize]; + for (int i = 8; i < 64; i += 8) { + memset(b, 0, sizeof(b)); + memset(b_ref, 0, sizeof(b_ref)); + + ReferenceHadamard(a, i, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); + + // The order of the output is not important. Sort before checking. 
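As the comment says, these checks treat the transform output as a multiset: implementations are free to emit coefficients in any order, so both buffers are sorted before the byte-wise compare. The idiom, pulled out as a self-contained helper (the name is illustrative, not part of the test):

    #include <algorithm>
    #include <cstring>

    // Order-insensitive equality for two coefficient buffers of length n.
    // Sorting canonicalizes both sides; memcmp then compares the multisets.
    template <typename T>
    bool EqualUnordered(T *actual, T *expected, int n) {
      std::sort(actual, actual + n);
      std::sort(expected, expected + n);
      return memcmp(actual, expected, n * sizeof(T)) == 0;
    }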
+ std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + + void SpeedTest(int times) { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, output[kMaxBlockSize]); + memset(input, 1, sizeof(input)); + memset(output, 0, sizeof(output)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < times; ++i) { + h_func_(input, bwh_, output); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast(vpx_usec_timer_elapsed(&timer)); + printf("Hadamard%dx%d[%12d runs]: %d us\n", bwh_, bwh_, times, + elapsed_time); + } + + protected: + int bwh_; + int block_size_; + HadamardFunc h_func_; + ACMRandom rnd_; +}; + +class HadamardLowbdTest : public HadamardTestBase { + protected: + virtual int16_t Rand() { return rnd_.Rand9Signed(); } +}; + +TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } + +TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } + +TEST_P(HadamardLowbdTest, DISABLED_Speed) { + SpeedTest(10); + SpeedTest(10000); + SpeedTest(10000000); } -INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_c)); +INSTANTIATE_TEST_CASE_P( + C, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_c, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_c, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_c, 32))); #if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_sse2)); +INSTANTIATE_TEST_CASE_P( + SSE2, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_sse2, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_sse2, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_sse2, 32))); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_16x16_avx2, 16), + HadamardFuncWithSize(&vpx_hadamard_32x32_avx2, 32))); +#endif // HAVE_AVX2 + #if HAVE_SSSE3 && ARCH_X86_64 -INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_ssse3)); +INSTANTIATE_TEST_CASE_P( + SSSE3, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_ssse3, 8))); #endif // HAVE_SSSE3 && ARCH_X86_64 #if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_neon)); +INSTANTIATE_TEST_CASE_P( + NEON, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16))); #endif // HAVE_NEON // TODO(jingning): Remove highbitdepth flag when the SIMD functions are // in place and turn on the unit test. 
#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_msa)); +INSTANTIATE_TEST_CASE_P( + MSA, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_msa, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_msa, 16))); #endif // HAVE_MSA #endif // !CONFIG_VP9_HIGHBITDEPTH #if HAVE_VSX -INSTANTIATE_TEST_CASE_P(VSX, Hadamard8x8Test, - ::testing::Values(&vpx_hadamard_8x8_vsx)); +INSTANTIATE_TEST_CASE_P( + VSX, HadamardLowbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_vsx, 8), + HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16))); #endif // HAVE_VSX -class Hadamard16x16Test : public HadamardTestBase {}; - -void HadamardSpeedTest16x16(HadamardFunc const func, int times) { - DECLARE_ALIGNED(16, int16_t, input[256]); - DECLARE_ALIGNED(16, tran_low_t, output[256]); - memset(input, 1, sizeof(input)); - HadamardSpeedTest("Hadamard16x16", func, input, 16, output, times); -} - -TEST_P(Hadamard16x16Test, CompareReferenceRandom) { - DECLARE_ALIGNED(16, int16_t, a[16 * 16]); - DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]); - tran_low_t b_ref[16 * 16]; - for (int i = 0; i < 16 * 16; ++i) { - a[i] = rnd_.Rand9Signed(); - } - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); - - reference_hadamard16x16(a, 16, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, 16, b)); - - // The order of the output is not important. Sort before checking. - std::sort(b, b + 16 * 16); - std::sort(b_ref, b_ref + 16 * 16); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); -} - -TEST_P(Hadamard16x16Test, VaryStride) { - DECLARE_ALIGNED(16, int16_t, a[16 * 16 * 8]); - DECLARE_ALIGNED(16, tran_low_t, b[16 * 16]); - tran_low_t b_ref[16 * 16]; - for (int i = 0; i < 16 * 16 * 8; ++i) { - a[i] = rnd_.Rand9Signed(); - } - - for (int i = 8; i < 64; i += 8) { - memset(b, 0, sizeof(b)); - memset(b_ref, 0, sizeof(b_ref)); +#if CONFIG_VP9_HIGHBITDEPTH +class HadamardHighbdTest : public HadamardTestBase { + protected: + virtual int16_t Rand() { return rnd_.Rand13Signed(); } +}; - reference_hadamard16x16(a, i, b_ref); - ASM_REGISTER_STATE_CHECK(h_func_(a, i, b)); +TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } - // The order of the output is not important. Sort before checking. 
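The Lowbd/Highbd split keeps one shared fixture and varies only the input generator: a b-bit pipeline produces residuals that are differences of two b-bit samples, which need b+1 signed bits, hence Rand9Signed() for 8-bit data and Rand13Signed() for 12-bit data. The range arithmetic, for reference:

    // Residual range for a b-bit pipeline: difference of two b-bit samples.
    //   b = 8  -> [-255, 255]    (9 signed bits,  Rand9Signed)
    //   b = 12 -> [-4095, 4095]  (13 signed bits, Rand13Signed)
    int16_t max_residual(int bit_depth) { return (1 << bit_depth) - 1; }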
- std::sort(b, b + 16 * 16); - std::sort(b_ref, b_ref + 16 * 16); - EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); - } -} +TEST_P(HadamardHighbdTest, VaryStride) { VaryStride(); } -TEST_P(Hadamard16x16Test, DISABLED_Speed) { - HadamardSpeedTest16x16(h_func_, 10); - HadamardSpeedTest16x16(h_func_, 10000); - HadamardSpeedTest16x16(h_func_, 10000000); +TEST_P(HadamardHighbdTest, DISABLED_Speed) { + SpeedTest(10); + SpeedTest(10000); + SpeedTest(10000000); } -INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_c)); - -#if HAVE_SSE2 -INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_sse2)); -#endif // HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + C, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_c, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_c, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_c, 32))); #if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_avx2)); +INSTANTIATE_TEST_CASE_P( + AVX2, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_avx2, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_avx2, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_avx2, + 32))); #endif // HAVE_AVX2 -#if HAVE_VSX -INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_vsx)); -#endif // HAVE_VSX - -#if HAVE_NEON -INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_neon)); -#endif // HAVE_NEON - -#if !CONFIG_VP9_HIGHBITDEPTH -#if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, Hadamard16x16Test, - ::testing::Values(&vpx_hadamard_16x16_msa)); -#endif // HAVE_MSA -#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/libs/libvpx/test/i420_video_source.h b/libs/libvpx/test/i420_video_source.h index 49573823b4f..97473b5c2f1 100644 --- a/libs/libvpx/test/i420_video_source.h +++ b/libs/libvpx/test/i420_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_I420_VIDEO_SOURCE_H_ -#define TEST_I420_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_I420_VIDEO_SOURCE_H_ +#define VPX_TEST_I420_VIDEO_SOURCE_H_ #include #include #include @@ -30,4 +30,4 @@ class I420VideoSource : public YUVVideoSource { } // namespace libvpx_test -#endif // TEST_I420_VIDEO_SOURCE_H_ +#endif // VPX_TEST_I420_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/test/idct_test.cc b/libs/libvpx/test/idct_test.cc index 3700374d7a9..3564c0bd5d1 100644 --- a/libs/libvpx/test/idct_test.cc +++ b/libs/libvpx/test/idct_test.cc @@ -72,6 +72,7 @@ TEST_P(IDCTTest, TestAllZeros) { TEST_P(IDCTTest, TestAllOnes) { input->Set(0); + ASSERT_TRUE(input->TopLeftPixel() != NULL); // When the first element is '4' it will fill the output buffer with '1'. input->TopLeftPixel()[0] = 4; predict->Set(0); @@ -89,6 +90,7 @@ TEST_P(IDCTTest, TestAddOne) { // Set the transform output to '1' and make sure it gets added to the // prediction buffer. 
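The include-guard renames here and in the headers below (TEST_FOO_H_ -> VPX_TEST_FOO_H_) follow one rule: prefix the guard with the project name derived from the full path. That keeps guards collision-free when libvpx is vendored beside other trees, and matches what path-derived guard checks (cpplint-style) expect. The resulting pattern:

    #ifndef VPX_TEST_I420_VIDEO_SOURCE_H_
    #define VPX_TEST_I420_VIDEO_SOURCE_H_

    // ... declarations ...

    #endif  // VPX_TEST_I420_VIDEO_SOURCE_H_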
input->Set(0); + ASSERT_TRUE(input->TopLeftPixel() != NULL); input->TopLeftPixel()[0] = 4; output->Set(0); @@ -174,4 +176,4 @@ INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_mmi)); #endif // HAVE_MMI -} +} // namespace diff --git a/libs/libvpx/test/invalid_file_test.cc b/libs/libvpx/test/invalid_file_test.cc index 79220b0f69e..8eed05eb494 100644 --- a/libs/libvpx/test/invalid_file_test.cc +++ b/libs/libvpx/test/invalid_file_test.cc @@ -10,6 +10,7 @@ #include #include +#include #include #include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -89,7 +90,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, const std::string filename = input.filename; // Open compressed video file. - testing::internal::scoped_ptr video; + std::unique_ptr video; if (filename.substr(filename.length() - 3, 3) == "ivf") { video.reset(new libvpx_test::IVFVideoSource(filename)); } else if (filename.substr(filename.length() - 4, 4) == "webm") { @@ -123,6 +124,8 @@ TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER const DecodeParam kVP8InvalidFileTests[] = { { 1, "invalid-bug-1443.ivf" }, + { 1, "invalid-token-partition.ivf" }, + { 1, "invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, @@ -202,6 +205,8 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = { { 2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf" }, { 4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf" }, { 2, "invalid-crbug-629481.webm" }, + { 3, "invalid-crbug-1558.ivf" }, + { 4, "invalid-crbug-1562.ivf" }, }; INSTANTIATE_TEST_CASE_P( diff --git a/libs/libvpx/test/ivf_video_source.h b/libs/libvpx/test/ivf_video_source.h index 5862d2649f8..22c05ecde9e 100644 --- a/libs/libvpx/test/ivf_video_source.h +++ b/libs/libvpx/test/ivf_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_IVF_VIDEO_SOURCE_H_ -#define TEST_IVF_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_IVF_VIDEO_SOURCE_H_ +#define VPX_TEST_IVF_VIDEO_SOURCE_H_ #include #include #include @@ -16,7 +16,7 @@ #include "test/video_source.h" namespace libvpx_test { -const unsigned int kCodeBufferSize = 256 * 1024; +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; const unsigned int kIvfFileHdrSize = 32; const unsigned int kIvfFrameHdrSize = 12; @@ -103,4 +103,4 @@ class IVFVideoSource : public CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_IVF_VIDEO_SOURCE_H_ +#endif // VPX_TEST_IVF_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/test/keyframe_test.cc b/libs/libvpx/test/keyframe_test.cc index ee75f401ca5..582d448168d 100644 --- a/libs/libvpx/test/keyframe_test.cc +++ b/libs/libvpx/test/keyframe_test.cc @@ -38,7 +38,7 @@ class KeyframeTest if (kf_do_force_kf_) { frame_flags_ = (video->frame() % 3) ? 
0 : VPX_EFLAG_FORCE_KF; } - if (set_cpu_used_ && video->frame() == 1) { + if (set_cpu_used_ && video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); } } @@ -68,7 +68,9 @@ TEST_P(KeyframeTest, TestRandomVideoSource) { // In realtime mode - auto placed keyframes are exceedingly rare, don't // bother with this check if(GetParam() > 0) - if (GET_PARAM(1) > 0) EXPECT_GT(kf_count_, 1); + if (GET_PARAM(1) > 0) { + EXPECT_GT(kf_count_, 1); + } } TEST_P(KeyframeTest, TestDisableKeyframes) { @@ -128,8 +130,9 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { // In realtime mode - auto placed keyframes are exceedingly rare, don't // bother with this check - if (GET_PARAM(1) > 0) + if (GET_PARAM(1) > 0) { EXPECT_EQ(2u, kf_pts_list_.size()) << " Not the right number of keyframes "; + } // Verify that keyframes match the file keyframes in the file. for (std::vector::const_iterator iter = kf_pts_list_.begin(); diff --git a/libs/libvpx/test/lpf_test.cc b/libs/libvpx/test/lpf_test.cc index e04b996cd86..dfdd5159920 100644 --- a/libs/libvpx/test/lpf_test.cc +++ b/libs/libvpx/test/lpf_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -56,8 +57,8 @@ typedef void (*dual_loop_op_t)(Pixel *s, int p, const uint8_t *blimit0, const uint8_t *thresh1); #endif // CONFIG_VP9_HIGHBITDEPTH -typedef std::tr1::tuple loop8_param_t; -typedef std::tr1::tuple dualloop8_param_t; +typedef std::tuple loop8_param_t; +typedef std::tuple dualloop8_param_t; void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, const int mask, const int32_t p, const int i) { @@ -402,7 +403,7 @@ TEST_P(Loop8Test9Param, ValueCheck) { << "First failed at test case " << first_failure; } -using std::tr1::make_tuple; +using std::make_tuple; #if HAVE_SSE2 #if CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/test/md5_helper.h b/libs/libvpx/test/md5_helper.h index ef310a2d900..dc28dc6283b 100644 --- a/libs/libvpx/test/md5_helper.h +++ b/libs/libvpx/test/md5_helper.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
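keyframe_test.cc above gets the same off-by-one fix as frame_size_tests.cc earlier and superframe_test.cc further down: the test video sources count frames from zero, so one-time encoder controls belong on frame() == 0; gating them on frame() == 1 left the very first frame encoded with default settings. The corrected hook shape:

    virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                    ::libvpx_test::Encoder *encoder) {
      // Frame numbering is zero-based: configure before the first frame.
      if (video->frame() == 0) {
        encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
      }
    }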
*/ -#ifndef TEST_MD5_HELPER_H_ -#define TEST_MD5_HELPER_H_ +#ifndef VPX_TEST_MD5_HELPER_H_ +#define VPX_TEST_MD5_HELPER_H_ #include "./md5_utils.h" #include "vpx/vpx_decoder.h" @@ -72,4 +72,4 @@ class MD5 { } // namespace libvpx_test -#endif // TEST_MD5_HELPER_H_ +#endif // VPX_TEST_MD5_HELPER_H_ diff --git a/libs/libvpx/test/partial_idct_test.cc b/libs/libvpx/test/partial_idct_test.cc index f7b50f53a11..e66a695eb0d 100644 --- a/libs/libvpx/test/partial_idct_test.cc +++ b/libs/libvpx/test/partial_idct_test.cc @@ -11,8 +11,8 @@ #include #include #include - #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -51,8 +51,8 @@ void highbd_wrapper(const tran_low_t *in, uint8_t *out, int stride, int bd) { } #endif -typedef std::tr1::tuple +typedef std::tuple PartialInvTxfmParam; const int kMaxNumCoeffs = 1024; const int kCountTestBlock = 1000; @@ -324,7 +324,7 @@ TEST_P(PartialIDctTest, DISABLED_Speed) { << "Error: partial inverse transform produces different results"; } -using std::tr1::make_tuple; +using std::make_tuple; const PartialInvTxfmParam c_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/test/pp_filter_test.cc b/libs/libvpx/test/pp_filter_test.cc index 5a2ade1ef4f..1ed261bf9b6 100644 --- a/libs/libvpx/test/pp_filter_test.cc +++ b/libs/libvpx/test/pp_filter_test.cc @@ -11,6 +11,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" @@ -32,7 +33,6 @@ typedef void (*VpxMbPostProcDownFunc)(unsigned char *dst, int pitch, int rows, int cols, int flimit); namespace { - // Compute the filter level used in post proc from the loop filter strength int q2mbl(int x) { if (x < 20) x = 20; @@ -42,33 +42,52 @@ int q2mbl(int x) { } class VpxPostProcDownAndAcrossMbRowTest - : public ::testing::TestWithParam { + : public AbstractBench, + public ::testing::TestWithParam { public: + VpxPostProcDownAndAcrossMbRowTest() + : mb_post_proc_down_and_across_(GetParam()) {} virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + virtual void Run(); + + const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_; + // Size of the underlying data block that will be filtered. + int block_width_; + int block_height_; + Buffer *src_image_; + Buffer *dst_image_; + uint8_t *flimits_; }; +void VpxPostProcDownAndAcrossMbRowTest::Run() { + mb_post_proc_down_and_across_( + src_image_->TopLeftPixel(), dst_image_->TopLeftPixel(), + src_image_->stride(), dst_image_->stride(), block_width_, flimits_, 16); +} + // Test routine for the VPx post-processing function // vpx_post_proc_down_and_across_mb_row_c. TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Size of the underlying data block that will be filtered. - const int block_width = 16; - const int block_height = 16; + block_width_ = 16; + block_height_ = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. - Buffer src_image = Buffer(block_width, block_height, 2); + Buffer src_image = Buffer(block_width_, block_height_, 2); ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. // Though the left padding is only 8 bytes, the assembly code tries to // read 16 bytes before the pointer. 
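pp_filter_test.cc is the first of several files in this diff (predict_test.cc, quantize_test.cc, sad_test.cc follow) to mix in AbstractBench from the new test/bench.h: a fixture overrides Run() with just the call under test, and the inherited RunNTimes()/PrintMedian() handle repetition, timing, and reporting. A sketch of a harness with that shape, built only on vpx_ports/vpx_timer.h (the real bench.h may differ in details):

    #include <algorithm>
    #include <cstdio>
    #include "vpx_ports/vpx_timer.h"

    class AbstractBenchSketch {
     public:
      virtual ~AbstractBenchSketch() {}

      void RunNTimes(int n) {
        for (int r = 0; r < kRuns; ++r) {
          vpx_usec_timer timer;
          vpx_usec_timer_start(&timer);
          for (int i = 0; i < n; ++i) Run();  // only the code under test
          vpx_usec_timer_mark(&timer);
          times_[r] = static_cast<int>(vpx_usec_timer_elapsed(&timer));
        }
      }

      void PrintMedian(const char *title) {
        std::sort(times_, times_ + kRuns);
        printf("[%10s] median: %d us over %d runs\n", title,
               times_[kRuns / 2], kRuns);
      }

     protected:
      virtual void Run() = 0;  // each fixture supplies the timed call

     private:
      static const int kRuns = 10;
      int times_[kRuns];
    };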
Buffer dst_image = - Buffer(block_width, block_height, 8, 16, 8, 8); + Buffer(block_width_, block_height_, 8, 16, 8, 8); ASSERT_TRUE(dst_image.Init()); - uint8_t *const flimits = - reinterpret_cast(vpx_memalign(16, block_width)); - (void)memset(flimits, 255, block_width); + flimits_ = reinterpret_cast(vpx_memalign(16, block_width_)); + (void)memset(flimits_, 255, block_width_); // Initialize pixels in the input: // block pixels to value 1, @@ -79,37 +98,36 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // Initialize pixels in the output to 99. dst_image.Set(99); - ASM_REGISTER_STATE_CHECK(GetParam()( + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_( src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(), - dst_image.stride(), block_width, flimits, 16)); + dst_image.stride(), block_width_, flimits_, 16)); - static const uint8_t kExpectedOutput[block_height] = { - 4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4 - }; + static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 3, 4 }; uint8_t *pixel_ptr = dst_image.TopLeftPixel(); - for (int i = 0; i < block_height; ++i) { - for (int j = 0; j < block_width; ++j) { + for (int i = 0; i < block_height_; ++i) { + for (int j = 0; j < block_width_; ++j) { ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j << ")"; } pixel_ptr += dst_image.stride(); } - vpx_free(flimits); + vpx_free(flimits_); }; TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // Size of the underlying data block that will be filtered. // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V // blocks are always a multiple of 8 wide and exactly 8 high. - const int block_width = 136; - const int block_height = 16; + block_width_ = 136; + block_height_ = 16; // 5-tap filter needs 2 padding rows above and below the block in the input. // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. Buffer src_image = - Buffer(block_width, block_height, 2, 2, 10, 2); + Buffer(block_width_, block_height_, 2, 2, 10, 2); ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. @@ -118,17 +136,17 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // not a problem. // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. Buffer dst_image = - Buffer(block_width, block_height, 8, 8, 16, 8); + Buffer(block_width_, block_height_, 8, 8, 16, 8); ASSERT_TRUE(dst_image.Init()); - Buffer dst_image_ref = Buffer(block_width, block_height, 8); + Buffer dst_image_ref = + Buffer(block_width_, block_height_, 8); ASSERT_TRUE(dst_image_ref.Init()); // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so // it must be padded out. - const int flimits_width = block_width % 16 ? block_width + 8 : block_width; - uint8_t *const flimits = - reinterpret_cast(vpx_memalign(16, flimits_width)); + const int flimits_width = block_width_ % 16 ? 
block_width_ + 8 : block_width_; + flimits_ = reinterpret_cast(vpx_memalign(16, flimits_width)); ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); @@ -138,37 +156,78 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { src_image.SetPadding(10); src_image.Set(&rnd, &ACMRandom::Rand8); - for (int blocks = 0; blocks < block_width; blocks += 8) { - (void)memset(flimits, 0, sizeof(*flimits) * flimits_width); + for (int blocks = 0; blocks < block_width_; blocks += 8) { + (void)memset(flimits_, 0, sizeof(*flimits_) * flimits_width); for (int f = 0; f < 255; f++) { - (void)memset(flimits + blocks, f, sizeof(*flimits) * 8); - + (void)memset(flimits_ + blocks, f, sizeof(*flimits_) * 8); dst_image.Set(0); dst_image_ref.Set(0); vpx_post_proc_down_and_across_mb_row_c( src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(), - src_image.stride(), dst_image_ref.stride(), block_width, flimits, - block_height); - ASM_REGISTER_STATE_CHECK( - GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(), - src_image.stride(), dst_image.stride(), block_width, - flimits, block_height)); + src_image.stride(), dst_image_ref.stride(), block_width_, flimits_, + block_height_); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_and_across_( + src_image.TopLeftPixel(), dst_image.TopLeftPixel(), + src_image.stride(), dst_image.stride(), block_width_, flimits_, + block_height_)); ASSERT_TRUE(dst_image.CheckValues(dst_image_ref)); } } - vpx_free(flimits); + vpx_free(flimits_); } +TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) { + // Size of the underlying data block that will be filtered. + block_width_ = 16; + block_height_ = 16; + + // 5-tap filter needs 2 padding rows above and below the block in the input. + Buffer src_image = Buffer(block_width_, block_height_, 2); + ASSERT_TRUE(src_image.Init()); + this->src_image_ = &src_image; + + // Filter extends output block by 8 samples at left and right edges. + // Though the left padding is only 8 bytes, the assembly code tries to + // read 16 bytes before the pointer. + Buffer dst_image = + Buffer(block_width_, block_height_, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); + this->dst_image_ = &dst_image; + + flimits_ = reinterpret_cast(vpx_memalign(16, block_width_)); + (void)memset(flimits_, 255, block_width_); + + // Initialize pixels in the input: + // block pixels to value 1, + // border pixels to value 10. + src_image.SetPadding(10); + src_image.Set(1); + + // Initialize pixels in the output to 99. 
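All the new benchmarks in this diff use GoogleTest's DISABLED_ name prefix, so they build everywhere but are skipped by default; they add no CI cost yet stay one flag away. The declaration shape, with the opt-in invocation noted alongside:

    // Skipped in normal runs; opt in with:
    //   ./test_libvpx --gtest_also_run_disabled_tests \
    //                 --gtest_filter='*DISABLED_Speed*'
    TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) {
      // ... buffer setup as in CheckFilterOutput ...
      RunNTimes(INT16_MAX);
      PrintMedian("16x16");
    }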
+ dst_image.Set(99); + + RunNTimes(INT16_MAX); + PrintMedian("16x16"); + + vpx_free(flimits_); +}; + class VpxMbPostProcAcrossIpTest - : public ::testing::TestWithParam { + : public AbstractBench, + public ::testing::TestWithParam { public: + VpxMbPostProcAcrossIpTest() + : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()), + src_(Buffer(rows_, cols_, 8, 8, 17, 8)) {} virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: + virtual void Run(); + void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { @@ -195,71 +254,67 @@ class VpxMbPostProcAcrossIpTest GetParam()(s, src_width, rows, cols, filter_level)); RunComparison(expected_output, s, rows, cols, src_width); } + + const int rows_; + const int cols_; + const VpxMbPostProcAcrossIpFunc mb_post_proc_across_ip_; + Buffer src_; }; -TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; +void VpxMbPostProcAcrossIpTest::Run() { + mb_post_proc_across_ip_(src_.TopLeftPixel(), src_.stride(), rows_, cols_, + q2mbl(0)); +} - Buffer src = Buffer(cols, rows, 8, 8, 17, 8); - ASSERT_TRUE(src.Init()); - src.SetPadding(10); - SetCols(src.TopLeftPixel(), rows, cols, src.stride()); +TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - Buffer expected_output = Buffer(cols, rows, 0); + Buffer expected_output = Buffer(cols_, rows_, 0); ASSERT_TRUE(expected_output.Init()); - SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride()); + SetCols(expected_output.TopLeftPixel(), rows_, cols_, + expected_output.stride()); - RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0), + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(0), expected_output.TopLeftPixel()); } TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; - - Buffer src = Buffer(cols, rows, 8, 8, 17, 8); - ASSERT_TRUE(src.Init()); - src.SetPadding(10); - SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 13 }; - RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(70), + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(70), kExpectedOutput); } TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - Buffer src = Buffer(cols, rows, 8, 8, 17, 8); - ASSERT_TRUE(src.Init()); - src.SetPadding(10); - SetCols(src.TopLeftPixel(), rows, cols, src.stride()); - - static const unsigned char kExpectedOutput[cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 11, 12, 13, 13 }; - RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), INT_MAX, + RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), INT_MAX, kExpectedOutput); - SetCols(src.TopLeftPixel(), rows, cols, src.stride()); + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); - RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(100), + 
RunFilterLevel(src_.TopLeftPixel(), rows_, cols_, src_.stride(), q2mbl(100), kExpectedOutput); } TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - - Buffer c_mem = Buffer(cols, rows, 8, 8, 17, 8); + Buffer c_mem = Buffer(cols_, rows_, 8, 8, 17, 8); ASSERT_TRUE(c_mem.Init()); - Buffer asm_mem = Buffer(cols, rows, 8, 8, 17, 8); + Buffer asm_mem = Buffer(cols_, rows_, 8, 8, 17, 8); ASSERT_TRUE(asm_mem.Init()); // When level >= 100, the filter behaves the same as the level = INT_MAX @@ -267,24 +322,41 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { for (int level = 0; level < 100; level++) { c_mem.SetPadding(10); asm_mem.SetPadding(10); - SetCols(c_mem.TopLeftPixel(), rows, cols, c_mem.stride()); - SetCols(asm_mem.TopLeftPixel(), rows, cols, asm_mem.stride()); + SetCols(c_mem.TopLeftPixel(), rows_, cols_, c_mem.stride()); + SetCols(asm_mem.TopLeftPixel(), rows_, cols_, asm_mem.stride()); - vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows, - cols, q2mbl(level)); + vpx_mbpost_proc_across_ip_c(c_mem.TopLeftPixel(), c_mem.stride(), rows_, + cols_, q2mbl(level)); ASM_REGISTER_STATE_CHECK(GetParam()( - asm_mem.TopLeftPixel(), asm_mem.stride(), rows, cols, q2mbl(level))); + asm_mem.TopLeftPixel(), asm_mem.stride(), rows_, cols_, q2mbl(level))); ASSERT_TRUE(asm_mem.CheckValues(c_mem)); } } +TEST_P(VpxMbPostProcAcrossIpTest, DISABLED_Speed) { + ASSERT_TRUE(src_.Init()); + src_.SetPadding(10); + + SetCols(src_.TopLeftPixel(), rows_, cols_, src_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); +} + class VpxMbPostProcDownTest - : public ::testing::TestWithParam { + : public AbstractBench, + public ::testing::TestWithParam { public: + VpxMbPostProcDownTest() + : rows_(16), cols_(16), mb_post_proc_down_(GetParam()), + src_c_(Buffer(rows_, cols_, 8, 8, 8, 17)) {} + virtual void TearDown() { libvpx_test::ClearSystemState(); } protected: + virtual void Run(); + void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { memset(src_c, r, cols); @@ -306,22 +378,28 @@ class VpxMbPostProcDownTest void RunFilterLevel(unsigned char *s, int rows, int cols, int src_width, int filter_level, const unsigned char *expected_output) { ASM_REGISTER_STATE_CHECK( - GetParam()(s, src_width, rows, cols, filter_level)); + mb_post_proc_down_(s, src_width, rows, cols, filter_level)); RunComparison(expected_output, s, rows, cols, src_width); } + + const int rows_; + const int cols_; + const VpxMbPostProcDownFunc mb_post_proc_down_; + Buffer src_c_; }; -TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { - const int rows = 16; - const int cols = 16; +void VpxMbPostProcDownTest::Run() { + mb_post_proc_down_(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, + q2mbl(0)); +} - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - src_c.SetPadding(10); +TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 4, 4, 3, 3, 3, @@ -338,26 +416,22 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { 13, 13, 13, 13, 14, 13, 13, 13, 
13 }; - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), INT_MAX, + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), INT_MAX, kExpectedOutput); - src_c.SetPadding(10); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(100), - kExpectedOutput); + src_c_.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(100), kExpectedOutput); } TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { - const int rows = 16; - const int cols = 16; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - src_c.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - - static const unsigned char kExpectedOutput[rows * cols] = { + static const unsigned char kExpectedOutput[] = { 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, @@ -374,67 +448,69 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { 13, 13, 13, 13, 14, 13, 13, 13, 13 }; - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(70), - kExpectedOutput); + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), + q2mbl(70), kExpectedOutput); } TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { - const int rows = 16; - const int cols = 16; + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - src_c.SetPadding(10); + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); - SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); - - unsigned char *expected_output = new unsigned char[rows * cols]; + unsigned char *expected_output = new unsigned char[rows_ * cols_]; ASSERT_TRUE(expected_output != NULL); - SetRows(expected_output, rows, cols, cols); + SetRows(expected_output, rows_, cols_, cols_); - RunFilterLevel(src_c.TopLeftPixel(), rows, cols, src_c.stride(), q2mbl(0), + RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0), expected_output); delete[] expected_output; } TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { - const int rows = 16; - const int cols = 16; - ACMRandom rnd; rnd.Reset(ACMRandom::DeterministicSeed()); - Buffer src_c = Buffer(cols, rows, 8, 8, 8, 17); - ASSERT_TRUE(src_c.Init()); - Buffer src_asm = Buffer(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c_.Init()); + Buffer src_asm = Buffer(cols_, rows_, 8, 8, 8, 17); ASSERT_TRUE(src_asm.Init()); for (int level = 0; level < 100; level++) { - src_c.SetPadding(10); + src_c_.SetPadding(10); src_asm.SetPadding(10); - src_c.Set(&rnd, &ACMRandom::Rand8); - src_asm.CopyFrom(src_c); + src_c_.Set(&rnd, &ACMRandom::Rand8); + src_asm.CopyFrom(src_c_); - vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( - src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); - ASSERT_TRUE(src_asm.CheckValues(src_c)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); - 
src_c.SetPadding(10); + src_c_.SetPadding(10); src_asm.SetPadding(10); - src_c.Set(&rnd, &ACMRandom::Rand8Extremes); - src_asm.CopyFrom(src_c); + src_c_.Set(&rnd, &ACMRandom::Rand8Extremes); + src_asm.CopyFrom(src_c_); - vpx_mbpost_proc_down_c(src_c.TopLeftPixel(), src_c.stride(), rows, cols, + vpx_mbpost_proc_down_c(src_c_.TopLeftPixel(), src_c_.stride(), rows_, cols_, q2mbl(level)); - ASM_REGISTER_STATE_CHECK(GetParam()( - src_asm.TopLeftPixel(), src_asm.stride(), rows, cols, q2mbl(level))); - ASSERT_TRUE(src_asm.CheckValues(src_c)); + ASM_REGISTER_STATE_CHECK(mb_post_proc_down_( + src_asm.TopLeftPixel(), src_asm.stride(), rows_, cols_, q2mbl(level))); + ASSERT_TRUE(src_asm.CheckValues(src_c_)); } } +TEST_P(VpxMbPostProcDownTest, DISABLED_Speed) { + ASSERT_TRUE(src_c_.Init()); + src_c_.SetPadding(10); + + SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride()); + + RunNTimes(100000); + PrintMedian("16x16"); +} + INSTANTIATE_TEST_CASE_P( C, VpxPostProcDownAndAcrossMbRowTest, ::testing::Values(vpx_post_proc_down_and_across_mb_row_c)); @@ -481,4 +557,16 @@ INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest, ::testing::Values(vpx_mbpost_proc_down_msa)); #endif // HAVE_MSA +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VpxPostProcDownAndAcrossMbRowTest, + ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest, + ::testing::Values(vpx_mbpost_proc_across_ip_vsx)); + +INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcDownTest, + ::testing::Values(vpx_mbpost_proc_down_vsx)); +#endif // HAVE_VSX + } // namespace diff --git a/libs/libvpx/test/predict_test.cc b/libs/libvpx/test/predict_test.cc index 9f366ae5299..d40d9c755e6 100644 --- a/libs/libvpx/test/predict_test.cc +++ b/libs/libvpx/test/predict_test.cc @@ -10,30 +10,34 @@ #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vp8_rtcd.h" #include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/msvc.h" namespace { using libvpx_test::ACMRandom; -using std::tr1::make_tuple; +using std::make_tuple; typedef void (*PredictFunc)(uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch); -typedef std::tr1::tuple PredictParam; +typedef std::tuple PredictParam; -class PredictTestBase : public ::testing::TestWithParam { +class PredictTestBase : public AbstractBench, + public ::testing::TestWithParam { public: PredictTestBase() : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), @@ -204,7 +208,20 @@ class PredictTestBase : public ::testing::TestWithParam { } } } -}; + + void Run() { + for (int xoffset = 0; xoffset < 8; ++xoffset) { + for (int yoffset = 0; yoffset < 8; ++yoffset) { + if (xoffset == 0 && yoffset == 0) { + continue; + } + + predict_(&src_[kSrcStride * 2 + 2], kSrcStride, xoffset, yoffset, dst_, + dst_stride_); + } + } + } +}; // namespace class SixtapPredictTest : public PredictTestBase {}; @@ -341,6 +358,14 @@ TEST_P(BilinearPredictTest, TestWithRandomData) { TEST_P(BilinearPredictTest, TestWithUnalignedDst) { TestWithUnalignedDst(vp8_bilinear_predict16x16_c); } +TEST_P(BilinearPredictTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 5000000 / (width_ * height_); + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", width_, 
height_); + PrintMedian(title); +} INSTANTIATE_TEST_CASE_P( C, BilinearPredictTest, @@ -356,17 +381,13 @@ INSTANTIATE_TEST_CASE_P( make_tuple(8, 4, &vp8_bilinear_predict8x4_neon), make_tuple(4, 4, &vp8_bilinear_predict4x4_neon))); #endif -#if HAVE_MMX -INSTANTIATE_TEST_CASE_P( - MMX, BilinearPredictTest, - ::testing::Values(make_tuple(8, 4, &vp8_bilinear_predict8x4_mmx), - make_tuple(4, 4, &vp8_bilinear_predict4x4_mmx))); -#endif #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, BilinearPredictTest, ::testing::Values(make_tuple(16, 16, &vp8_bilinear_predict16x16_sse2), - make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2))); + make_tuple(8, 8, &vp8_bilinear_predict8x8_sse2), + make_tuple(8, 4, &vp8_bilinear_predict8x4_sse2), + make_tuple(4, 4, &vp8_bilinear_predict4x4_sse2))); #endif #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( diff --git a/libs/libvpx/test/quantize_test.cc b/libs/libvpx/test/quantize_test.cc index 40bb2642e42..a7497742ceb 100644 --- a/libs/libvpx/test/quantize_test.cc +++ b/libs/libvpx/test/quantize_test.cc @@ -9,12 +9,14 @@ */ #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" #include "./vp8_rtcd.h" +#include "./vpx_config.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -33,10 +35,10 @@ const int kNumBlockEntries = 16; typedef void (*VP8Quantize)(BLOCK *b, BLOCKD *d); -typedef std::tr1::tuple VP8QuantizeParam; +typedef std::tuple VP8QuantizeParam; using libvpx_test::ACMRandom; -using std::tr1::make_tuple; +using std::make_tuple; // Create and populate a VP8_COMP instance which has a complete set of // quantization inputs as well as a second MACROBLOCKD for output. @@ -116,7 +118,8 @@ class QuantizeTestBase { }; class QuantizeTest : public QuantizeTestBase, - public ::testing::TestWithParam { + public ::testing::TestWithParam, + public AbstractBench { protected: virtual void SetUp() { SetupCompressor(); @@ -124,6 +127,10 @@ class QuantizeTest : public QuantizeTestBase, c_quant_ = GET_PARAM(1); } + virtual void Run() { + asm_quant_(&vp8_comp_->mb.block[0], ¯oblockd_dst_->block[0]); + } + void RunComparison() { for (int i = 0; i < kNumBlocks; ++i) { ASM_REGISTER_STATE_CHECK( @@ -166,6 +173,13 @@ TEST_P(QuantizeTest, TestMultipleQ) { } } +TEST_P(QuantizeTest, DISABLED_Speed) { + FillCoeffRandom(); + + RunNTimes(10000000); + PrintMedian("vp8 quantize"); +} + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, QuantizeTest, diff --git a/libs/libvpx/test/register_state_check.h b/libs/libvpx/test/register_state_check.h index a779e5c06a5..238508ac0e5 100644 --- a/libs/libvpx/test/register_state_check.h +++ b/libs/libvpx/test/register_state_check.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_REGISTER_STATE_CHECK_H_ -#define TEST_REGISTER_STATE_CHECK_H_ +#ifndef VPX_TEST_REGISTER_STATE_CHECK_H_ +#define VPX_TEST_REGISTER_STATE_CHECK_H_ #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" @@ -28,7 +28,7 @@ // See platform implementations of RegisterStateCheckXXX for details. 
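The guard change just below (#if defined(_WIN64) becomes #if defined(_WIN64) && ARCH_X86_64) accounts for _WIN64 being defined on every 64-bit Windows target, ARM64 included, while the code it protects reads the xmm6-xmm15 callee-saved registers of the x64 calling convention through the Win32 thread-context API. A rough sketch of that mechanism, simplified from the real class with error handling omitted:

    #if defined(_WIN64) && ARCH_X86_64
    #include <windows.h>
    #include <cstring>

    // Snapshot the callee-saved SIMD registers before the code under test
    // runs, then verify they were restored: xmm6-xmm15 must survive any
    // call under the Windows x64 ABI.
    class RegisterStateCheckSketch {
     public:
      RegisterStateCheckSketch() {
        pre_.ContextFlags = CONTEXT_FLOATING_POINT;
        GetThreadContext(GetCurrentThread(), &pre_);
      }
      bool Intact() {
        CONTEXT post;
        post.ContextFlags = CONTEXT_FLOATING_POINT;
        GetThreadContext(GetCurrentThread(), &post);
        return memcmp(&pre_.Xmm6, &post.Xmm6, 10 * sizeof(pre_.Xmm6)) == 0;
      }

     private:
      CONTEXT pre_;
    };
    #endif  // _WIN64 && ARCH_X86_64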
// -#if defined(_WIN64) +#if defined(_WIN64) && ARCH_X86_64 #undef NOMINMAX #define NOMINMAX @@ -138,7 +138,7 @@ class RegisterStateCheck {}; } // namespace libvpx_test -#endif // _WIN64 +#endif // _WIN64 && ARCH_X86_64 #if ARCH_X86 || ARCH_X86_64 #if defined(__GNUC__) @@ -184,4 +184,4 @@ class RegisterStateCheckMMX { #define API_REGISTER_STATE_CHECK ASM_REGISTER_STATE_CHECK #endif -#endif // TEST_REGISTER_STATE_CHECK_H_ +#endif // VPX_TEST_REGISTER_STATE_CHECK_H_ diff --git a/libs/libvpx/test/resize_test.cc b/libs/libvpx/test/resize_test.cc index e95dc6651a1..5f80af6fb17 100644 --- a/libs/libvpx/test/resize_test.cc +++ b/libs/libvpx/test/resize_test.cc @@ -277,12 +277,29 @@ class ResizeTest SetMode(GET_PARAM(1)); } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + virtual void DecompressedFrameHook(const vpx_image_t &img, vpx_codec_pts_t pts) { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } std::vector frame_info_list_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; }; TEST_P(ResizeTest, TestExternalResizeWorks) { @@ -296,6 +313,9 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { const unsigned int frame = static_cast(info->pts); unsigned int expected_w; unsigned int expected_h; + const size_t idx = info - frame_info_list_.begin(); + ASSERT_EQ(info->w, GetFrameWidth(idx)); + ASSERT_EQ(info->h, GetFrameHeight(idx)); ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 0); EXPECT_EQ(expected_w, info->w) @@ -464,8 +484,23 @@ class ResizeRealtimeTest ++mismatch_nframes_; } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + ASSERT_NE(static_cast(pkt->data.frame.width[0]), 0); + ASSERT_NE(static_cast(pkt->data.frame.height[0]), 0); + encode_frame_width_.push_back(pkt->data.frame.width[0]); + encode_frame_height_.push_back(pkt->data.frame.height[0]); + } + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetFrameWidth(size_t idx) const { + return encode_frame_width_[idx]; + } + + unsigned int GetFrameHeight(size_t idx) const { + return encode_frame_height_[idx]; + } + void DefaultConfig() { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 600; @@ -493,6 +528,8 @@ class ResizeRealtimeTest bool change_bitrate_; double mismatch_psnr_; int mismatch_nframes_; + std::vector encode_frame_width_; + std::vector encode_frame_height_; }; TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { @@ -582,6 +619,9 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { int resize_count = 0; for (std::vector::const_iterator info = frame_info_list_.begin(); info != frame_info_list_.end(); ++info) { + const size_t idx = info - frame_info_list_.begin(); + ASSERT_EQ(info->w, GetFrameWidth(idx)); + ASSERT_EQ(info->h, GetFrameHeight(idx)); if (info->w != last_w || info->h != last_h) { resize_count++; if (resize_count == 1) { diff --git a/libs/libvpx/test/sad_test.cc b/libs/libvpx/test/sad_test.cc index 67c3c531506..0902df70adb 100644 --- a/libs/libvpx/test/sad_test.cc +++ b/libs/libvpx/test/sad_test.cc @@ -10,19 +10,21 @@ #include #include -#include #include 
"third_party/googletest/src/include/gtest/gtest.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/bench.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" #include "vpx/vpx_codec.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/msvc.h" +#include "vpx_ports/vpx_timer.h" template struct TestParams { @@ -84,7 +86,7 @@ class SADTestBase : public ::testing::TestWithParam { #endif // CONFIG_VP9_HIGHBITDEPTH } mask_ = (1 << bit_depth_) - 1; - source_stride_ = (params_.width + 31) & ~31; + source_stride_ = (params_.width + 63) & ~63; reference_stride_ = params_.width * 2; rnd_.Reset(ACMRandom::DeterministicSeed()); } @@ -108,7 +110,7 @@ class SADTestBase : public ::testing::TestWithParam { protected: // Handle blocks up to 4 blocks 64x64 with stride up to 128 - static const int kDataAlignment = 16; + static const int kDataAlignment = 32; static const int kDataBlockSize = 64 * 128; static const int kDataBufferSize = 4 * kDataBlockSize; @@ -264,7 +266,7 @@ class SADx4Test : public SADTestBase { } }; -class SADTest : public SADTestBase { +class SADTest : public AbstractBench, public SADTestBase { public: SADTest() : SADTestBase(GetParam()) {} @@ -284,6 +286,11 @@ class SADTest : public SADTestBase { ASSERT_EQ(reference_sad, exp_sad); } + + void Run() { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } }; class SADavgTest : public SADTestBase { @@ -350,6 +357,17 @@ TEST_P(SADTest, ShortSrc) { source_stride_ = tmp_stride; } +TEST_P(SADTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -463,6 +481,38 @@ TEST_P(SADx4Test, SrcAlignedByWidth) { source_data_ = tmp_source_data; } +TEST_P(SADx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4], exp_sad[4]; + vpx_usec_timer timer; + + memset(reference_sad, 0, sizeof(reference_sad)); + SADs(exp_sad); + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSAD(block); + } + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -971,6 +1021,9 @@ const SadMxNParam vsx_tests[] = { SadMxNParam(16, 32, &vpx_sad16x32_vsx), SadMxNParam(16, 16, &vpx_sad16x16_vsx), SadMxNParam(16, 8, 
&vpx_sad16x8_vsx), + SadMxNParam(8, 16, &vpx_sad8x16_vsx), + SadMxNParam(8, 8, &vpx_sad8x8_vsx), + SadMxNParam(8, 4, &vpx_sad8x4_vsx), }; INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); diff --git a/libs/libvpx/test/stress.sh b/libs/libvpx/test/stress.sh index a899c800ca6..fdec764c7ae 100755 --- a/libs/libvpx/test/stress.sh +++ b/libs/libvpx/test/stress.sh @@ -30,7 +30,7 @@ SHA1_FILE="$(dirname $0)/test-data.sha1" # Download a file from the url and check its sha1sum. download_and_check_file() { # Get the file from the file path. - local readonly root="${1#${LIBVPX_TEST_DATA_PATH}/}" + local root="${1#${LIBVPX_TEST_DATA_PATH}/}" # Download the file using curl. Trap to insure non partial file. (trap "rm -f $1" INT TERM \ @@ -72,13 +72,13 @@ stress_verify_environment() { # This function runs tests on libvpx that run multiple encodes and decodes # in parallel in hopes of catching synchronization and/or threading issues. stress() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly codec="$1" - local readonly webm="$2" - local readonly decode_count="$3" - local readonly threads="$4" - local readonly enc_args="$5" + local decoder="$(vpx_tool_path vpxdec)" + local encoder="$(vpx_tool_path vpxenc)" + local codec="$1" + local webm="$2" + local decode_count="$3" + local threads="$4" + local enc_args="$5" local pids="" local rt_max_jobs=${STRESS_RT_MAX_JOBS:-5} local onepass_max_jobs=${STRESS_ONEPASS_MAX_JOBS:-5} @@ -144,6 +144,19 @@ vp8_stress_test() { fi } +vp8_stress_test_token_parititions() { + local vp8_max_jobs=${STRESS_VP8_DECODE_MAX_JOBS:-40} + if [ "$(vp8_decode_available)" = "yes" -a \ + "$(vp8_encode_available)" = "yes" ]; then + for threads in 2 4 8; do + for token_partitions in 1 2 3; do + stress vp8 "${VP8}" "${vp8_max_jobs}" ${threads} \ + "--token-parts=$token_partitions" + done + done + fi +} + vp9_stress() { local vp9_max_jobs=${STRESS_VP9_DECODE_MAX_JOBS:-25} @@ -154,16 +167,17 @@ vp9_stress() { } vp9_stress_test() { - for threads in 4 8 100; do + for threads in 4 8 64; do vp9_stress "$threads" "--row-mt=0" done } vp9_stress_test_row_mt() { - for threads in 4 8 100; do + for threads in 4 8 64; do vp9_stress "$threads" "--row-mt=1" done } run_tests stress_verify_environment \ - "vp8_stress_test vp9_stress_test vp9_stress_test_row_mt" + "vp8_stress_test vp8_stress_test_token_parititions + vp9_stress_test vp9_stress_test_row_mt" diff --git a/libs/libvpx/test/sum_squares_test.cc b/libs/libvpx/test/sum_squares_test.cc index 9c407c649f4..d2c70f4d4b4 100644 --- a/libs/libvpx/test/sum_squares_test.cc +++ b/libs/libvpx/test/sum_squares_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -28,7 +29,7 @@ namespace { const int kNumIterations = 10000; typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size); -typedef std::tr1::tuple SumSquaresParam; +typedef std::tuple SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam { public: @@ -102,7 +103,14 @@ TEST_P(SumSquaresTest, ExtremeValues) { } } -using std::tr1::make_tuple; +using std::make_tuple; + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_neon))); +#endif // HAVE_NEON #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( @@ -112,8 +120,9 @@ INSTANTIATE_TEST_CASE_P( #endif // HAVE_SSE2 #if HAVE_MSA -INSTANTIATE_TEST_CASE_P(MSA, SumSquaresTest, 
::testing::Values(make_tuple( - &vpx_sum_squares_2d_i16_c, - &vpx_sum_squares_2d_i16_msa))); +INSTANTIATE_TEST_CASE_P( + MSA, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA } // namespace diff --git a/libs/libvpx/test/superframe_test.cc b/libs/libvpx/test/superframe_test.cc index 421dfccd609..8c8d1ae2904 100644 --- a/libs/libvpx/test/superframe_test.cc +++ b/libs/libvpx/test/superframe_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ #include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -18,7 +20,7 @@ namespace { const int kTestMode = 0; -typedef std::tr1::tuple SuperframeTestParam; +typedef std::tuple SuperframeTestParam; class SuperframeTest : public ::libvpx_test::EncoderTest, @@ -31,7 +33,7 @@ class SuperframeTest virtual void SetUp() { InitializeConfig(); const SuperframeTestParam input = GET_PARAM(1); - const libvpx_test::TestMode mode = std::tr1::get(input); + const libvpx_test::TestMode mode = std::get(input); SetMode(mode); sf_count_ = 0; sf_count_max_ = INT_MAX; @@ -41,7 +43,7 @@ class SuperframeTest virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); } } diff --git a/libs/libvpx/test/svc_datarate_test.cc b/libs/libvpx/test/svc_datarate_test.cc new file mode 100644 index 00000000000..d6b247723f2 --- /dev/null +++ b/libs/libvpx/test/svc_datarate_test.cc @@ -0,0 +1,1428 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. 
+ INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +class DatarateOnePassCbrSvc : public OnePassCbrSvc { + public: + explicit DatarateOnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : OnePassCbrSvc(codec) { + inter_layer_pred_mode_ = 0; + } + + protected: + virtual ~DatarateOnePassCbrSvc() {} + + virtual void ResetModel() { + last_pts_ = 0; + duration_ = 0.0; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + denoiser_on_ = 0; + tune_content_ = 0; + base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + update_pattern_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); + dynamic_drop_layer_ = false; + change_bitrate_ = false; + last_pts_ref_ = 0; + middle_bitrate_ = 0; + top_bitrate_ = 0; + superframe_count_ = -1; + key_frame_spacing_ = 9999; + num_nonref_frames_ = 0; + layer_framedrop_ = 0; + force_key_ = 0; + force_key_test_ = 0; + insert_layer_sync_ = 0; + layer_sync_on_base_ = 0; + force_intra_only_frame_ = 0; + superframe_has_intra_only_ = 0; + use_post_encode_drop_ = 0; + denoiser_off_on_ = false; + denoiser_enable_layers_ = false; + } + virtual void BeginPassHook(unsigned int /*pass*/) {} + + // Example pattern for spatial layers and 2 temporal layers used in the + // bypass/flexible mode. The pattern corresponds to the pattern + // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in + // non-flexible mode, except that we disable inter-layer prediction. + void set_frame_flags_bypass_mode( + int tl, int num_spatial_layers, int is_key_frame, + vpx_svc_ref_frame_config_t *ref_frame_config) { + for (int sl = 0; sl < num_spatial_layers; ++sl) + ref_frame_config->update_buffer_slot[sl] = 0; + + for (int sl = 0; sl < num_spatial_layers; ++sl) { + if (tl == 0) { + ref_frame_config->lst_fb_idx[sl] = sl; + if (sl) { + if (is_key_frame) { + ref_frame_config->lst_fb_idx[sl] = sl - 1; + ref_frame_config->gld_fb_idx[sl] = sl; + } else { + ref_frame_config->gld_fb_idx[sl] = sl - 1; + } + } else { + ref_frame_config->gld_fb_idx[sl] = 0; + } + ref_frame_config->alt_fb_idx[sl] = 0; + } else if (tl == 1) { + ref_frame_config->lst_fb_idx[sl] = sl; + ref_frame_config->gld_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl - 1); + ref_frame_config->alt_fb_idx[sl] = + VPXMIN(REF_FRAMES - 1, num_spatial_layers + sl); + } + if (!tl) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } else { + if (is_key_frame) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->gld_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->lst_fb_idx[sl]; + } + } + } else if (tl == 1) { + if (!sl) { + ref_frame_config->reference_last[sl] = 1; + ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } else { + ref_frame_config->reference_last[sl] = 1; + 
ref_frame_config->reference_golden[sl] = 0; + ref_frame_config->reference_alt_ref[sl] = 0; + ref_frame_config->update_buffer_slot[sl] |= + 1 << ref_frame_config->alt_fb_idx[sl]; + } + } + } + } + + void CheckLayerRateTargeting(int num_spatial_layers, int num_temporal_layers, + double thresh_overshoot, + double thresh_undershoot) const { + for (int sl = 0; sl < num_spatial_layers; ++sl) + for (int tl = 0; tl < num_temporal_layers; ++tl) { + const int layer = sl * num_temporal_layers + tl; + ASSERT_GE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg_.layer_target_bitrate[layer], + file_datarate_[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too " + "much!"; + } + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + PreEncodeFrameHookSetup(video, encoder); + + if (video->frame() == 0) { + if (force_intra_only_frame_) { + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tests to avoid encoder-decoder + // mismatch check on color space setting. + encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + } + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + + if (layer_framedrop_) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = LAYER_DROP; + for (int i = 0; i < number_spatial_layers_; i++) + svc_drop_frame.framedrop_thresh[i] = 30; + svc_drop_frame.max_consec_drop = 30; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } + + if (use_post_encode_drop_) { + encoder->Control(VP9E_SET_POSTENCODE_DROP, use_post_encode_drop_); + } + } + + if (denoiser_off_on_) { + encoder->Control(VP9E_SET_AQ_MODE, 3); + // Set inter_layer_pred to INTER_LAYER_PRED_OFF_NONKEY (K-SVC). + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, 2); + if (!denoiser_enable_layers_) { + if (video->frame() == 0) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + else if (video->frame() == 100) + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } else { + // Cumulative bitrates for top spatial layers, for + // 3 temporal layers. + if (video->frame() == 0) { + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 0); + // Change layer bitrates to set top spatial layer to 0. + // This is for 3 spatial 3 temporal layers. + // This will trigger skip encoding/dropping of top spatial layer. + cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8]; + for (int i = 0; i < 3; i++) + bitrate_sl3_[i] = cfg_.layer_target_bitrate[i + 6]; + cfg_.layer_target_bitrate[6] = 0; + cfg_.layer_target_bitrate[7] = 0; + cfg_.layer_target_bitrate[8] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to non-zero on top spatial layer. + // This will trigger skip encoding of top spatial layer + // on key frame (period = 100). + for (int i = 0; i < 3; i++) + cfg_.layer_target_bitrate[i + 6] = bitrate_sl3_[i]; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[8]; + encoder->Config(&cfg_); + } else if (video->frame() == 120) { + // Enable denoiser and top spatial layer after key frame (period is + // 100).
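+ // (Note: per the denoiser convention documented later in this file, + // noise sensitivity 1 is assumed to denoise only the top spatial layer.)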
+ encoder->Control(VP9E_SET_NOISE_SENSITIVITY, 1); + } + } + } + + if (update_pattern_ && video->frame() >= 100) { + vpx_svc_layer_id_t layer_id; + if (video->frame() == 100) { + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + encoder->Config(&cfg_); + } + // Set layer id since the pattern changed. + layer_id.spatial_layer_id = 0; + layer_id.temporal_layer_id = (video->frame() % 2 != 0); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int i = 0; i < number_spatial_layers_; i++) + layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + set_frame_flags_bypass_mode(layer_id.temporal_layer_id, + number_spatial_layers_, 0, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + } + + if (change_bitrate_ && video->frame() == 200) { + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, + 0.78, 1.15); + + memset(file_datarate_, 0, sizeof(file_datarate_)); + memset(bits_total_, 0, sizeof(bits_total_)); + int64_t bits_in_buffer_model_tmp[VPX_MAX_LAYERS]; + last_pts_ref_ = last_pts_; + // Set new target bitrate. + cfg_.rc_target_bitrate = cfg_.rc_target_bitrate >> 1; + // Buffer level should not reset on dynamic bitrate change. + memcpy(bits_in_buffer_model_tmp, bits_in_buffer_model_, + sizeof(bits_in_buffer_model_)); + AssignLayerBitrates(); + memcpy(bits_in_buffer_model_, bits_in_buffer_model_tmp, + sizeof(bits_in_buffer_model_)); + + // Change config to update encoder with new bitrate configuration. + encoder->Config(&cfg_); + } + + if (dynamic_drop_layer_) { + // TODO(jian): Disable AQ Mode for this test for now. + encoder->Control(VP9E_SET_AQ_MODE, 0); + if (video->frame() == 0) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 50) { + // Change layer bitrates to non-zero on two top spatial layers. + // This will trigger skip encoding of top two spatial layers. + cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += + cfg_.layer_target_bitrate[2] + cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 100) { + // Change layer bitrates to set top layers to 0. This will trigger skip + // encoding/dropping of top two spatial layers. + cfg_.rc_target_bitrate -= + (cfg_.layer_target_bitrate[1] + cfg_.layer_target_bitrate[2]); + middle_bitrate_ = cfg_.layer_target_bitrate[1]; + top_bitrate_ = cfg_.layer_target_bitrate[2]; + cfg_.layer_target_bitrate[1] = 0; + cfg_.layer_target_bitrate[2] = 0; + encoder->Config(&cfg_); + } else if (video->frame() == 150) { + // Change layer bitrate on second layer to non-zero to start + // encoding it again.
+ cfg_.layer_target_bitrate[1] = middle_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[1]; + encoder->Config(&cfg_); + } else if (video->frame() == 200) { + // Change layer bitrate on top layer to non-zero to start + // encoding it again. + cfg_.layer_target_bitrate[2] = top_bitrate_; + cfg_.rc_target_bitrate += cfg_.layer_target_bitrate[2]; + encoder->Config(&cfg_); + } + } + + if (force_key_test_ && force_key_) frame_flags_ = VPX_EFLAG_FORCE_KF; + + if (insert_layer_sync_) { + vpx_svc_spatial_layer_sync_t svc_layer_sync; + svc_layer_sync.base_layer_intra_only = 0; + for (int i = 0; i < number_spatial_layers_; i++) + svc_layer_sync.spatial_layer_sync[i] = 0; + if (force_intra_only_frame_) { + superframe_has_intra_only_ = 0; + if (video->frame() == 0) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } else if (video->frame() == 100) { + svc_layer_sync.base_layer_intra_only = 1; + svc_layer_sync.spatial_layer_sync[0] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + superframe_has_intra_only_ = 1; + } + } else { + layer_sync_on_base_ = 0; + if (video->frame() == 150) { + svc_layer_sync.spatial_layer_sync[1] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 240) { + svc_layer_sync.spatial_layer_sync[2] = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } else if (video->frame() == 320) { + svc_layer_sync.spatial_layer_sync[0] = 1; + layer_sync_on_base_ = 1; + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync); + } + } + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast<double>(tb.num) / tb.den; + duration_ = 0; + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + uint32_t sizes[8] = { 0 }; + uint32_t sizes_parsed[8] = { 0 }; + int count = 0; + int num_layers_encoded = 0; + last_pts_ = pkt->data.frame.pts; + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (key_frame) { + // For test that inserts layer sync frames: requesting a layer_sync on + // the base layer must force a key frame. So if any key frame occurs + // after the first superframe it must be due to layer sync on the base + // spatial layer.
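+ // (A hedged note: superframe_count_ starts at -1 in ResetModel() and is + // reset to 0 on key frames just below; assuming the base class advances + // it once per encoded superframe, the initial key frame is exempt here.)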
+ if (superframe_count_ > 0 && insert_layer_sync_ && + !force_intra_only_frame_) { + ASSERT_EQ(layer_sync_on_base_, 1); + } + temporal_layer_id_ = 0; + superframe_count_ = 0; + } + parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf), + pkt->data.frame.sz, sizes_parsed, &count); + // Count may be less than number of spatial layers because of frame drops. + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (pkt->data.frame.spatial_layer_encoded[sl]) { + sizes[sl] = sizes_parsed[num_layers_encoded]; + num_layers_encoded++; + } + } + // For a superframe with an Intra-only frame, count will be +1 larger + // because of the no-show frame. + if (force_intra_only_frame_ && superframe_has_intra_only_) + ASSERT_EQ(count, num_layers_encoded + 1); + else + ASSERT_EQ(count, num_layers_encoded); + + // In the constrained frame drop mode, if a given spatial layer is dropped + // all upper layers must be dropped too. + if (!layer_framedrop_) { + int num_layers_dropped = 0; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + if (!pkt->data.frame.spatial_layer_encoded[sl]) { + // Check that all upper layers are dropped. + num_layers_dropped++; + for (int sl2 = sl + 1; sl2 < number_spatial_layers_; ++sl2) + ASSERT_EQ(pkt->data.frame.spatial_layer_encoded[sl2], 0); + } + } + if (num_layers_dropped == number_spatial_layers_ - 1) + force_key_ = 1; + else + force_key_ = 0; + } + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast<int64_t>(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. + // For short key frame spacing, the buffer can underrun on individual + // frames.
+ if (!key_frame && tl > 0 && key_frame_spacing_ < 100) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } + + ASSERT_EQ(pkt->data.frame.width[sl], + top_sl_width_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); + + ASSERT_EQ(pkt->data.frame.height[sl], + top_sl_height_ * svc_params_.scaling_factor_num[sl] / + svc_params_.scaling_factor_den[sl]); + } + } + + virtual void EndPassHook(void) { + if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; + duration_ = (last_pts_ + 1) * timebase_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + file_datarate_[layer] = file_size_in_kb / duration_; + } + } + } + + virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + double mismatch_psnr = compute_psnr(img1, img2); + mismatch_psnr_ += mismatch_psnr; + ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() { return mismatch_nframes_; } + unsigned int GetNonRefFrames() { return num_nonref_frames_; } + + vpx_codec_pts_t last_pts_; + double timebase_; + int64_t bits_total_[VPX_MAX_LAYERS]; + double duration_; + double file_datarate_[VPX_MAX_LAYERS]; + size_t bits_in_last_frame_; + double mismatch_psnr_; + int denoiser_on_; + int tune_content_; + int spatial_layer_id_; + bool dynamic_drop_layer_; + unsigned int top_sl_width_; + unsigned int top_sl_height_; + vpx_svc_ref_frame_config_t ref_frame_config; + int update_pattern_; + bool change_bitrate_; + vpx_codec_pts_t last_pts_ref_; + int middle_bitrate_; + int top_bitrate_; + int key_frame_spacing_; + int layer_framedrop_; + int force_key_; + int force_key_test_; + int inter_layer_pred_mode_; + int insert_layer_sync_; + int layer_sync_on_base_; + int force_intra_only_frame_; + int superframe_has_intra_only_; + int use_post_encode_drop_; + int bitrate_sl3_[3]; + // Denoiser switched on the fly. + bool denoiser_off_on_; + // Top layer enabled on the fly. + bool denoiser_enable_layers_; + + private: + virtual void SetConfig(const int num_temporal_layer) { + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + } + } + + unsigned int num_nonref_frames_; + unsigned int mismatch_nframes_; +}; + +// Params: speed setting. +class DatarateOnePassCbrSvcSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam<int> { + public: + DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcSingleBR() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 +// temporal layer, with screen content mode on and same speed setting for all +// layers.
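+// (Worked example for the thresholds used below, per the asserts in +// CheckLayerRateTargeting: with thresh_overshoot = 0.78 and thresh_undershoot +// = 1.15, a 100 kbps layer target passes only if the measured datarate lands +// roughly in [100/1.15, 100/0.78], i.e. about 87-128 kbps.)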
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TLScreenContent1) { + SetSvcConfig(2, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 9999; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 500; + ResetModel(); + tune_content_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers, with force key frame after frame drop. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLForceKey) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 100; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 2 temporal layers, with a change on the fly from the fixed SVC pattern to one +// generated via SVC_SET_REF_FRAME_CONFIG. The new pattern also disables +// inter-layer prediction. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL2TLDynamicPatternChange) { + SetSvcConfig(3, 2); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + // Change SVC pattern on the fly. + update_pattern_ = 1; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on (on at frame = 100). Key frame period is set to +// 1000 so denoising is enabled on non-key frames.
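+// (The off/on flip is driven from PreEncodeFrameHook above: when +// denoiser_enable_layers_ is false, noise sensitivity is set to 0 at frame 0 +// and switched to 1 at frame 100.)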
+TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnFixedLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 1000; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = false; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on the two top spatial layers since they will + // be skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial and 3 temporal +// layers, for inter_layer_pred=OffKey (K-SVC) and on the fly switching +// of denoiser from off to on, for dynamic layers. Start at 2 spatial layers +// and enable 3rd spatial layer at frame = 100. Use periodic key frame with +// period 100 so enabling of spatial layer occurs at key frame. Enable denoiser +// at frame > 100, after the key frame sync. +TEST_P(DatarateOnePassCbrSvcSingleBR, + OnePassCbrSvc3SL3TL_DenoiserOffOnEnableLayers) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 0; + cfg_.kf_max_dist = 100; + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", 1280, + 720, 30, 1, 0, 300); + top_sl_width_ = 1280; + top_sl_height_ = 720; + cfg_.rc_target_bitrate = 1000; + ResetModel(); + denoiser_off_on_ = true; + denoiser_enable_layers_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on the two top spatial layers since they will + // be skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC with 3 spatial layers and on +// the fly switching to 1 and then 2 and back to 3 spatial layers. This switch +// is done by setting spatial layer bitrates to 0, and then back to non-zero, +// during the sequence.
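+// (See the dynamic_drop_layer_ branch in PreEncodeFrameHook above: the two top +// layers are zeroed at frames 0 and 100, both restored at frame 50, the middle +// layer restored at frame 150 and the top layer at frame 200.)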
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL_DisableEnableLayers) { + SetSvcConfig(3, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 0; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + dynamic_drop_layer_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Don't check rate targeting on the two top spatial layers since they will + // be skipped for part of the sequence. + CheckLayerRateTargeting(number_spatial_layers_ - 2, number_temporal_layers_, + 0.78, 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 1 temporal layer, 2 spatial layers, with spatial +// downscale 5x5. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL1TL5x5MultipleRuns) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 3; + cfg_.temporal_layering_mode = 0; + svc_params_.scaling_factor_num[0] = 256; + svc_params_.scaling_factor_den[0] = 1280; + svc_params_.scaling_factor_num[1] = 1280; + svc_params_.scaling_factor_den[1] = 1280; + cfg_.rc_dropframe_thresh = 10; + cfg_.kf_max_dist = 999999; + cfg_.kf_min_dist = 0; + cfg_.ss_target_bitrate[0] = 300; + cfg_.ss_target_bitrate[1] = 1400; + cfg_.layer_target_bitrate[0] = 300; + cfg_.layer_target_bitrate[1] = 1400; + cfg_.rc_target_bitrate = 1700; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting and index for bitrate array.
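+// (Param decoding, assuming the usual libvpx test harness: GET_PARAM(0) is the +// codec factory supplied by VP9_INSTANTIATE_TEST_CASE, GET_PARAM(1) the speed +// setting, and GET_PARAM(2) the index into the bitrates[] array used below.)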
+class DatarateOnePassCbrSvcMultiBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params<int, int> { + public: + DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcMultiBR() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run CIF clip with 1 thread. +TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 200, 400, 600 }; + // TODO(marpan): Check that effective_datarate for each layer hits the + // layer target_bitrate. + cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)]; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.75, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting, layer framedrop control and index for bitrate array. +class DatarateOnePassCbrSvcFrameDropMultiBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith3Params<int, int, int> { + public: + DatarateOnePassCbrSvcFrameDropMultiBR() + : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads. +TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc2SL3TL4Threads) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.64, + 1.45); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Run HD clip with 4 threads.
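+// (layer_framedrop_ = GET_PARAM(2) selects the drop mode: 0 keeps the default +// full-superframe drop, 1 enables per-spatial-layer LAYER_DROP via +// VP9E_SET_SVC_FRAME_DROP_LAYER in PreEncodeFrameHook above.)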
+TEST_P(DatarateOnePassCbrSvcFrameDropMultiBR, OnePassCbrSvc3SL3TL4Threads) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + top_sl_width_ = 1280; + top_sl_height_ = 720; + layer_framedrop_ = 0; + const int bitrates[3] = { 200, 400, 600 }; + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + layer_framedrop_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.58, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting, inter-layer prediction mode. +class DatarateOnePassCbrSvcInterLayerPredSingleBR + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params<int, int> { + public: + DatarateOnePassCbrSvcInterLayerPredSingleBR() + : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + inter_layer_pred_mode_ = GET_PARAM(2); + ResetModel(); + } +}; + +// Check basic rate targeting with different inter-layer prediction modes for 1 +// pass CBR SVC: 3 spatial layers and 3 temporal layers. Run CIF clip with 1 +// thread. +TEST_P(DatarateOnePassCbrSvcInterLayerPredSingleBR, OnePassCbrSvc3SL3TL) { + // Disable test for inter-layer pred off for now since simulcast_mode fails. + if (inter_layer_pred_mode_ == INTER_LAYER_PRED_OFF) return; + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.temporal_layering_mode = 3; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 temporal +// layers, changing the target bitrate at the middle of encoding.
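+// (change_bitrate_ halves cfg_.rc_target_bitrate at frame 200 in +// PreEncodeFrameHook above, after first checking rate targeting for the +// initial segment; buffer levels are deliberately preserved across the change.)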
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLDynamicBitrateChange) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + cfg_.rc_target_bitrate = 800; + ResetModel(); + change_bitrate_ = true; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Params: speed setting, noise sensitivity, index for bitrate array and inter +// layer pred mode. +class DatarateOnePassCbrSvcDenoiser + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith4Params<int, int, int, int> { + public: + DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcDenoiser() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + inter_layer_pred_mode_ = GET_PARAM(3); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC with denoising. +// 2 spatial layers and 3 temporal layers. Run HD clip with 2 threads. +TEST_P(DatarateOnePassCbrSvcDenoiser, OnePassCbrSvc2SL3TLDenoiserOn) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 2; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + const int bitrates[3] = { 600, 800, 1000 }; + // TODO(marpan): Check that effective_datarate for each layer hits the + // layer target_bitrate. + // For SVC, noise_sen = 1 means denoising only the top spatial layer, + // noise_sen = 2 means denoising the two top spatial layers. + cfg_.rc_target_bitrate = bitrates[GET_PARAM(3)]; + ResetModel(); + denoiser_on_ = GET_PARAM(2); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Params: speed setting, key frame dist.
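+// (GET_PARAM(2) seeds the key frame distance; the instantiation at the bottom +// of this file uses ::testing::Range(32, 36), which the tests below apply +// either directly or offset by 32.)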
+class DatarateOnePassCbrSvcSmallKF + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWith2Params<int, int> { + public: + DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcSmallKF() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers. Run CIF clip with 1 thread, and a few short key frame +// periods. +TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc3SL3TLSmallKf) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 800; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 neighboring key frame periods (so key frame will land on 0-2-1-2). + const int kf_dist = GET_PARAM(2); + cfg_.kf_max_dist = kf_dist; + key_frame_spacing_ = kf_dist; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // TODO(jianj): webm:1554 + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 +// temporal layers. Run CIF clip with 1 thread, and a few short key frame +// periods. +TEST_P(DatarateOnePassCbrSvcSmallKF, OnePassCbrSvc2SL3TLSmallKf) { + SetSvcConfig(2, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 neighboring key frame periods (so key frame will land on 0-2-1-2). + const int kf_dist = GET_PARAM(2) + 32; + cfg_.kf_max_dist = kf_dist; + key_frame_spacing_ = kf_dist; + ResetModel(); + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3 +// temporal layers. Run VGA clip with 1 thread, and place layer sync frames: +// one at middle layer first, then another one for top layer, and another +// insert for base spatial layer (which forces a key frame).
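+// (Per the insert_layer_sync_ branch in PreEncodeFrameHook above: sync is +// requested on the middle layer at frame 150, the top layer at frame 240, and +// the base layer, which forces a key frame, at frame 320.)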
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TLSyncFrames) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.kf_max_dist = 9999; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + insert_layer_sync_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78, + 1.15); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 3 spatial layers, 1 temporal layer, with +// intra-only frame as sync frame on base spatial layer. +// Intra_only is inserted at the start and in the middle of the sequence. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL1TLSyncWithIntraOnly) { + SetSvcConfig(3, 1); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 4; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + cfg_.rc_target_bitrate = 400; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + insert_layer_sync_ = 1; + // Use intra_only frame for sync on base layer. + force_intra_only_frame_ = 1; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.2); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Run SVC encoder for 2 quality layers (same resolution, different +// bitrates), 1 temporal layer, with screen content mode. +TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2QL1TLScreen) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 2; + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + top_sl_width_ = 640; + top_sl_height_ = 480; + ResetModel(); + tune_content_ = 1; + // Set the layer bitrates, for 2 spatial layers, 1 temporal.
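+ // (Quality layers: both use scaling factor 1/1 above, so they share the + // full 640x480 resolution and differ only in the 100/300 kbps split below.)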
+ cfg_.rc_target_bitrate = 400; + cfg_.ss_target_bitrate[0] = 100; + cfg_.ss_target_bitrate[1] = 300; + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 300; + for (int sl = 0; sl < 2; ++sl) { + float layer_framerate = 30.0; + layer_target_avg_bandwidth_[sl] = static_cast<int>( + cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate); + bits_in_buffer_model_[sl] = + cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: speed setting. +class DatarateOnePassCbrSvcPostencodeDrop + : public DatarateOnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam<int> { + public: + DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { + memset(&svc_params_, 0, sizeof(svc_params_)); + } + virtual ~DatarateOnePassCbrSvcPostencodeDrop() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + speed_setting_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Run SVC encoder for 2 quality layers (same resolution, different +// bitrates), 1 temporal layer, with screen content mode. +TEST_P(DatarateOnePassCbrSvcPostencodeDrop, OnePassCbrSvc2QL1TLScreen) { + cfg_.rc_buf_initial_sz = 200; + cfg_.rc_buf_optimal_sz = 200; + cfg_.rc_buf_sz = 400; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 52; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.ss_number_layers = 2; + cfg_.ts_number_layers = 1; + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 2; + svc_params_.scaling_factor_num[0] = 1; + svc_params_.scaling_factor_den[0] = 1; + svc_params_.scaling_factor_num[1] = 1; + svc_params_.scaling_factor_den[1] = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + top_sl_width_ = 352; + top_sl_height_ = 288; + ResetModel(); + base_speed_setting_ = speed_setting_; + tune_content_ = 1; + use_post_encode_drop_ = 1; + // Set the layer bitrates, for 2 spatial layers, 1 temporal. + cfg_.rc_target_bitrate = 400; + cfg_.ss_target_bitrate[0] = 100; + cfg_.ss_target_bitrate[1] = 300; + cfg_.layer_target_bitrate[0] = 100; + cfg_.layer_target_bitrate[1] = 300; + for (int sl = 0; sl < 2; ++sl) { + float layer_framerate = 30.0; + layer_target_avg_bandwidth_[sl] = static_cast<int>( + cfg_.layer_target_bitrate[sl] * 1000.0 / layer_framerate); + bits_in_buffer_model_[sl] = + cfg_.layer_target_bitrate[sl] * cfg_.rc_buf_initial_sz; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.73, + 1.25); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSingleBR, + ::testing::Range(5, 10)); + +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcPostencodeDrop, + ::testing::Range(5, 6)); + +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcInterLayerPredSingleBR, + ::testing::Range(5, 10), ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcMultiBR, ::testing::Range(5, 10), + ::testing::Range(0, 3)); + +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcFrameDropMultiBR, + ::testing::Range(5, 10), ::testing::Range(0, 2), + ::testing::Range(0, 3)); + +#if CONFIG_VP9_TEMPORAL_DENOISING +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcDenoiser, + ::testing::Range(5, 10), ::testing::Range(1, 3), + ::testing::Range(0, 3), ::testing::Range(0, 4)); +#endif + +VP9_INSTANTIATE_TEST_CASE(DatarateOnePassCbrSvcSmallKF, ::testing::Range(5, 10), + ::testing::Range(32, 36)); +} // namespace +} // namespace svc_test diff --git a/libs/libvpx/test/svc_end_to_end_test.cc b/libs/libvpx/test/svc_end_to_end_test.cc new file mode 100644 index 00000000000..82259ac30ca --- /dev/null +++ b/libs/libvpx/test/svc_end_to_end_test.cc @@ -0,0 +1,481 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/svc_test.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +namespace { + +typedef enum { + // Inter-layer prediction is on for all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off for all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off for non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on for all frames, but constrained such + // that any layer S (> 0) can only predict from the previous spatial + // layer S-1, within the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +class ScalePartitionOnePassCbrSvc + : public OnePassCbrSvc, + public ::testing::TestWithParam<const ::libvpx_test::CodecFactory *> { + public: + ScalePartitionOnePassCbrSvc() + : OnePassCbrSvc(GetParam()), mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + } + + protected: + virtual ~ScalePartitionOnePassCbrSvc() {} + + virtual void SetUp() { + InitializeConfig(); + speed_setting_ = 7; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + PreEncodeFrameHookSetup(video, encoder); + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0.
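+ // (For example, in the 3SL3TL run below, a TL2 frame of the top spatial + // layer is never used as a reference, so the encoder may skip its + // loopfilter and the decoder check counts it as an expected mismatch.)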
+ if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1]) + num_nonref_frames_++; + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + ++mismatch_nframes_; + } + + virtual void SetConfig(const int /*num_temporal_layer*/) {} + + unsigned int GetMismatchFrames() const { return mismatch_nframes_; } + unsigned int GetNonRefFrames() const { return num_nonref_frames_; } + + private: + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; +}; + +TEST_P(ScalePartitionOnePassCbrSvc, OnePassCbrSvc3SL3TL1080P) { + SetSvcConfig(3, 3); + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_target_bitrate = 800; + cfg_.kf_max_dist = 9999; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + ::libvpx_test::I420VideoSource video( + "slides_code_term_web_plot.1920_1080.yuv", 1920, 1080, 30, 1, 0, 100); + // For this 3 temporal layer case, pattern repeats every 4 frames, so choose + // 4 neighboring key frame periods (so key frame will land on 0-2-1-2). + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Params: Inter layer prediction modes. +class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, + public ::libvpx_test::CodecTestWithParam<int> { + public: + SyncFrameOnePassCbrSvc() + : OnePassCbrSvc(GET_PARAM(0)), current_video_frame_(0), + frame_to_start_decode_(0), frame_to_sync_(0), + inter_layer_pred_mode_(GET_PARAM(1)), decode_to_layer_before_sync_(-1), + decode_to_layer_after_sync_(-1), denoiser_on_(0), + intra_only_test_(false), mismatch_nframes_(0), num_nonref_frames_(0) { + SetMode(::libvpx_test::kRealTime); + memset(&svc_layer_sync_, 0, sizeof(svc_layer_sync_)); + } + + protected: + virtual ~SyncFrameOnePassCbrSvc() {} + + virtual void SetUp() { + InitializeConfig(); + speed_setting_ = 7; + } + + virtual bool DoDecode() const { + return current_video_frame_ >= frame_to_start_decode_; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + current_video_frame_ = video->frame(); + PreEncodeFrameHookSetup(video, encoder); + if (video->frame() == 0) { + // Do not turn off inter-layer pred completely because simulcast mode + // fails. + if (inter_layer_pred_mode_ != INTER_LAYER_PRED_OFF) + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, inter_layer_pred_mode_); + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + if (intra_only_test_) + // Decoder sets the color_space for Intra-only frames + // to BT_601 (see line 1810 in vp9_decodeframe.c). + // So set it here in these tests to avoid encoder-decoder + // mismatch check on color space setting.
+ encoder->Control(VP9E_SET_COLOR_SPACE, VPX_CS_BT_601); + } + if (video->frame() == frame_to_sync_) { + encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_); + } + } + +#if CONFIG_VP9_DECODER + virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) { + if (video->frame() < frame_to_sync_) { + if (decode_to_layer_before_sync_ >= 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, + decode_to_layer_before_sync_); + } else { + if (decode_to_layer_after_sync_ >= 0) + decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, + decode_to_layer_after_sync_); + } + } +#endif + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + // Keep track of number of non-reference frames, needed for mismatch check. + // Non-reference frames are top spatial and temporal layer frames, + // for TL > 0. + if (temporal_layer_id_ == number_temporal_layers_ - 1 && + temporal_layer_id_ > 0 && + pkt->data.frame.spatial_layer_encoded[number_spatial_layers_ - 1] && + current_video_frame_ >= frame_to_sync_) + num_nonref_frames_++; + + if (intra_only_test_ && current_video_frame_ == frame_to_sync_) { + // Intra-only frame is only generated for spatial layers > 1 and <= 3, + // among other conditions (see the constraint in set_intra_only_frame()). + // If intra-only is not allowed then the encoder will insert a key frame + // instead. + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (number_spatial_layers_ == 1 || number_spatial_layers_ > 3) + ASSERT_TRUE(key_frame); + else + ASSERT_FALSE(key_frame); + } + } + + virtual void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) { + if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; + } + + unsigned int GetMismatchFrames() const { return mismatch_nframes_; } + unsigned int GetNonRefFrames() const { return num_nonref_frames_; } + + unsigned int current_video_frame_; + unsigned int frame_to_start_decode_; + unsigned int frame_to_sync_; + int inter_layer_pred_mode_; + int decode_to_layer_before_sync_; + int decode_to_layer_after_sync_; + int denoiser_on_; + bool intra_only_test_; + vpx_svc_spatial_layer_sync_t svc_layer_sync_; + + private: + virtual void SetConfig(const int num_temporal_layer) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 1; + cfg_.g_threads = 1; + cfg_.rc_dropframe_thresh = 30; + cfg_.kf_max_dist = 9999; + if (num_temporal_layer == 3) { + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + cfg_.temporal_layering_mode = 3; + } else if (num_temporal_layer == 2) { + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + cfg_.temporal_layering_mode = 2; + } else if (num_temporal_layer == 1) { + cfg_.ts_rate_decimator[0] = 1; + cfg_.temporal_layering_mode = 1; + } + } + + unsigned int mismatch_nframes_; + unsigned int num_nonref_frames_; +}; + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Only start decoding on the sync layer. +// Full sync: insert key frame on base layer. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLFullSync) { + SetSvcConfig(3, 3); + // Sync is on base layer so the frame to sync and the frame to start decoding + // is the same.
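+ // (DoDecode() above returns false until frame_to_start_decode_, so the + // decoder first sees the stream at the layer-sync key frame, frame 20.)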
+ frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = -1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncToVGA) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA and VGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 1; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Test for sync layer for 1 pass CBR SVC: 3 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// HD on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncToVGAHD) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 2; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. 
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Test for sync layer for 1 pass CBR SVC: 2 spatial layers and +// 3 temporal layers. Decoding QVGA before sync frame and decode up to +// VGA on and after sync. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc2SL3TLSyncFrameVGADenoise) { + SetSvcConfig(2, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 100; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 1; + + denoiser_on_ = 1; + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 0; + svc_layer_sync_.spatial_layer_sync[0] = 0; + svc_layer_sync_.spatial_layer_sync[1] = 1; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 400; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} +#endif + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have 4 frames. Thus set the + // layer to decode after sync frame to 3. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 0; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from beginning of sequence, during sequence insert intra-only +// on base/qvga layer and sync_layer on middle/VGA layer. Decode all layers. +TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyVGA) { + SetSvcConfig(3, 3); + frame_to_start_decode_ = 0; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 2; + // The superframe containing intra-only layer will have 4 frames. Thus set the + // layer to decode after sync frame to 3. + decode_to_layer_after_sync_ = 3; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + svc_layer_sync_.spatial_layer_sync[1] = 1; + svc_layer_sync_.spatial_layer_sync[2] = 0; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +// Start decoding from sync frame, insert intra-only on base/qvga layer. Decode +// all layers. For 1 spatial layer, it inserts a key frame. 
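+//
+// A minimal sketch of how an application could request such a sync frame
+// through the control exercised by these tests (hypothetical standalone
+// usage; `codec` is assumed to be an initialized VP9 encoder context and
+// error handling is omitted):
+//
+//   vpx_svc_spatial_layer_sync_t sync;
+//   memset(&sync, 0, sizeof(sync));
+//   sync.base_layer_intra_only = 1;  // request an intra-only base layer
+//   sync.spatial_layer_sync[0] = 1;  // re-sync spatial layer 0
+//   vpx_codec_control(&codec, VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &sync);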
+TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc1SL3TLSyncFrameIntraOnlyQVGA) { + SetSvcConfig(1, 3); + frame_to_start_decode_ = 20; + frame_to_sync_ = 20; + decode_to_layer_before_sync_ = 0; + decode_to_layer_after_sync_ = 0; + intra_only_test_ = true; + + // Set up svc layer sync structure. + svc_layer_sync_.base_layer_intra_only = 1; + svc_layer_sync_.spatial_layer_sync[0] = 1; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); + cfg_.rc_target_bitrate = 600; + AssignLayerBitrates(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +#if CONFIG_VP9_DECODER + // The non-reference frames are expected to be mismatched frames as the + // encoder will avoid loopfilter on these frames. + EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames()); +#endif +} + +VP9_INSTANTIATE_TEST_CASE(SyncFrameOnePassCbrSvc, ::testing::Range(0, 3)); + +INSTANTIATE_TEST_CASE_P( + VP9, ScalePartitionOnePassCbrSvc, + ::testing::Values( + static_cast(&libvpx_test::kVP9))); + +} // namespace +} // namespace svc_test diff --git a/libs/libvpx/test/svc_test.cc b/libs/libvpx/test/svc_test.cc index 482d9fffa1d..4798c771832 100644 --- a/libs/libvpx/test/svc_test.cc +++ b/libs/libvpx/test/svc_test.cc @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,782 +8,127 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include -#include "third_party/googletest/src/include/gtest/gtest.h" -#include "test/codec_factory.h" -#include "test/decode_test_driver.h" -#include "test/i420_video_source.h" - -#include "vp9/decoder/vp9_decoder.h" - -#include "vpx/svc_context.h" -#include "vpx/vp8cx.h" -#include "vpx/vpx_encoder.h" - -namespace { - -using libvpx_test::CodecFactory; -using libvpx_test::Decoder; -using libvpx_test::DxDataIterator; -using libvpx_test::VP9CodecFactory; - -class SvcTest : public ::testing::Test { - protected: - static const uint32_t kWidth = 352; - static const uint32_t kHeight = 288; - - SvcTest() - : codec_iface_(0), test_file_name_("hantro_collage_w352h288.yuv"), - codec_initialized_(false), decoder_(0) { - memset(&svc_, 0, sizeof(svc_)); - memset(&codec_, 0, sizeof(codec_)); - memset(&codec_enc_, 0, sizeof(codec_enc_)); - } - - virtual ~SvcTest() {} - - virtual void SetUp() { - svc_.log_level = SVC_LOG_DEBUG; - svc_.log_print = 0; - - codec_iface_ = vpx_codec_vp9_cx(); - const vpx_codec_err_t res = - vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0); - EXPECT_EQ(VPX_CODEC_OK, res); - - codec_enc_.g_w = kWidth; - codec_enc_.g_h = kHeight; - codec_enc_.g_timebase.num = 1; - codec_enc_.g_timebase.den = 60; - codec_enc_.kf_min_dist = 100; - codec_enc_.kf_max_dist = 100; - - vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t(); - VP9CodecFactory codec_factory; - decoder_ = codec_factory.CreateDecoder(dec_cfg, 0); - - tile_columns_ = 0; - tile_rows_ = 0; - } - - virtual void TearDown() { - ReleaseEncoder(); - delete (decoder_); - } - - void InitializeEncoder() { - const vpx_codec_err_t res = - vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_OK, res); - vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4); // Make the test faster - vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_); - vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_); - codec_initialized_ = true; - } - - void ReleaseEncoder() 
{ - vpx_svc_release(&svc_); - if (codec_initialized_) vpx_codec_destroy(&codec_); - codec_initialized_ = false; +#include "test/svc_test.h" + +namespace svc_test { +void OnePassCbrSvc::SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer) { + SetConfig(num_temporal_layer); + cfg_.ss_number_layers = num_spatial_layer; + cfg_.ts_number_layers = num_temporal_layer; + if (num_spatial_layer == 1) { + svc_params_.scaling_factor_num[0] = 288; + svc_params_.scaling_factor_den[0] = 288; + } else if (num_spatial_layer == 2) { + svc_params_.scaling_factor_num[0] = 144; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 288; + svc_params_.scaling_factor_den[1] = 288; + } else if (num_spatial_layer == 3) { + svc_params_.scaling_factor_num[0] = 72; + svc_params_.scaling_factor_den[0] = 288; + svc_params_.scaling_factor_num[1] = 144; + svc_params_.scaling_factor_den[1] = 288; + svc_params_.scaling_factor_num[2] = 288; + svc_params_.scaling_factor_den[2] = 288; } + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; +} - void GetStatsData(std::string *const stats_buf) { - vpx_codec_iter_t iter = NULL; - const vpx_codec_cx_pkt_t *cx_pkt; - - while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) { - if (cx_pkt->kind == VPX_CODEC_STATS_PKT) { - EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U); - ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL); - stats_buf->append(static_cast(cx_pkt->data.twopass_stats.buf), - cx_pkt->data.twopass_stats.sz); - } +void OnePassCbrSvc::PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + for (int i = 0; i < VPX_MAX_LAYERS; ++i) { + svc_params_.max_quantizers[i] = 63; + svc_params_.min_quantizers[i] = 0; } - } - - void Pass1EncodeNFrames(const int n, const int layers, - std::string *const stats_buf) { - vpx_codec_err_t res; - - ASSERT_GT(n, 0); - ASSERT_GT(layers, 0); - svc_.spatial_layers = layers; - codec_enc_.g_pass = VPX_RC_FIRST_PASS; - InitializeEncoder(); - - libvpx_test::I420VideoSource video( - test_file_name_, codec_enc_.g_w, codec_enc_.g_h, - codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30); - video.Begin(); - - for (int i = 0; i < n; ++i) { - res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(), - video.duration(), VPX_DL_GOOD_QUALITY); - ASSERT_EQ(VPX_CODEC_OK, res); - GetStatsData(stats_buf); - video.Next(); + svc_params_.speed_per_layer[0] = base_speed_setting_; + for (int i = 1; i < VPX_SS_MAX_LAYERS; ++i) { + svc_params_.speed_per_layer[i] = speed_setting_; } - // Flush encoder and test EOS packet. 
- res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(), video.duration(), - VPX_DL_GOOD_QUALITY); - ASSERT_EQ(VPX_CODEC_OK, res); - GetStatsData(stats_buf); - - ReleaseEncoder(); + encoder->Control(VP9E_SET_SVC, 1); + encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + encoder->Control(VP8E_SET_CPUUSED, speed_setting_); + encoder->Control(VP9E_SET_AQ_MODE, 3); + encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 300); + encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads)); + encoder->Control(VP9E_SET_ROW_MT, 1); + encoder->Control(VP8E_SET_STATIC_THRESHOLD, 1); } - void StoreFrames(const size_t max_frame_received, - struct vpx_fixed_buf *const outputs, - size_t *const frame_received) { - vpx_codec_iter_t iter = NULL; - const vpx_codec_cx_pkt_t *cx_pkt; - - while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) { - if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) { - const size_t frame_size = cx_pkt->data.frame.sz; - - EXPECT_GT(frame_size, 0U); - ASSERT_TRUE(cx_pkt->data.frame.buf != NULL); - ASSERT_LT(*frame_received, max_frame_received); - - if (*frame_received == 0) - EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)); - - outputs[*frame_received].buf = malloc(frame_size + 16); - ASSERT_TRUE(outputs[*frame_received].buf != NULL); - memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf, - frame_size); - outputs[*frame_received].sz = frame_size; - ++(*frame_received); - } + superframe_count_++; + temporal_layer_id_ = 0; + if (number_temporal_layers_ == 2) { + temporal_layer_id_ = (superframe_count_ % 2 != 0); + } else if (number_temporal_layers_ == 3) { + if (superframe_count_ % 2 != 0) temporal_layer_id_ = 2; + if (superframe_count_ > 1) { + if ((superframe_count_ - 2) % 4 == 0) temporal_layer_id_ = 1; } } - void Pass2EncodeNFrames(std::string *const stats_buf, const int n, - const int layers, - struct vpx_fixed_buf *const outputs) { - vpx_codec_err_t res; - size_t frame_received = 0; - - ASSERT_TRUE(outputs != NULL); - ASSERT_GT(n, 0); - ASSERT_GT(layers, 0); - svc_.spatial_layers = layers; - codec_enc_.rc_target_bitrate = 500; - if (codec_enc_.g_pass == VPX_RC_LAST_PASS) { - ASSERT_TRUE(stats_buf != NULL); - ASSERT_GT(stats_buf->size(), 0U); - codec_enc_.rc_twopass_stats_in.buf = &(*stats_buf)[0]; - codec_enc_.rc_twopass_stats_in.sz = stats_buf->size(); - } - InitializeEncoder(); - - libvpx_test::I420VideoSource video( - test_file_name_, codec_enc_.g_w, codec_enc_.g_h, - codec_enc_.g_timebase.den, codec_enc_.g_timebase.num, 0, 30); - video.Begin(); + frame_flags_ = 0; +} - for (int i = 0; i < n; ++i) { - res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(), - video.duration(), VPX_DL_GOOD_QUALITY); - ASSERT_EQ(VPX_CODEC_OK, res); - StoreFrames(n, outputs, &frame_received); - video.Next(); +void OnePassCbrSvc::PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + temporal_layer_id_ = layer_id.temporal_layer_id; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast(layer_target_avg_bandwidth_[layer]); } - - // Flush encoder. 
- res = vpx_svc_encode(&svc_, &codec_, NULL, 0, video.duration(), - VPX_DL_GOOD_QUALITY); - EXPECT_EQ(VPX_CODEC_OK, res); - StoreFrames(n, outputs, &frame_received); - - EXPECT_EQ(frame_received, static_cast(n)); - - ReleaseEncoder(); } +} - void DecodeNFrames(const struct vpx_fixed_buf *const inputs, const int n) { - int decoded_frames = 0; - int received_frames = 0; - - ASSERT_TRUE(inputs != NULL); - ASSERT_GT(n, 0); - - for (int i = 0; i < n; ++i) { - ASSERT_TRUE(inputs[i].buf != NULL); - ASSERT_GT(inputs[i].sz, 0U); - const vpx_codec_err_t res_dec = decoder_->DecodeFrame( - static_cast(inputs[i].buf), inputs[i].sz); - ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError(); - ++decoded_frames; - - DxDataIterator dec_iter = decoder_->GetDxData(); - while (dec_iter.Next() != NULL) { - ++received_frames; - } +void OnePassCbrSvc::AssignLayerBitrates() { + int sl, spatial_layer_target; + int spatial_layers = cfg_.ss_number_layers; + int temporal_layers = cfg_.ts_number_layers; + float total = 0; + float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; + for (sl = 0; sl < spatial_layers; ++sl) { + if (svc_params_.scaling_factor_den[sl] > 0) { + alloc_ratio[sl] = + static_cast((svc_params_.scaling_factor_num[sl] * 1.0 / + svc_params_.scaling_factor_den[sl])); + total += alloc_ratio[sl]; } - EXPECT_EQ(decoded_frames, n); - EXPECT_EQ(received_frames, n); } - - void DropEnhancementLayers(struct vpx_fixed_buf *const inputs, - const int num_super_frames, - const int remained_spatial_layers) { - ASSERT_TRUE(inputs != NULL); - ASSERT_GT(num_super_frames, 0); - ASSERT_GT(remained_spatial_layers, 0); - - for (int i = 0; i < num_super_frames; ++i) { - uint32_t frame_sizes[8] = { 0 }; - int frame_count = 0; - int frames_found = 0; - int frame; - ASSERT_TRUE(inputs[i].buf != NULL); - ASSERT_GT(inputs[i].sz, 0U); - - vpx_codec_err_t res = vp9_parse_superframe_index( - static_cast(inputs[i].buf), inputs[i].sz, - frame_sizes, &frame_count, NULL, NULL); - ASSERT_EQ(VPX_CODEC_OK, res); - - if (frame_count == 0) { - // There's no super frame but only a single frame. - ASSERT_EQ(1, remained_spatial_layers); - } else { - // Found a super frame. - uint8_t *frame_data = static_cast(inputs[i].buf); - uint8_t *frame_start = frame_data; - for (frame = 0; frame < frame_count; ++frame) { - // Looking for a visible frame. - if (frame_data[0] & 0x02) { - ++frames_found; - if (frames_found == remained_spatial_layers) break; - } - frame_data += frame_sizes[frame]; - } - ASSERT_LT(frame, frame_count) - << "Couldn't find a visible frame. " - << "remained_spatial_layers: " << remained_spatial_layers - << " super_frame: " << i; - if (frame == frame_count - 1) continue; - - frame_data += frame_sizes[frame]; - - // We need to add one more frame for multiple frame contexts. - uint8_t marker = - static_cast(inputs[i].buf)[inputs[i].sz - 1]; - const uint32_t mag = ((marker >> 3) & 0x3) + 1; - const size_t index_sz = 2 + mag * frame_count; - const size_t new_index_sz = 2 + mag * (frame + 1); - marker &= 0x0f8; - marker |= frame; - - // Copy existing frame sizes. - memmove(frame_data + 1, frame_start + inputs[i].sz - index_sz + 1, - new_index_sz - 2); - // New marker. 
- frame_data[0] = marker; - frame_data += (mag * (frame + 1) + 1); - - *frame_data++ = marker; - inputs[i].sz = frame_data - frame_start; - } + for (sl = 0; sl < spatial_layers; ++sl) { + cfg_.ss_target_bitrate[sl] = spatial_layer_target = + static_cast(cfg_.rc_target_bitrate * alloc_ratio[sl] / + total); + const int index = sl * temporal_layers; + if (cfg_.temporal_layering_mode == 3) { + cfg_.layer_target_bitrate[index] = spatial_layer_target >> 1; + cfg_.layer_target_bitrate[index + 1] = + (spatial_layer_target >> 1) + (spatial_layer_target >> 2); + cfg_.layer_target_bitrate[index + 2] = spatial_layer_target; + } else if (cfg_.temporal_layering_mode == 2) { + cfg_.layer_target_bitrate[index] = spatial_layer_target * 2 / 3; + cfg_.layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (cfg_.temporal_layering_mode <= 1) { + cfg_.layer_target_bitrate[index] = spatial_layer_target; } } - - void FreeBitstreamBuffers(struct vpx_fixed_buf *const inputs, const int n) { - ASSERT_TRUE(inputs != NULL); - ASSERT_GT(n, 0); - - for (int i = 0; i < n; ++i) { - free(inputs[i].buf); - inputs[i].buf = NULL; - inputs[i].sz = 0; + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth_[layer] = static_cast( + cfg_.layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model_[layer] = + cfg_.layer_target_bitrate[layer] * cfg_.rc_buf_initial_sz; } } - - SvcContext svc_; - vpx_codec_ctx_t codec_; - struct vpx_codec_enc_cfg codec_enc_; - vpx_codec_iface_t *codec_iface_; - std::string test_file_name_; - bool codec_initialized_; - Decoder *decoder_; - int tile_columns_; - int tile_rows_; -}; - -TEST_F(SvcTest, SvcInit) { - // test missing parameters - vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 6; // too many layers - res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 0; // use default layers - InitializeEncoder(); - EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers); -} - -TEST_F(SvcTest, InitTwoLayers) { - svc_.spatial_layers = 2; - InitializeEncoder(); -} - -TEST_F(SvcTest, InvalidOptions) { - vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "not-an-option=1"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); -} - -TEST_F(SvcTest, SetLayersOption) { - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "spatial-layers=3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); - EXPECT_EQ(3, svc_.spatial_layers); -} - -TEST_F(SvcTest, SetMultipleOptions) { - vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "spatial-layers=2 scale-factors=1/3,2/3"); - 
EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); - EXPECT_EQ(2, svc_.spatial_layers); } - -TEST_F(SvcTest, SetScaleFactorsOption) { - svc_.spatial_layers = 2; - vpx_codec_err_t res = - vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3, 3*3"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3"); - EXPECT_EQ(VPX_CODEC_OK, res); - InitializeEncoder(); -} - -TEST_F(SvcTest, SetQuantizersOption) { - svc_.spatial_layers = 2; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "max-quantizers=nothing"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "min-quantizers=nothing"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "min-quantizers=40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=30,30 min-quantizers=40,40"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "max-quantizers=40,40 min-quantizers=30,30"); - InitializeEncoder(); -} - -TEST_F(SvcTest, SetAutoAltRefOption) { - svc_.spatial_layers = 5; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "auto-alt-refs=none"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - res = vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1,1,0"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0"); - InitializeEncoder(); -} - -// Test that decoder can handle an SVC frame as the first frame in a sequence. 
-TEST_F(SvcTest, OnePassEncodeOneFrame) { - codec_enc_.g_pass = VPX_RC_ONE_PASS; - vpx_fixed_buf output = vpx_fixed_buf(); - Pass2EncodeNFrames(NULL, 1, 2, &output); - DecodeNFrames(&output, 1); - FreeBitstreamBuffers(&output, 1); -} - -TEST_F(SvcTest, OnePassEncodeThreeFrames) { - codec_enc_.g_pass = VPX_RC_ONE_PASS; - codec_enc_.g_lag_in_frames = 0; - vpx_fixed_buf outputs[3]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(NULL, 3, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 3); - FreeBitstreamBuffers(&outputs[0], 3); -} - -TEST_F(SvcTest, TwoPassEncode10Frames) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode20FramesWithAltRef) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(20, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, TwoPassEncode2SpatialLayersDecodeBaseLayerOnly) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode5SpatialLayersDecode54321Layers) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 5, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=0,1,1,1,0"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 5, &outputs[0]); - - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 4); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 3); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2SNRLayers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); - Pass1EncodeNFrames(20, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 scale-factors=1/1,1/1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, TwoPassEncode3SNRLayersDecode321Layers) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); - Pass1EncodeNFrames(20, 3, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - 
vpx_svc_set_options(&svc_, "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1"); - vpx_fixed_buf outputs[20]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 20, 3, &outputs[0]); - DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 2); - DecodeNFrames(&outputs[0], 20); - DropEnhancementLayers(&outputs[0], 20, 1); - DecodeNFrames(&outputs[0], 20); - - FreeBitstreamBuffers(&outputs[0], 20); -} - -TEST_F(SvcTest, SetMultipleFrameContextsOption) { - svc_.spatial_layers = 5; - vpx_codec_err_t res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); - EXPECT_EQ(VPX_CODEC_OK, res); - res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_); - EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res); - - svc_.spatial_layers = 2; - res = vpx_svc_set_options(&svc_, "multi-frame-contexts=1"); - InitializeEncoder(); -} - -TEST_F(SvcTest, TwoPassEncode2SpatialLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode2SpatialLayersWithMultipleFrameContextsDecodeBaselayer) { - // First pass encode - std::string stats_buf; - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, "auto-alt-refs=1,1 multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2SNRLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1"); - Pass1EncodeNFrames(10, 2, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1,1 scale-factors=1/1,1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 2, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode3SNRLayersWithMultipleFrameContextsDecode321Layer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1,1/1,1/1"); - Pass1EncodeNFrames(10, 3, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1,1,1 scale-factors=1/1,1/1,1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 3, &outputs[0]); - - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 2); - DecodeNFrames(&outputs[0], 10); - DropEnhancementLayers(&outputs[0], 10, 1); - DecodeNFrames(&outputs[0], 10); - - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayers) { - // First pass encode - std::string stats_buf; - 
vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContexts) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersDecodeBaseLayer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - - vpx_fixed_buf base_layer[5]; - for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2]; - - DecodeNFrames(&base_layer[0], 5); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, - TwoPassEncode2TemporalLayersWithMultipleFrameContextsDecodeBaseLayer) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - - vpx_fixed_buf base_layer[5]; - for (int i = 0; i < 5; ++i) base_layer[i] = outputs[i * 2]; - - DecodeNFrames(&base_layer[0], 5); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1"); - codec_enc_.g_w = 704; - codec_enc_.g_h = 144; - tile_columns_ = 1; - tile_rows_ = 1; - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) { - // First pass encode - std::string stats_buf; - vpx_svc_set_options(&svc_, "scale-factors=1/1"); - svc_.temporal_layers = 2; - 
Pass1EncodeNFrames(10, 1, &stats_buf); - - // Second pass encode - codec_enc_.g_pass = VPX_RC_LAST_PASS; - svc_.temporal_layers = 2; - codec_enc_.g_error_resilient = 0; - codec_enc_.g_w = 704; - codec_enc_.g_h = 144; - tile_columns_ = 1; - tile_rows_ = 1; - vpx_svc_set_options(&svc_, - "auto-alt-refs=1 scale-factors=1/1 " - "multi-frame-contexts=1"); - vpx_fixed_buf outputs[10]; - memset(&outputs[0], 0, sizeof(outputs)); - Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]); - DecodeNFrames(&outputs[0], 10); - FreeBitstreamBuffers(&outputs[0], 10); -} - -} // namespace +} // namespace svc_test diff --git a/libs/libvpx/test/svc_test.h b/libs/libvpx/test/svc_test.h new file mode 100644 index 00000000000..f1d727fd9de --- /dev/null +++ b/libs/libvpx/test/svc_test.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_TEST_SVC_TEST_H_ +#define VPX_TEST_SVC_TEST_H_ + +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace svc_test { +class OnePassCbrSvc : public ::libvpx_test::EncoderTest { + public: + explicit OnePassCbrSvc(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec), base_speed_setting_(0), speed_setting_(0), + superframe_count_(0), temporal_layer_id_(0), number_temporal_layers_(0), + number_spatial_layers_(0) { + memset(&svc_params_, 0, sizeof(svc_params_)); + memset(bits_in_buffer_model_, 0, + sizeof(bits_in_buffer_model_[0]) * VPX_MAX_LAYERS); + memset(layer_target_avg_bandwidth_, 0, + sizeof(layer_target_avg_bandwidth_[0]) * VPX_MAX_LAYERS); + } + + protected: + virtual ~OnePassCbrSvc() {} + + virtual void SetConfig(const int num_temporal_layer) = 0; + + virtual void SetSvcConfig(const int num_spatial_layer, + const int num_temporal_layer); + + virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder); + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder); + + virtual void AssignLayerBitrates(); + + virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {} + + vpx_svc_extra_cfg_t svc_params_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; + int base_speed_setting_; + int speed_setting_; + int superframe_count_; + int temporal_layer_id_; + int number_temporal_layers_; + int number_spatial_layers_; +}; +} // namespace svc_test + +#endif // VPX_TEST_SVC_TEST_H_ diff --git a/libs/libvpx/test/temporal_filter_test.cc b/libs/libvpx/test/temporal_filter_test.cc deleted file mode 100644 index 655a36be9a4..00000000000 --- a/libs/libvpx/test/temporal_filter_test.cc +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (c) 2016 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <limits>
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "test/acm_random.h"
-#include "test/buffer.h"
-#include "test/register_state_check.h"
-#include "vpx_ports/vpx_timer.h"
-
-namespace {
-
-using ::libvpx_test::ACMRandom;
-using ::libvpx_test::Buffer;
-
-typedef void (*TemporalFilterFunc)(const uint8_t *a, unsigned int stride,
-                                   const uint8_t *b, unsigned int w,
-                                   unsigned int h, int filter_strength,
-                                   int filter_weight, unsigned int *accumulator,
-                                   uint16_t *count);
-
-// Calculate the difference between 'a' and 'b', sum in blocks of 9, and apply
-// filter based on strength and weight. Store the resulting filter amount in
-// 'count' and apply it to 'b' and store it in 'accumulator'.
-void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w,
-                      int h, int filter_strength, int filter_weight,
-                      Buffer<unsigned int> *accumulator,
-                      Buffer<uint16_t> *count) {
-  Buffer<int> diff_sq = Buffer<int>(w, h, 0);
-  ASSERT_TRUE(diff_sq.Init());
-  diff_sq.Set(0);
-
-  int rounding = 0;
-  if (filter_strength > 0) {
-    rounding = 1 << (filter_strength - 1);
-  }
-
-  // Calculate all the differences. Avoids re-calculating a bunch of extra
-  // values.
-  for (int height = 0; height < h; ++height) {
-    for (int width = 0; width < w; ++width) {
-      int diff = a.TopLeftPixel()[height * a.stride() + width] -
-                 b.TopLeftPixel()[height * b.stride() + width];
-      diff_sq.TopLeftPixel()[height * diff_sq.stride() + width] = diff * diff;
-    }
-  }
-
-  // For any given point, sum the neighboring values and calculate the
-  // modifier.
-  for (int height = 0; height < h; ++height) {
-    for (int width = 0; width < w; ++width) {
-      // Determine how many values are being summed.
-      int summed_values = 9;
-
-      if (height == 0 || height == (h - 1)) {
-        summed_values -= 3;
-      }
-
-      if (width == 0 || width == (w - 1)) {
-        if (summed_values == 6) {  // corner
-          summed_values -= 2;
-        } else {
-          summed_values -= 3;
-        }
-      }
-
-      // Sum the diff_sq of the surrounding values.
-      int sum = 0;
-      for (int idy = -1; idy <= 1; ++idy) {
-        for (int idx = -1; idx <= 1; ++idx) {
-          const int y = height + idy;
-          const int x = width + idx;
-
-          // If inside the border.
-          if (y >= 0 && y < h && x >= 0 && x < w) {
-            sum += diff_sq.TopLeftPixel()[y * diff_sq.stride() + x];
-          }
-        }
-      }
-
-      sum *= 3;
-      sum /= summed_values;
-      sum += rounding;
-      sum >>= filter_strength;
-
-      // Clamp the value and invert it.
-      if (sum > 16) sum = 16;
-      sum = 16 - sum;
-
-      sum *= filter_weight;
-
-      count->TopLeftPixel()[height * count->stride() + width] += sum;
-      accumulator->TopLeftPixel()[height * accumulator->stride() + width] +=
-          sum * b.TopLeftPixel()[height * b.stride() + width];
-    }
-  }
-}
-
-class TemporalFilterTest : public ::testing::TestWithParam<TemporalFilterFunc> {
- public:
-  virtual void SetUp() {
-    filter_func_ = GetParam();
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-  }
-
- protected:
-  TemporalFilterFunc filter_func_;
-  ACMRandom rnd_;
-};
-
-TEST_P(TemporalFilterTest, SizeCombinations) {
-  // Depending on subsampling this function may be called with values of 8 or 16
-  // for width and height, in any combination.
-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
-  ASSERT_TRUE(a.Init());
-
-  const int filter_weight = 2;
-  const int filter_strength = 6;
-
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      // The difference between the buffers must be small to pass the threshold
-      // to apply the filter.
-      a.Set(&rnd_, 0, 7);
-      b.Set(&rnd_, 0, 7);
-
-      accum_ref.Set(rnd_.Rand8());
-      accum_chk.CopyFrom(accum_ref);
-      count_ref.Set(rnd_.Rand8());
-      count_chk.CopyFrom(count_ref);
-      reference_filter(a, b, width, height, filter_strength, filter_weight,
-                       &accum_ref, &count_ref);
-      ASM_REGISTER_STATE_CHECK(
-          filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                       height, filter_strength, filter_weight,
-                       accum_chk.TopLeftPixel(), count_chk.TopLeftPixel()));
-      EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-      EXPECT_TRUE(count_chk.CheckValues(count_ref));
-      if (HasFailure()) {
-        printf("Width: %d Height: %d\n", width, height);
-        count_chk.PrintDifference(count_ref);
-        accum_chk.PrintDifference(accum_ref);
-        return;
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterTest, CompareReferenceRandom) {
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
-      ASSERT_TRUE(a.Init());
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
-        for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
-          for (int repeat = 0; repeat < 100; ++repeat) {
-            if (repeat < 50) {
-              a.Set(&rnd_, 0, 7);
-              b.Set(&rnd_, 0, 7);
-            } else {
-              // Check large (but close) values as well.
-              a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
-                    std::numeric_limits<uint8_t>::max());
-              b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7,
-                    std::numeric_limits<uint8_t>::max());
-            }
-
-            accum_ref.Set(rnd_.Rand8());
-            accum_chk.CopyFrom(accum_ref);
-            count_ref.Set(rnd_.Rand8());
-            count_chk.CopyFrom(count_ref);
-            reference_filter(a, b, width, height, filter_strength,
-                             filter_weight, &accum_ref, &count_ref);
-            ASM_REGISTER_STATE_CHECK(filter_func_(
-                a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width, height,
-                filter_strength, filter_weight, accum_chk.TopLeftPixel(),
-                count_chk.TopLeftPixel()));
-            EXPECT_TRUE(accum_chk.CheckValues(accum_ref));
-            EXPECT_TRUE(count_chk.CheckValues(count_ref));
-            if (HasFailure()) {
-              printf("Weight: %d Strength: %d\n", filter_weight,
-                     filter_strength);
-              count_chk.PrintDifference(count_ref);
-              accum_chk.PrintDifference(accum_ref);
-              return;
-            }
-          }
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterTest, DISABLED_Speed) {
-  Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
-  ASSERT_TRUE(a.Init());
-
-  const int filter_weight = 2;
-  const int filter_strength = 6;
-
-  for (int width = 8; width <= 16; width += 8) {
-    for (int height = 8; height <= 16; height += 8) {
-      // The second buffer must not have any border.
-      Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
-      ASSERT_TRUE(b.Init());
-      Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_ref.Init());
-      Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
-      ASSERT_TRUE(accum_chk.Init());
-      Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_ref.Init());
-      Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
-      ASSERT_TRUE(count_chk.Init());
-
-      a.Set(&rnd_, 0, 7);
-      b.Set(&rnd_, 0, 7);
-
-      accum_chk.Set(0);
-      count_chk.Set(0);
-
-      vpx_usec_timer timer;
-      vpx_usec_timer_start(&timer);
-      for (int i = 0; i < 10000; ++i) {
-        filter_func_(a.TopLeftPixel(), a.stride(), b.TopLeftPixel(), width,
-                     height, filter_strength, filter_weight,
-                     accum_chk.TopLeftPixel(), count_chk.TopLeftPixel());
-      }
-      vpx_usec_timer_mark(&timer);
-      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
-      printf("Temporal filter %dx%d time: %5d us\n", width, height,
-             elapsed_time);
-    }
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, TemporalFilterTest,
-                        ::testing::Values(&vp9_temporal_filter_apply_c));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_CASE_P(SSE4_1, TemporalFilterTest,
-                        ::testing::Values(&vp9_temporal_filter_apply_sse4_1));
-#endif  // HAVE_SSE4_1
-}  // namespace
diff --git a/libs/libvpx/test/test-data.mk b/libs/libvpx/test/test-data.mk
index f405e4ef14f..27a955760ad 100644
--- a/libs/libvpx/test/test-data.mk
+++ b/libs/libvpx/test/test-data.mk
@@ -3,14 +3,16 @@ LIBVPX_TEST_SRCS-yes += test-data.mk
 # Encoder test source
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv
 
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_444_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_440.yuv
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422.y4m
-LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_420_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_422_20f.y4m
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_444_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_12_440.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420_a10-1.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_8_420.y4m
@@ -734,8 +736,12 @@ endif  # CONFIG_VP9_HIGHBITDEPTH
 # Invalid files for testing libvpx error checking.
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-token-partition.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-02-v2.webm @@ -783,8 +789,13 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-2.web LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-3.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-629481.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1558.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-1562.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-crbug-667044.webm.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += crbug-1539.rawfile ifeq ($(CONFIG_DECODE_PERF_TESTS),yes) # Encode / Decode test diff --git a/libs/libvpx/test/test-data.sha1 b/libs/libvpx/test/test-data.sha1 index 99b4e1e4654..88f1e10d73b 100644 --- a/libs/libvpx/test/test-data.sha1 +++ b/libs/libvpx/test/test-data.sha1 @@ -17,13 +17,13 @@ df1a1453feb3c00d7d89746c7003b4163523bff3 *invalid-vp90-03-v3.webm d637297561dd904eb2c97a9015deeb31c4a1e8d2 *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm 3a204bdbeaa3c6458b77bcebb8366d107267f55d *invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res 9aa21d8b2cb9d39abe8a7bb6032dc66955fb4342 *noisy_clip_640_360.y4m -a432f96ff0a787268e2f94a8092ab161a18d1b06 *park_joy_90p_10_420.y4m -0b194cc312c3a2e84d156a221b0a5eb615dfddc5 *park_joy_90p_10_422.y4m -ff0e0a21dc2adc95b8c1b37902713700655ced17 *park_joy_90p_10_444.y4m +0936b837708ae68c034719f8e07596021c2c214f *park_joy_90p_10_420_20f.y4m +5727a853c083c1099f837d27967bc1322d50ed4f *park_joy_90p_10_422_20f.y4m +e13489470ef8e8b2a871a5640d795a42a39be58d *park_joy_90p_10_444_20f.y4m c934da6fb8cc54ee2a8c17c54cf6076dac37ead0 *park_joy_90p_10_440.yuv -614c32ae1eca391e867c70d19974f0d62664dd99 *park_joy_90p_12_420.y4m -c92825f1ea25c5c37855083a69faac6ac4641a9e *park_joy_90p_12_422.y4m -b592189b885b6cc85db55cc98512a197d73d3b34 *park_joy_90p_12_444.y4m +79b0dc1784635a7f291e21c4e8d66a29c496ab99 *park_joy_90p_12_420_20f.y4m +9cf22b0f809f7464c8b9058f0cfa9d905921cbd1 *park_joy_90p_12_422_20f.y4m +22b2a4abaecc4a9ade6bb503d25fb82367947e85 *park_joy_90p_12_444_20f.y4m 82c1bfcca368c2f22bad7d693d690d5499ecdd11 *park_joy_90p_12_440.yuv b9e1e90aece2be6e2c90d89e6ab2372d5f8c792d *park_joy_90p_8_420_a10-1.y4m 4e0eb61e76f0684188d9bc9f3ce61f6b6b77bb2c *park_joy_90p_8_420.y4m @@ -852,5 +852,16 @@ e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res 
fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +1a0e405606939f2febab1a21b30c37cb8f2c8cb1 *invalid-token-partition.ivf +90a8a95e7024f015b87f5483a65036609b3d1b74 *invalid-token-partition.ivf.res 17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 +a0fbbbc5dd50fd452096f4455a58c1a8c9f66697 *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf +a61774cf03fc584bd9f0904fc145253bb8ea6c4c *invalid-vp80-00-comprehensive-s17661_r01-05_b6-.ivf.res +894fae3afee0290546590823974203ab4b8abd95 *crbug-1539.rawfile +f1026c03efd5da21b381c8eb21f0d64e6d7e4ba3 *invalid-crbug-1558.ivf +eb198c25f861c3fe2cbd310de11eb96843019345 *invalid-crbug-1558.ivf.res +c62b005a9fd32c36a1b3f67de6840330f9915e34 *invalid-crbug-1562.ivf +f0cd8389948ad16085714d96567612136f6a46c5 *invalid-crbug-1562.ivf.res +bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv +094be4b80fa30bd227149ea16ab6476d549ea092 *slides_code_term_web_plot.1920_1080.yuv diff --git a/libs/libvpx/test/test.mk b/libs/libvpx/test/test.mk index a3716be60c4..8ab4932ce4e 100644 --- a/libs/libvpx/test/test.mk +++ b/libs/libvpx/test/test.mk @@ -1,4 +1,6 @@ LIBVPX_TEST_SRCS-yes += acm_random.h +LIBVPX_TEST_SRCS-yes += bench.h +LIBVPX_TEST_SRCS-yes += bench.cc LIBVPX_TEST_SRCS-yes += buffer.h LIBVPX_TEST_SRCS-yes += clear_system_state.h LIBVPX_TEST_SRCS-yes += codec_factory.h @@ -22,7 +24,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp8_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -46,9 +49,15 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += decode_corrupted.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_motion_vector_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_datarate_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.cc LIBVPX_TEST_SRCS-yes += decode_test_driver.h @@ -67,6 +76,7 @@ LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.cc LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.cc LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvparser.h LIBWEBM_PARSER_SRCS += ../third_party/libwebm/mkvparser/mkvreader.h +LIBWEBM_PARSER_SRCS += ../third_party/libwebm/common/webmids.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += $(LIBWEBM_PARSER_SRCS) LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../tools_common.h 
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc
@@ -161,7 +171,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc
 ifneq ($(CONFIG_REALTIME_ONLY),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc
 endif
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc
@@ -169,7 +179,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
-LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += blockiness_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_INTERNAL_STATS) += consistency_test.cc
 endif
diff --git a/libs/libvpx/test/test_intra_pred_speed.cc b/libs/libvpx/test/test_intra_pred_speed.cc
index 1cdeda410a7..0be9feefd99 100644
--- a/libs/libvpx/test/test_intra_pred_speed.cc
+++ b/libs/libvpx/test/test_intra_pred_speed.cc
@@ -313,6 +313,8 @@ INTRA_PRED_TEST(MSA, TestIntraPred32, vpx_dc_predictor_32x32_msa,
 #endif  // HAVE_MSA
 
 #if HAVE_VSX
+// TODO(crbug.com/webm/1522): Fix test failures.
+#if 0
 INTRA_PRED_TEST(VSX, TestIntraPred4, NULL, NULL, NULL, NULL, NULL,
                 vpx_h_predictor_4x4_vsx, NULL, NULL, NULL, NULL, NULL, NULL,
                 vpx_tm_predictor_4x4_vsx)
@@ -321,6 +323,7 @@ INTRA_PRED_TEST(VSX, TestIntraPred8, vpx_dc_predictor_8x8_vsx, NULL, NULL,
                 NULL, NULL, vpx_h_predictor_8x8_vsx, vpx_d45_predictor_8x8_vsx,
                 NULL, NULL, NULL, NULL, vpx_d63_predictor_8x8_vsx,
                 vpx_tm_predictor_8x8_vsx)
+#endif
 
 INTRA_PRED_TEST(VSX, TestIntraPred16, vpx_dc_predictor_16x16_vsx,
                 vpx_dc_left_predictor_16x16_vsx,
                 vpx_dc_top_predictor_16x16_vsx,
diff --git a/libs/libvpx/test/test_libvpx.cc b/libs/libvpx/test/test_libvpx.cc
index 30641ae8c89..3405e4566b6 100644
--- a/libs/libvpx/test/test_libvpx.cc
+++ b/libs/libvpx/test/test_libvpx.cc
@@ -61,7 +61,6 @@ int main(int argc, char **argv) {
 #if !CONFIG_SHARED
 // Shared library builds don't support whitebox tests
 // that exercise internal symbols.
-
 #if CONFIG_VP8
   vp8_rtcd();
 #endif  // CONFIG_VP8
diff --git a/libs/libvpx/test/test_vector_test.cc b/libs/libvpx/test/test_vector_test.cc
index 1879b3d2773..5a9737122f6 100644
--- a/libs/libvpx/test/test_vector_test.cc
+++ b/libs/libvpx/test/test_vector_test.cc
@@ -10,8 +10,11 @@
 #include <cstdio>
 #include <cstdlib>
+#include <memory>
 #include <set>
 #include <string>
+#include <tuple>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "../tools_common.h"
 #include "./vpx_config.h"
@@ -29,9 +32,10 @@
 namespace {
 
 const int kThreads = 0;
-const int kFileName = 1;
+const int kMtMode = 1;
+const int kFileName = 2;
 
-typedef std::tr1::tuple<int, const char *> DecodeParam;
+typedef std::tuple<int, int, const char *> DecodeParam;
 
 class TestVectorTest : public ::libvpx_test::DecoderTest,
                        public ::libvpx_test::CodecTestWithParam<DecodeParam> {
@@ -54,6 +58,25 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
         << "Md5 file open failed. Filename: " << md5_file_name_;
   }
 
+#if CONFIG_VP9_DECODER
+  virtual void PreDecodeFrameHook(
+      const libvpx_test::CompressedVideoSource &video,
+      libvpx_test::Decoder *decoder) {
+    if (video.frame_number() == 0 && mt_mode_ >= 0) {
+      if (mt_mode_ == 1) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      } else if (mt_mode_ == 2) {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 1);
+      } else {
+        decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 0);
+        decoder->Control(VP9D_SET_ROW_MT, 0);
+      }
+    }
+  }
+#endif
+
   virtual void DecompressedFrameHook(const vpx_image_t &img,
                                      const unsigned int frame_number) {
     ASSERT_TRUE(md5_file_ != NULL);
@@ -77,6 +100,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 #if CONFIG_VP9_DECODER
   std::set<std::string> resize_clips_;
 #endif
+  int mt_mode_;
 
 private:
  FILE *md5_file_;
@@ -88,19 +112,20 @@ class TestVectorTest : public ::libvpx_test::DecoderTest,
 // the test failed.
 TEST_P(TestVectorTest, MD5Match) {
   const DecodeParam input = GET_PARAM(1);
-  const std::string filename = std::tr1::get<kFileName>(input);
+  const std::string filename = std::get<kFileName>(input);
   vpx_codec_flags_t flags = 0;
   vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
   char str[256];
 
-  cfg.threads = std::tr1::get<kThreads>(input);
-
-  snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d",
-           filename.c_str(), cfg.threads);
+  cfg.threads = std::get<kThreads>(input);
+  mt_mode_ = std::get<kMtMode>(input);
+  snprintf(str, sizeof(str) / sizeof(str[0]) - 1,
+           "file: %s threads: %d MT mode: %d", filename.c_str(), cfg.threads,
+           mt_mode_);
   SCOPED_TRACE(str);
 
   // Open compressed video file.
-  testing::internal::scoped_ptr<libvpx_test::CompressedVideoSource> video;
+  std::unique_ptr<libvpx_test::CompressedVideoSource> video;
   if (filename.substr(filename.length() - 3, 3) == "ivf") {
     video.reset(new libvpx_test::IVFVideoSource(filename));
   } else if (filename.substr(filename.length() - 4, 4) == "webm") {
@@ -131,7 +156,8 @@ TEST_P(TestVectorTest, MD5Match) {
 VP8_INSTANTIATE_TEST_CASE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT are not applicable
         ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                             libvpx_test::kVP8TestVectors +
                                 libvpx_test::kNumVP8TestVectors)));
@@ -144,6 +170,7 @@ INSTANTIATE_TEST_CASE_P(
         static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)),
     ::testing::Combine(
         ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+        ::testing::Values(-1),   // LPF opt and Row MT are not applicable
         ::testing::ValuesIn(libvpx_test::kVP8TestVectors,
                             libvpx_test::kVP8TestVectors +
                                 libvpx_test::kNumVP8TestVectors))));
@@ -154,7 +181,8 @@ INSTANTIATE_TEST_CASE_P(
 VP9_INSTANTIATE_TEST_CASE(
     TestVectorTest,
     ::testing::Combine(
-        ::testing::Values(1),  // Single thread.
+        ::testing::Values(1),   // Single thread.
+        ::testing::Values(-1),  // LPF opt and Row MT are not applicable
         ::testing::ValuesIn(libvpx_test::kVP9TestVectors,
                             libvpx_test::kVP9TestVectors +
                                 libvpx_test::kNumVP9TestVectors)));
@@ -166,6 +194,10 @@ INSTANTIATE_TEST_CASE_P(
         static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)),
     ::testing::Combine(
         ::testing::Range(2, 9),  // With 2 ~ 8 threads.
+ ::testing::Range(0, 3), // With multi threads modes 0 ~ 2 + // 0: LPF opt and Row MT disabled + // 1: LPF opt enabled + // 2: Row MT enabled ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors)))); diff --git a/libs/libvpx/test/test_vectors.h b/libs/libvpx/test/test_vectors.h index 3df3e811333..0a4be0f1a28 100644 --- a/libs/libvpx/test/test_vectors.h +++ b/libs/libvpx/test/test_vectors.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_TEST_VECTORS_H_ -#define TEST_TEST_VECTORS_H_ +#ifndef VPX_TEST_TEST_VECTORS_H_ +#define VPX_TEST_TEST_VECTORS_H_ #include "./vpx_config.h" @@ -31,4 +31,4 @@ extern const char *const kVP9TestVectorsResize[]; } // namespace libvpx_test -#endif // TEST_TEST_VECTORS_H_ +#endif // VPX_TEST_TEST_VECTORS_H_ diff --git a/libs/libvpx/test/tile_independence_test.cc b/libs/libvpx/test/tile_independence_test.cc index e24981c68d3..1d1020a9d3e 100644 --- a/libs/libvpx/test/tile_independence_test.cc +++ b/libs/libvpx/test/tile_independence_test.cc @@ -48,7 +48,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); } } diff --git a/libs/libvpx/test/timestamp_test.cc b/libs/libvpx/test/timestamp_test.cc new file mode 100644 index 00000000000..20240fb77dd --- /dev/null +++ b/libs/libvpx/test/timestamp_test.cc @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/util.h" +#include "test/video_source.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +namespace { + +const int kVideoSourceWidth = 320; +const int kVideoSourceHeight = 240; +const int kFramesToEncode = 3; + +// A video source that exposes functions to set the timebase, framerate and +// starting pts. +class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource { + public: + // Parameters num and den set the timebase for the video source. + DummyTimebaseVideoSource(int num, int den) + : timebase_({ num, den }), framerate_numerator_(30), + framerate_denominator_(1), starting_pts_(0) { + SetSize(kVideoSourceWidth, kVideoSourceHeight); + set_limit(kFramesToEncode); + } + + void SetFramerate(int numerator, int denominator) { + framerate_numerator_ = numerator; + framerate_denominator_ = denominator; + } + + // Returns one frame's duration in timebase units as a double.
+ double FrameDuration() const { + return (static_cast(timebase_.den) / timebase_.num) / + (static_cast(framerate_numerator_) / framerate_denominator_); + } + + virtual vpx_codec_pts_t pts() const { + return static_cast(frame_ * FrameDuration() + + starting_pts_ + 0.5); + } + + virtual unsigned long duration() const { + return static_cast(FrameDuration() + 0.5); + } + + virtual vpx_rational_t timebase() const { return timebase_; } + + void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } + + private: + vpx_rational_t timebase_; + int framerate_numerator_; + int framerate_denominator_; + int64_t starting_pts_; +}; + +class TimestampTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWithParam { + protected: + TimestampTest() : EncoderTest(GET_PARAM(0)) {} + virtual ~TimestampTest() {} + + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + } +}; + +class TimestampTestVp9Only : public TimestampTest {}; + +// Tests encoding in millisecond timebase. +TEST_P(TimestampTest, EncodeFrames) { + DummyTimebaseVideoSource video(1, 1000); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// TODO(fgalligan): Enable test when +// https://bugs.chromium.org/p/webm/issues/detail?id=1614 is fixed. +TEST_P(TimestampTest, DISABLED_TestMicrosecondTimebase) { + // Set the timebase to microseconds. + DummyTimebaseVideoSource video(1, 1000000); + video.set_limit(1); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +// TODO(webm:701): Enable VP8 test when the overflow issue in +// TestVpxRollover is fixed. +TEST_P(TimestampTestVp9Only, TestVpxRollover) { + DummyTimebaseVideoSource video(1, 1000); + video.set_starting_pts(922337170351ll); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + +VP8_INSTANTIATE_TEST_CASE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_CASE(TimestampTest, + ::testing::Values(::libvpx_test::kTwoPassGood)); +VP9_INSTANTIATE_TEST_CASE(TimestampTestVp9Only, + ::testing::Values(::libvpx_test::kTwoPassGood)); +} // namespace diff --git a/libs/libvpx/test/tools_common.sh b/libs/libvpx/test/tools_common.sh index 0bdcc08d787..844a12534de 100755 --- a/libs/libvpx/test/tools_common.sh +++ b/libs/libvpx/test/tools_common.sh @@ -150,7 +150,7 @@ is_windows_target() { # empty string. Caller is responsible for testing the string once the function # returns. vpx_tool_path() { - local readonly tool_name="$1" + local tool_name="$1" local tool_path="${LIBVPX_BIN_PATH}/${tool_name}${VPX_TEST_EXE_SUFFIX}" if [ ! -x "${tool_path}" ]; then # Try one directory up: when running via examples.sh the tool could be in @@ -404,12 +404,16 @@ VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm" VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm" VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm" +VP9_RAW_FILE="${LIBVPX_TEST_DATA_PATH}/crbug-1539.rawfile" + YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" YUV_RAW_INPUT_WIDTH=352 YUV_RAW_INPUT_HEIGHT=288 Y4M_NOSQ_PAR_INPUT="${LIBVPX_TEST_DATA_PATH}/park_joy_90p_8_420_a10-1.y4m" Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" +Y4M_720P_INPUT_WIDTH=1280 +Y4M_720P_INPUT_HEIGHT=720 # Setup a trap function to clean up after tests complete. 
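To make the arithmetic in DummyTimebaseVideoSource concrete, here is a minimal standalone sketch of the same pts computation, assuming the millisecond (1/1000) timebase used by EncodeFrames and the default 30 fps; the loop bound is illustrative, not taken from the test:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Timebase of 1/1000 at 30 fps: one frame lasts
  // (1000/1) / (30/1) = 33.33... timebase units.
  const double timebase_den = 1000.0, timebase_num = 1.0;
  const double fps_num = 30.0, fps_den = 1.0;
  const double frame_duration =
      (timebase_den / timebase_num) / (fps_num / fps_den);
  const int64_t starting_pts = 0;
  for (int frame = 0; frame < 3; ++frame) {
    // Same rounding as DummyTimebaseVideoSource::pts(): scale, then add 0.5.
    const int64_t pts =
        static_cast<int64_t>(frame * frame_duration + starting_pts + 0.5);
    std::printf("frame %d -> pts %lld\n", frame, static_cast<long long>(pts));
  }
  return 0;  // prints pts 0, 33, 67
}
```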
trap cleanup EXIT diff --git a/libs/libvpx/test/user_priv_test.cc b/libs/libvpx/test/user_priv_test.cc index 4b5de094e95..7bea76b0a94 100644 --- a/libs/libvpx/test/user_priv_test.cc +++ b/libs/libvpx/test/user_priv_test.cc @@ -27,8 +27,8 @@ namespace { -using std::string; using libvpx_test::ACMRandom; +using std::string; #if CONFIG_WEBM_IO @@ -73,7 +73,7 @@ string DecodeFile(const string &filename) { CheckUserPrivateData(img->user_priv, &frame_num); // Also test ctrl_get_reference api. - struct vp9_ref_frame ref; + struct vp9_ref_frame ref = vp9_ref_frame(); // Randomly fetch a reference frame. ref.idx = rnd.Rand8() % 3; decoder.Control(VP9_GET_REFERENCE, &ref); diff --git a/libs/libvpx/test/util.h b/libs/libvpx/test/util.h index 1f2540ecf22..985f487094f 100644 --- a/libs/libvpx/test/util.h +++ b/libs/libvpx/test/util.h @@ -8,16 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_UTIL_H_ -#define TEST_UTIL_H_ +#ifndef VPX_TEST_UTIL_H_ +#define VPX_TEST_UTIL_H_ #include #include +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_image.h" // Macros -#define GET_PARAM(k) std::tr1::get(GetParam()) +#define GET_PARAM(k) std::get(GetParam()) inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) { assert((img1->fmt == img2->fmt) && (img1->d_w == img2->d_w) && @@ -43,4 +45,4 @@ inline double compute_psnr(const vpx_image_t *img1, const vpx_image_t *img2) { return psnr; } -#endif // TEST_UTIL_H_ +#endif // VPX_TEST_UTIL_H_ diff --git a/libs/libvpx/test/variance_test.cc b/libs/libvpx/test/variance_test.cc index 421024ad889..e9fa03c680a 100644 --- a/libs/libvpx/test/variance_test.cc +++ b/libs/libvpx/test/variance_test.cc @@ -20,24 +20,13 @@ #include "test/register_state_check.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" namespace { -typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse, - const uint8_t *second_pred); typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); @@ -572,15 +561,16 @@ class SubpelVarianceTest if (!use_high_bit_depth()) { src_ = reinterpret_cast(vpx_memalign(16, block_size())); sec_ = reinterpret_cast(vpx_memalign(16, block_size())); - ref_ = new uint8_t[block_size() + width() + height() + 1]; + ref_ = reinterpret_cast( + vpx_malloc(block_size() + width() + height() + 1)); #if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast( vpx_memalign(16, block_size() * sizeof(uint16_t)))); sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast( vpx_memalign(16, block_size() * sizeof(uint16_t)))); - ref_ = CONVERT_TO_BYTEPTR( - new uint16_t[block_size() + width() + height() + 1]); + ref_ = CONVERT_TO_BYTEPTR(reinterpret_cast(vpx_malloc( + (block_size() + width() + height() + 1) * sizeof(uint16_t)))); #endif // CONFIG_VP9_HIGHBITDEPTH } ASSERT_TRUE(src_ != NULL); @@ -591,12 +581,12 @@ class SubpelVarianceTest virtual void TearDown() { if 
(!use_high_bit_depth()) { vpx_free(src_); - delete[] ref_; vpx_free(sec_); + vpx_free(ref_); #if CONFIG_VP9_HIGHBITDEPTH } else { vpx_free(CONVERT_TO_SHORTPTR(src_)); - delete[] CONVERT_TO_SHORTPTR(ref_); + vpx_free(CONVERT_TO_SHORTPTR(ref_)); vpx_free(CONVERT_TO_SHORTPTR(sec_)); #endif // CONFIG_VP9_HIGHBITDEPTH } @@ -692,7 +682,7 @@ void SubpelVarianceTest::ExtremeRefTest() { } template <> -void SubpelVarianceTest::RefTest() { +void SubpelVarianceTest::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { if (!use_high_bit_depth()) { @@ -728,10 +718,10 @@ void SubpelVarianceTest::RefTest() { } typedef MainTestClass VpxSseTest; -typedef MainTestClass VpxMseTest; -typedef MainTestClass VpxVarianceTest; -typedef SubpelVarianceTest VpxSubpelVarianceTest; -typedef SubpelVarianceTest VpxSubpelAvgVarianceTest; +typedef MainTestClass VpxMseTest; +typedef MainTestClass VpxVarianceTest; +typedef SubpelVarianceTest VpxSubpelVarianceTest; +typedef SubpelVarianceTest VpxSubpelAvgVarianceTest; TEST_P(VpxSseTest, RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } @@ -756,14 +746,14 @@ INSTANTIATE_TEST_CASE_P(C, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_c))); -typedef TestParams MseParams; +typedef TestParams MseParams; INSTANTIATE_TEST_CASE_P(C, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), MseParams(4, 3, &vpx_mse16x8_c), MseParams(3, 4, &vpx_mse8x16_c), MseParams(3, 3, &vpx_mse8x8_c))); -typedef TestParams VarianceParams; +typedef TestParams VarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c), @@ -780,7 +770,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); -typedef TestParams SubpelVarianceParams; +typedef TestParams SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, ::testing::Values( @@ -798,7 +788,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); -typedef TestParams SubpelAvgVarianceParams; +typedef TestParams SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, ::testing::Values( @@ -817,10 +807,11 @@ INSTANTIATE_TEST_CASE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH -typedef MainTestClass VpxHBDMseTest; -typedef MainTestClass VpxHBDVarianceTest; -typedef SubpelVarianceTest VpxHBDSubpelVarianceTest; -typedef SubpelVarianceTest VpxHBDSubpelAvgVarianceTest; +typedef MainTestClass VpxHBDMseTest; +typedef MainTestClass VpxHBDVarianceTest; +typedef SubpelVarianceTest VpxHBDSubpelVarianceTest; +typedef SubpelVarianceTest + VpxHBDSubpelAvgVarianceTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } @@ -1384,15 +1375,19 @@ INSTANTIATE_TEST_CASE_P( #if HAVE_AVX2 INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest, - ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2))); + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_avx2), + MseParams(4, 3, &vpx_mse16x8_avx2))); INSTANTIATE_TEST_CASE_P( AVX2, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_avx2), VarianceParams(6, 5, &vpx_variance64x32_avx2), + VarianceParams(5, 6, &vpx_variance32x64_avx2), VarianceParams(5, 5, &vpx_variance32x32_avx2), VarianceParams(5, 4, &vpx_variance32x16_avx2), - VarianceParams(4, 4, &vpx_variance16x16_avx2))); + VarianceParams(4, 5, 
&vpx_variance16x32_avx2), + VarianceParams(4, 4, &vpx_variance16x16_avx2), + VarianceParams(4, 3, &vpx_variance16x8_avx2))); INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelVarianceTest, @@ -1539,6 +1534,27 @@ INSTANTIATE_TEST_CASE_P(VSX, SumOfSquaresTest, INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_vsx))); +INSTANTIATE_TEST_CASE_P(VSX, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_vsx), + MseParams(4, 3, &vpx_mse16x8_vsx), + MseParams(3, 4, &vpx_mse8x16_vsx), + MseParams(3, 3, &vpx_mse8x8_vsx))); + +INSTANTIATE_TEST_CASE_P( + VSX, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_vsx), + VarianceParams(6, 5, &vpx_variance64x32_vsx), + VarianceParams(5, 6, &vpx_variance32x64_vsx), + VarianceParams(5, 5, &vpx_variance32x32_vsx), + VarianceParams(5, 4, &vpx_variance32x16_vsx), + VarianceParams(4, 5, &vpx_variance16x32_vsx), + VarianceParams(4, 4, &vpx_variance16x16_vsx), + VarianceParams(4, 3, &vpx_variance16x8_vsx), + VarianceParams(3, 4, &vpx_variance8x16_vsx), + VarianceParams(3, 3, &vpx_variance8x8_vsx), + VarianceParams(3, 2, &vpx_variance8x4_vsx), + VarianceParams(2, 3, &vpx_variance4x8_vsx), + VarianceParams(2, 2, &vpx_variance4x4_vsx))); #endif // HAVE_VSX #if HAVE_MMI diff --git a/libs/libvpx/test/video_source.h b/libs/libvpx/test/video_source.h index 54f692865b6..e9340f21e9e 100644 --- a/libs/libvpx/test/video_source.h +++ b/libs/libvpx/test/video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_VIDEO_SOURCE_H_ -#define TEST_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_VIDEO_SOURCE_H_ +#define VPX_TEST_VIDEO_SOURCE_H_ #if defined(_WIN32) #undef NOMINMAX @@ -255,4 +255,4 @@ class CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_VIDEO_SOURCE_H_ +#endif // VPX_TEST_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/test/vp8_datarate_test.cc b/libs/libvpx/test/vp8_datarate_test.cc new file mode 100644 index 00000000000..95a1157f6c2 --- /dev/null +++ b/libs/libvpx/test/vp8_datarate_test.cc @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" + +namespace { + +class DatarateTestLarge + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} + + virtual ~DatarateTestLarge() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(GET_PARAM(1)); + set_cpu_used_ = GET_PARAM(2); + ResetModel(); + } + + virtual void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + first_drop_ = 0; + bits_total_ = 0; + duration_ = 0.0; + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; + gf_boost_ = 0; + use_roi_ = false; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); + } + + if (use_roi_) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); + } + + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + // TODO(jimbankoski): Remove these lines when the issue: + // http://code.google.com/p/webm/issues/detail?id=496 is fixed. + // For now the codec assumes buffer starts at starting buffer rate + // plus one frame's time. + if (last_pts_ == 0) duration = 1; + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += static_cast( + duration * timebase_ * cfg_.rc_target_bitrate * 1000); + + /* Test the buffer model here before subtracting the frame. Do so because + * the way the leaky bucket model works in libvpx is to allow the buffer to + * empty - and then stop showing frames until we've got enough bits to + * show one. As noted in comment below (issue 495), this does not currently + * apply to key frames. For now exclude key frames in condition below. */ + const bool key_frame = + (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; + if (!key_frame) { + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + + const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; + + // Subtract from the buffer the bits associated with a played back frame. + bits_in_buffer_model_ -= frame_size_in_bits; + + // Update the running total of bits for end of test datarate checks. + bits_total_ += frame_size_in_bits; + + // If first drop not set and we have a drop set it to this time. + if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; + + // Update the most recent pts. 
+ last_pts_ = pkt->data.frame.pts; + + // We update this so that we can calculate the datarate minus the last + // frame encoded in the file. + bits_in_last_frame_ = frame_size_in_bits; + + ++frame_number_; + } + + virtual void EndPassHook(void) { + if (bits_total_) { + const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit + + duration_ = (last_pts_ + 1) * timebase_; + + // Effective file datarate includes the time spent prebuffering. + effective_datarate_ = (bits_total_ - bits_in_last_frame_) / 1000.0 / + (cfg_.rc_buf_initial_sz / 1000.0 + duration_); + + file_datarate_ = file_size_in_kb / duration_; + } + } + + virtual void DenoiserLevelsTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + for (int j = 1; j < 5; ++j) { + // Run over the denoiser levels. + // For the temporal denoiser (#if CONFIG_TEMPORAL_DENOISING) the level j + // refers to the 4 denoiser modes: denoiserYonly, denoiserOnYUV, + // denoiserOnAggressive, and denoiserOnAdaptive. + denoiser_on_ = j; + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + } + + virtual void DenoiserOffOnTest() { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 299); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. + denoiser_offon_test_ = 1; + denoiser_offon_period_ = 100; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + + virtual void BasicBufferModelTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // 2 pass cbr datarate control has a bug hidden by the small # of + // frames selected in this encode. The problem is that even if the buffer is + // negative we produce a keyframe on a cutscene, ignoring datarate + // constraints. + // TODO(jimbankoski): Fix when issue + // http://code.google.com/p/webm/issues/detail?id=495 is addressed. + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + + // There is an issue for low bitrates in real-time mode, where the + // effective_datarate slightly overshoots the target bitrate. + // This is the same issue as noted above (#495). + // TODO(jimbankoski/marpan): Update test to run for lower bitrates (< 100), + // when the issue is resolved.
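As a side note on the leaky-bucket accounting in FramePktHook above, the following is a simplified standalone sketch; it omits the key-frame exemption, and the frame sizes, timebase, and bitrate are invented for illustration:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  // Buffer primed as in ResetModel(): rc_buf_initial_sz (ms) times the
  // target bitrate (kbps) gives bits, e.g. 500 * 300 = 150000 bits.
  const int target_bitrate_kbps = 300;
  const double timebase = 1.0 / 30.0;  // seconds per pts tick
  int64_t bits_in_buffer = 500 * target_bitrate_kbps;
  const int64_t frame_bits[] = { 80000, 9000, 9500, 8800 };  // made-up sizes
  int64_t last_pts = 0;
  for (int i = 0; i < 4; ++i) {
    const int64_t pts = i;
    // The first packet gets a duration of one tick, as in FramePktHook.
    const int64_t duration = (i == 0) ? 1 : pts - last_pts;
    // Credit the bits a constant-bitrate link delivers over `duration`.
    bits_in_buffer += static_cast<int64_t>(duration * timebase *
                                           target_bitrate_kbps * 1000);
    assert(bits_in_buffer >= 0 && "buffer underrun");
    bits_in_buffer -= frame_bits[i];  // drain one played-back frame
    last_pts = pts;
  }
  std::printf("final buffer level: %lld bits\n",
              static_cast<long long>(bits_in_buffer));
  return 0;
}
```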
+ for (int i = 100; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + } + + virtual void ChangingDropFrameThreshTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_max_quantizer = 36; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.kf_mode = VPX_KF_DISABLED; + + const int frame_count = 40; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, frame_count); + + // Here we check that the first dropped frame gets earlier and earlier + // as the drop frame threshold is increased. + + const int kDropFrameThreshTestStep = 30; + vpx_codec_pts_t last_drop = frame_count; + for (int i = 1; i < 91; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + } + } + + virtual void DropFramesMultiThreadsTest() { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, + 288, 30, 1, 0, 140); + cfg_.rc_target_bitrate = 200; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + } + + vpx_codec_pts_t last_pts_; + int64_t bits_in_buffer_model_; + double timebase_; + int frame_number_; + vpx_codec_pts_t first_drop_; + int64_t bits_total_; + double duration_; + double file_datarate_; + double effective_datarate_; + int64_t bits_in_last_frame_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + int set_cpu_used_; + int gf_boost_; + bool use_roi_; + vpx_roi_map_t roi_; +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestLarge, DenoiserLevels) { DenoiserLevelsTest(); } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestLarge, DenoiserOffOn) { DenoiserOffOnTest(); } +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestLarge, BasicBufferModel) { BasicBufferModelTest(); } + +TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { + ChangingDropFrameThreshTest(); +} + +TEST_P(DatarateTestLarge, DropFramesMultiThreads) { + DropFramesMultiThreadsTest(); +} + +class DatarateTestRealTime : public DatarateTestLarge { + public: + virtual ~DatarateTestRealTime() {} +}; + +#if CONFIG_TEMPORAL_DENOISING +// Check basic datarate targeting, for a single bitrate, but loop over the +// various denoiser settings. +TEST_P(DatarateTestRealTime, DenoiserLevels) { DenoiserLevelsTest(); } + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. 
+TEST_P(DatarateTestRealTime, DenoiserOffOn) { DenoiserOffOnTest(); } +#endif // CONFIG_TEMPORAL_DENOISING + +TEST_P(DatarateTestRealTime, BasicBufferModel) { BasicBufferModelTest(); } + +TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { + ChangingDropFrameThreshTest(); +} + +TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { + DropFramesMultiThreadsTest(); +} + +TEST_P(DatarateTestRealTime, RegionOfInterest) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 15) / 16; + roi_.cols = (cfg_.g_w + 15) / 16; + + roi_.delta_q[0] = 0; + roi_.delta_q[1] = -20; + roi_.delta_q[2] = 0; + roi_.delta_q[3] = 0; + + roi_.delta_lf[0] = 0; + roi_.delta_lf[1] = -20; + roi_.delta_lf[2] = 0; + roi_.delta_lf[3] = 0; + + roi_.static_threshold[0] = 0; + roi_.static_threshold[1] = 1000; + roi_.static_threshold[2] = 0; + roi_.static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +TEST_P(DatarateTestRealTime, GFBoost) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + // Apply a gf boost. + gf_boost_ = 50; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; +} + +VP8_INSTANTIATE_TEST_CASE(DatarateTestLarge, ALL_TEST_MODES, + ::testing::Values(0)); +VP8_INSTANTIATE_TEST_CASE(DatarateTestRealTime, + ::testing::Values(::libvpx_test::kRealTime), + ::testing::Values(-6, -12)); +} // namespace diff --git a/libs/libvpx/test/vp8_multi_resolution_encoder.sh b/libs/libvpx/test/vp8_multi_resolution_encoder.sh index a8b7fe78eea..bd45b5381f6 100755 --- a/libs/libvpx/test/vp8_multi_resolution_encoder.sh +++ b/libs/libvpx/test/vp8_multi_resolution_encoder.sh @@ -22,7 +22,7 @@ vp8_multi_resolution_encoder_verify_environment() { elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 fi - local readonly app="vp8_multi_resolution_encoder" + local app="vp8_multi_resolution_encoder" if [ -z "$(vpx_tool_path "${app}")" ]; then elog "${app} not found. It must exist in LIBVPX_BIN_PATH or its parent."
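The ROI map built in RegionOfInterest above reduces to a little index arithmetic. A standalone sketch of just that map construction, using the same 352x288 input and 16x16 macroblock geometry (the printf is illustrative):

```cpp
#include <cstdio>
#include <cstdlib>

int main() {
  // 352x288 input split into 16x16 macroblocks, as in the VP8 test.
  const unsigned rows = (288 + 15) / 16;  // 18
  const unsigned cols = (352 + 15) / 16;  // 22
  unsigned char *map =
      static_cast<unsigned char *>(calloc(rows * cols, sizeof(*map)));
  if (map == NULL) return 1;
  // Segment 1 is the center square (it carries delta_q/delta_lf = -20 and a
  // raised static threshold in the test); segment 0 keeps the defaults.
  for (unsigned i = 0; i < rows; ++i)
    for (unsigned j = 0; j < cols; ++j)
      if (i > rows / 4 && i < (rows * 3) / 4 && j > cols / 4 &&
          j < (cols * 3) / 4)
        map[i * cols + j] = 1;
  std::printf("%u x %u blocks, center square boosted\n", rows, cols);
  free(map);
  return 0;
}
```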
return 1 @@ -33,7 +33,7 @@ vp8_multi_resolution_encoder_verify_environment() { # Runs vp8_multi_resolution_encoder. Simply forwards all arguments to # vp8_multi_resolution_encoder after building path to the executable. vp8_mre() { - local readonly encoder="$(vpx_tool_path vp8_multi_resolution_encoder)" + local encoder="$(vpx_tool_path vp8_multi_resolution_encoder)" if [ ! -x "${encoder}" ]; then elog "${encoder} does not exist or is not executable." return 1 @@ -43,22 +43,34 @@ vp8_mre() { } vp8_multi_resolution_encoder_three_formats() { - local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf - ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf - ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf + ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local layer_bitrates="150 80 50" + local keyframe_insert="200" + local temporal_layers="3 3 3" + local framerate="30" if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then if [ "$(vp8_encode_available)" = "yes" ]; then # Param order: # Input width # Input height + # Framerate # Input file path # Output file names + # Layer bitrates + # Temporal layers + # Keyframe insert # Output PSNR vp8_mre "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" \ + "${framerate}" \ "${YUV_RAW_INPUT}" \ ${output_files} \ + ${layer_bitrates} \ + ${temporal_layers} \ + "${keyframe_insert}" \ 0 for output_file in ${output_files}; do diff --git a/libs/libvpx/test/vp9_arf_freq_test.cc b/libs/libvpx/test/vp9_arf_freq_test.cc index 48a4ca73926..9a3455b4aa1 100644 --- a/libs/libvpx/test/vp9_arf_freq_test.cc +++ b/libs/libvpx/test/vp9_arf_freq_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include + #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" @@ -190,7 +192,7 @@ TEST_P(ArfFreqTest, MinArfFreqTest) { init_flags_ = VPX_CODEC_USE_PSNR; if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH; - testing::internal::scoped_ptr video; + std::unique_ptr video; if (is_extension_y4m(test_video_param_.filename)) { video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0, kFrames)); diff --git a/libs/libvpx/test/vp9_block_error_test.cc b/libs/libvpx/test/vp9_block_error_test.cc index 0b4d1df9922..71a0686d7a5 100644 --- a/libs/libvpx/test/vp9_block_error_test.cc +++ b/libs/libvpx/test/vp9_block_error_test.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "third_party/googletest/src/include/gtest/gtest.h" @@ -35,7 +36,7 @@ typedef int64_t (*HBDBlockErrorFunc)(const tran_low_t *coeff, intptr_t block_size, int64_t *ssz, int bps); -typedef std::tr1::tuple +typedef std::tuple BlockErrorParam; typedef int64_t (*BlockErrorFunc)(const tran_low_t *coeff, @@ -168,7 +169,7 @@ TEST_P(BlockErrorTest, ExtremeValues) { << "First failed at test case " << first_failure; } -using std::tr1::make_tuple; +using std::make_tuple; #if HAVE_SSE2 const BlockErrorParam sse2_block_error_tests[] = { diff --git a/libs/libvpx/test/vp9_datarate_test.cc b/libs/libvpx/test/vp9_datarate_test.cc new file mode 100644 index 00000000000..b8be275eaf7 --- /dev/null +++ b/libs/libvpx/test/vp9_datarate_test.cc @@ -0,0 +1,901 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "./vpx_config.h" +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/util.h" +#include "test/y4m_video_source.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/bitops.h" + +namespace { + +class DatarateTestVP9 : public ::libvpx_test::EncoderTest { + public: + explicit DatarateTestVP9(const ::libvpx_test::CodecFactory *codec) + : EncoderTest(codec) { + tune_content_ = 0; + } + + protected: + virtual ~DatarateTestVP9() {} + + virtual void ResetModel() { + last_pts_ = 0; + bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; + frame_number_ = 0; + tot_frame_number_ = 0; + first_drop_ = 0; + num_drops_ = 0; + aq_mode_ = 3; + // Denoiser is off by default. + denoiser_on_ = 0; + // For testing up to 3 layers. + for (int i = 0; i < 3; ++i) { + bits_total_[i] = 0; + } + denoiser_offon_test_ = 0; + denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; + use_roi_ = false; + } + + // + // Frame flags and layer id for temporal layers. + // + + // For two layers, test pattern is: + // 1 3 + // 0 2 ..... + // For three layers, test pattern is: + // 1 3 5 7 + // 2 6 + // 0 4 .... + // LAST is always updated on base/layer 0, GOLDEN is updated on layer 1. + // For this 3 layer example, the 2nd enhancement layer (layer 2) updates + // the altref frame. + static int GetFrameFlags(int frame_num, int num_temp_layers) { + int frame_flags = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + // Layer 0: predict from L and ARF, update L. + frame_flags = + VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF; + } else { + // Layer 1: predict from L, G and ARF, and update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | + VP8_EFLAG_NO_UPD_ENTROPY; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + // Layer 0: predict from L and ARF; update L. + frame_flags = + VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF; + } else if ((frame_num - 2) % 4 == 0) { + // Layer 1: predict from L, G, ARF; update G. + frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST; + } else if ((frame_num - 1) % 2 == 0) { + // Layer 2: predict from L, G, ARF; update ARF.
+ frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST; + } + } + return frame_flags; + } + + static int SetLayerId(int frame_num, int num_temp_layers) { + int layer_id = 0; + if (num_temp_layers == 2) { + if (frame_num % 2 == 0) { + layer_id = 0; + } else { + layer_id = 1; + } + } else if (num_temp_layers == 3) { + if (frame_num % 4 == 0) { + layer_id = 0; + } else if ((frame_num - 2) % 4 == 0) { + layer_id = 1; + } else if ((frame_num - 1) % 2 == 0) { + layer_id = 2; + } + } + return layer_id; + } + + virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) { + if (video->frame() == 0) { + encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); + encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); + encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_); + } + + if (denoiser_offon_test_) { + ASSERT_GT(denoiser_offon_period_, 0) + << "denoiser_offon_period_ is not positive."; + if ((video->frame() + 1) % denoiser_offon_period_ == 0) { + // Flip denoiser_on_ periodically + denoiser_on_ ^= 1; + } + } + + encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); + encoder->Control(VP9E_SET_TILE_COLUMNS, get_msb(cfg_.g_threads)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); + + if (use_roi_) { + encoder->Control(VP9E_SET_ROI_MAP, &roi_); + encoder->Control(VP9E_SET_AQ_MODE, 0); + } + + if (cfg_.ts_number_layers > 1) { + if (video->frame() == 0) { + encoder->Control(VP9E_SET_SVC, 1); + } + vpx_svc_layer_id_t layer_id; + layer_id.spatial_layer_id = 0; + frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id = + SetLayerId(video->frame(), cfg_.ts_number_layers); + layer_id.temporal_layer_id_per_spatial[0] = + SetLayerId(video->frame(), cfg_.ts_number_layers); + encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); + } + const vpx_rational_t tb = video->timebase(); + timebase_ = static_cast(tb.num) / tb.den; + duration_ = 0; + } + + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + // Time since last timestamp = duration. + vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; + + if (duration > 1) { + // If first drop not set and we have a drop set it to this time. + if (!first_drop_) first_drop_ = last_pts_ + 1; + // Update the number of frame drops. + num_drops_ += static_cast(duration - 1); + // Update counter for total number of frames (#frames input to encoder). + // Needed for setting the proper layer_id below. + tot_frame_number_ += static_cast(duration - 1); + } + + int layer = SetLayerId(tot_frame_number_, cfg_.ts_number_layers); + + // Add to the buffer the bits we'd expect from a constant bitrate server. + bits_in_buffer_model_ += static_cast( + duration * timebase_ * cfg_.rc_target_bitrate * 1000); + + // Buffer should not go negative. + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + + const size_t frame_size_in_bits = pkt->data.frame.sz * 8; + + // Update the total encoded bits. For temporal layers, update the cumulative + // encoded bits per layer. + for (int i = layer; i < static_cast(cfg_.ts_number_layers); ++i) { + bits_total_[i] += frame_size_in_bits; + } + + // Update the most recent pts. 
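The SetLayerId() mapping above is easiest to sanity-check in isolation. Here is a standalone copy of the same logic, printing the 3-layer pattern for the first 8 frames; the expected output is 0 2 1 2 0 2 1 2, matching the diagram in the comments:

```cpp
#include <cstdio>

// Same frame-number-to-temporal-layer mapping as SetLayerId() above.
static int SetLayerId(int frame_num, int num_temp_layers) {
  if (num_temp_layers == 2) return (frame_num % 2 == 0) ? 0 : 1;
  if (num_temp_layers == 3) {
    if (frame_num % 4 == 0) return 0;       // base layer: frames 0, 4, ...
    if ((frame_num - 2) % 4 == 0) return 1; // mid layer: frames 2, 6, ...
    if ((frame_num - 1) % 2 == 0) return 2; // top layer: odd frames
  }
  return 0;
}

int main() {
  for (int f = 0; f < 8; ++f) std::printf("%d ", SetLayerId(f, 3));
  std::printf("\n");  // prints: 0 2 1 2 0 2 1 2
  return 0;
}
```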
+ last_pts_ = pkt->data.frame.pts; + ++frame_number_; + ++tot_frame_number_; + } + + virtual void EndPassHook(void) { + for (int layer = 0; layer < static_cast(cfg_.ts_number_layers); + ++layer) { + duration_ = (last_pts_ + 1) * timebase_; + if (bits_total_[layer]) { + // Effective file datarate: + effective_datarate_[layer] = (bits_total_[layer] / 1000.0) / duration_; + } + } + } + + vpx_codec_pts_t last_pts_; + double timebase_; + int tune_content_; + int frame_number_; // Counter for number of non-dropped/encoded frames. + int tot_frame_number_; // Counter for total number of input frames. + int64_t bits_total_[3]; + double duration_; + double effective_datarate_[3]; + int set_cpu_used_; + int64_t bits_in_buffer_model_; + vpx_codec_pts_t first_drop_; + int num_drops_; + int aq_mode_; + int denoiser_on_; + int denoiser_offon_test_; + int denoiser_offon_period_; + int frame_parallel_decoding_mode_; + bool use_roi_; + vpx_roi_map_t roi_; +}; + +// Params: test mode, speed setting and index for bitrate array. +class DatarateTestVP9RealTimeMultiBR + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Params: speed setting and index for bitrate array. +class DatarateTestVP9LargeVBR + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWith2Params { + public: + DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for VBR mode with 0 lag. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagZero) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.36) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for VBR mode with non-zero lag. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZero) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. 
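The per-layer numbers these tests check come from EndPassHook above: kilobits of payload divided by elapsed seconds. A minimal sketch with invented values:

```cpp
#include <cstdio>

int main() {
  // Values are illustrative, not from any test run.
  const long long bits_total = 1200000;  // bits encoded in the layer
  const long long last_pts = 299;        // last timestamp, in ticks
  const double timebase = 1.0 / 30.0;    // seconds per tick
  const double duration = (last_pts + 1) * timebase;  // 10 seconds
  // Same formula as EndPassHook: (bits_total_[layer] / 1000.0) / duration_.
  const double effective_kbps = (bits_total / 1000.0) / duration;
  std::printf("effective datarate: %.1f kbps\n", effective_kbps);  // 120.0
  return 0;
}
```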
+ if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9LargeVBR, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. + if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + const int bitrates[2] = { 400, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.35) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR mode. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 150, 350, 550, 750 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off( and error_resilience off). 
+TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 150, 350, 550, 750 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for CBR. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting444) { + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + + cfg_.g_profile = 1; + cfg_.g_timebase = video.timebase(); + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + const int bitrates[4] = { 250, 450, 650, 850 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(static_cast(cfg_.rc_target_bitrate), + effective_datarate_[0] * 0.80) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(static_cast(cfg_.rc_target_bitrate), + effective_datarate_[0] * 1.15) + << " The datarate for the file missed the target!" + << cfg_.rc_target_bitrate << " " << effective_datarate_; +} + +// Check that (1) the first dropped frame gets earlier and earlier +// as the drop frame threshold is increased, and (2) that the total number of +// frame drops does not decrease as we increase frame drop threshold. +// Use a lower qp-max to force some frame drops. +TEST_P(DatarateTestVP9RealTimeMultiBR, ChangingDropFrameThresh) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_undershoot_pct = 20; + cfg_.rc_dropframe_thresh = 10; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 50; + cfg_.rc_end_usage = VPX_CBR; + cfg_.rc_target_bitrate = 200; + cfg_.g_lag_in_frames = 0; + // TODO(marpan): Investigate datarate target failures with a smaller keyframe + // interval (128). 
+ cfg_.kf_max_dist = 9999; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + const int kDropFrameThreshTestStep = 30; + const int bitrates[2] = { 50, 150 }; + const int bitrate_index = GET_PARAM(2); + if (bitrate_index > 1) return; + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + vpx_codec_pts_t last_drop = 140; + int last_num_drops = 0; + for (int i = 10; i < 100; i += kDropFrameThreshTestStep) { + cfg_.rc_dropframe_thresh = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + << " The datarate for the file is greater than target by too much!"; + ASSERT_LE(first_drop_, last_drop) + << " The first dropped frame for drop_thresh " << i + << " > first dropped frame for drop_thresh " + << i - kDropFrameThreshTestStep; + ASSERT_GE(num_drops_, last_num_drops * 0.85) + << " The number of dropped frames for drop_thresh " << i + << " < number of dropped frames for drop_thresh " + << i - kDropFrameThreshTestStep; + last_drop = first_drop_; + last_num_drops = num_drops_; + } +} + +// Check basic rate targeting for 2 temporal layers. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting2TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 2 Temporal layers, no spatial layers: Framerate decimation (2, 1). + cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 2; + cfg_.ts_rate_decimator[0] = 2; + cfg_.ts_rate_decimator[1] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 200, 400, 600, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + // 60-40 bitrate allocation for 2 temporal layers. + cfg_.layer_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast<int>(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.15) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } +} + +// Check basic rate targeting for 3 temporal layers. +TEST_P(DatarateTestVP9RealTimeMultiBR, BasicRateTargeting3TemporalLayers) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
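ChangingDropFrameThresh relies on the drop inference done in FramePktHook: a pts gap wider than one tick means the encoder consumed input frames without emitting packets. A simplified standalone sketch of that inference (it ignores the tot_frame_number_ bookkeeping; the timestamps are invented):

```cpp
#include <cstdio>

int main() {
  const long long pts_list[] = { 0, 1, 2, 5, 6 };  // made-up packet pts
  long long last_pts = 0;
  int num_drops = 0, first_drop = 0;
  for (int i = 1; i < 5; ++i) {
    const long long duration = pts_list[i] - last_pts;
    if (duration > 1) {
      // Same rule as the test: record the first gap, count missing ticks.
      if (!first_drop) first_drop = static_cast<int>(last_pts + 1);
      num_drops += static_cast<int>(duration - 1);  // frames 3 and 4 dropped
    }
    last_pts = pts_list[i];
  }
  std::printf("first drop at pts %d, %d drops total\n", first_drop, num_drops);
  return 0;  // prints: first drop at pts 3, 2 drops total
}
```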
+ cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + const int bitrates[4] = { 200, 400, 600, 800 }; + const int bitrate_index = GET_PARAM(2); + cfg_.rc_target_bitrate = bitrates[bitrate_index]; + ResetModel(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + // TODO(yaowu): Work out more stable rc control strategy and + // Adjust the thresholds to be tighter than .75. + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.75) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + // TODO(yaowu): Work out more stable rc control strategy and + // Adjust the thresholds to be tighter than 1.25. + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.25) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + } +} + +// Params: speed setting. +class DatarateTestVP9RealTime : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} + virtual ~DatarateTestVP9RealTime() {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. +TEST_P(DatarateTestVP9RealTime, BasicRateTargetingDropFramesMultiThreads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 200; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic rate targeting for 3 temporal layers, with frame dropping. +// Only for one (low) bitrate with lower max_quantizer, and somewhat higher +// frame drop threshold, to force frame dropping. +TEST_P(DatarateTestVP9RealTime, + BasicRateTargeting3TemporalLayersFrameDropping) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + // Set frame drop threshold and rc_max_quantizer to force some frame drops. + cfg_.rc_dropframe_thresh = 20; + cfg_.rc_max_quantizer = 45; + cfg_.rc_min_quantizer = 0; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1). 
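Note that layer_target_bitrate[] in these tests is cumulative rather than per-layer. A small sketch of the "40-20-40" split used by the 3-layer tests, for an illustrative 600 kbps target:

```cpp
#include <cstdio>

int main() {
  const int target = 600;  // rc_target_bitrate in kbps (illustrative)
  int layer_target[3];
  layer_target[0] = 40 * target / 100;  // 240 kbps: base layer alone
  layer_target[1] = 60 * target / 100;  // 360 kbps: base + mid layer
  layer_target[2] = target;             // 600 kbps: all three layers
  for (int i = 0; i < 3; ++i)
    std::printf("layers 0..%d -> %d kbps\n", i, layer_target[i]);
  return 0;
}
```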
+ cfg_.ss_number_layers = 1; + cfg_.ts_number_layers = 3; + cfg_.ts_rate_decimator[0] = 4; + cfg_.ts_rate_decimator[1] = 2; + cfg_.ts_rate_decimator[2] = 1; + + cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + cfg_.rc_target_bitrate = 200; + ResetModel(); + // 40-20-40 bitrate allocation for 3 temporal layers. + cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100; + cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate; + aq_mode_ = 0; + if (deadline_ == VPX_DL_REALTIME) { + aq_mode_ = 3; + cfg_.g_error_resilient = 1; + } + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + for (int j = 0; j < static_cast(cfg_.ts_number_layers); ++j) { + ASSERT_GE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 0.85) + << " The datarate for the file is lower than target by too much, " + "for layer: " + << j; + ASSERT_LE(effective_datarate_[j], cfg_.layer_target_bitrate[j] * 1.20) + << " The datarate for the file is greater than target by too much, " + "for layer: " + << j; + // Expect some frame drops in this test: for this 200 frames test, + // expect at least 10% and not more than 60% drops. + ASSERT_GE(num_drops_, 20); + ASSERT_LE(num_drops_, 280); + } +} + +// Check VP9 region of interest feature. +TEST_P(DatarateTestVP9RealTime, RegionOfInterest) { + if (deadline_ != VPX_DL_REALTIME || set_cpu_used_ < 5) return; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 640; + cfg_.g_h = 480; + + ResetModel(); + + // Set ROI parameters + use_roi_ = true; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 7) / 8; + roi_.cols = (cfg_.g_w + 7) / 8; + + roi_.delta_q[1] = -20; + roi_.delta_lf[1] = -20; + memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame)); + roi_.ref_frame[1] = 1; + + // Use 2 states: 1 is center square, 0 is the rest. + roi_.roi_map = reinterpret_cast( + calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map))); + ASSERT_TRUE(roi_.roi_map != NULL); + + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_[0] * 0.90) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_[0] * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} + +// Params: test mode, speed setting and index for bitrate array. +class DatarateTestVP9PostEncodeDrop + : public DatarateTestVP9, + public ::libvpx_test::CodecTestWithParam { + public: + DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} + + protected: + virtual void SetUp() { + InitializeConfig(); + SetMode(::libvpx_test::kRealTime); + set_cpu_used_ = GET_PARAM(1); + ResetModel(); + } +}; + +// Check basic rate targeting for CBR mode, with 2 threads and dropped frames. 
+TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 30; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + // Encode using multiple threads. + cfg_.g_threads = 2; + cfg_.g_error_resilient = 0; + tune_content_ = 1; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 300; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +// Params: speed setting. +class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { + public: + virtual ~DatarateTestVP9RealTimeDenoiser() {} +}; + +// Check basic datarate targeting, for a single bitrate, when denoiser is on. +TEST_P(DatarateTestVP9RealTimeDenoiser, LowNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 400; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for clip with high noise level. Use 2 threads. +TEST_P(DatarateTestVP9RealTimeDenoiser, HighNoise) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 2; + + ::libvpx_test::Y4mVideoSource video("noisy_clip_640_360.y4m", 0, 200); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: kDenoiserOnYOnly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is on, +// for 1280x720 clip with 4 threads. 
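+// Illustrative aside (hook body assumed; only the control id and the
+// denoiser_on_ flag come from this patch): the denoiser tests below drive
+// the encoder roughly like
+//   if (video->frame() == 0)
+//     encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+// from a PreEncodeFrameHook, with denoiser_on_ toggled per test.
+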
+TEST_P(DatarateTestVP9RealTimeDenoiser, 4threads) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_threads = 4; + + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 1000; + ResetModel(); + // Turn on the denoiser. + denoiser_on_ = 1; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.29) + << " The datarate for the file is greater than target by too much!"; +} + +// Check basic datarate targeting, for a single bitrate, when denoiser is off +// and on. +TEST_P(DatarateTestVP9RealTimeDenoiser, DenoiserOffOn) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); + + // For the temporal denoiser (#if CONFIG_VP9_TEMPORAL_DENOISING), + // there is only one denoiser mode: denoiserYonly(which is 1), + // but may add more modes in the future. + cfg_.rc_target_bitrate = 400; + ResetModel(); + // The denoiser is off by default. + denoiser_on_ = 0; + // Set the offon test flag. 
+  denoiser_offon_test_ = 1;
+  denoiser_offon_period_ = 100;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+      << " The datarate for the file is lower than target by too much!";
+  ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
+      << " The datarate for the file is greater than target by too much!";
+}
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeMultiBR,
+                          ::testing::Range(5, 10), ::testing::Range(0, 4));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9),
+                          ::testing::Range(0, 2));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTime, ::testing::Range(5, 10));
+
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9PostEncodeDrop,
+                          ::testing::Range(5, 6));
+
+#if CONFIG_VP9_TEMPORAL_DENOISING
+VP9_INSTANTIATE_TEST_CASE(DatarateTestVP9RealTimeDenoiser,
+                          ::testing::Range(5, 10));
+#endif
+}  // namespace
diff --git a/libs/libvpx/test/vp9_denoiser_test.cc b/libs/libvpx/test/vp9_denoiser_test.cc
index 56ca257c595..47fa587fca7 100644
--- a/libs/libvpx/test/vp9_denoiser_test.cc
+++ b/libs/libvpx/test/vp9_denoiser_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 #include "test/acm_random.h"
@@ -35,7 +36,7 @@ typedef int (*Vp9DenoiserFilterFunc)(const uint8_t *sig, int sig_stride,
                                      uint8_t *avg, int avg_stride,
                                      int increase_denoising, BLOCK_SIZE bs,
                                      int motion_magnitude);
-typedef std::tr1::tuple<Vp9DenoiserFilterFunc, BLOCK_SIZE> VP9DenoiserTestParam;
+typedef std::tuple<Vp9DenoiserFilterFunc, BLOCK_SIZE> VP9DenoiserTestParam;
 
 class VP9DenoiserTest
     : public ::testing::Test,
@@ -99,7 +100,7 @@ TEST_P(VP9DenoiserTest, BitexactCheck) {
   }
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 // Test for all block size.
 #if HAVE_SSE2
diff --git a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index 62e8dcb9b52..fade08bbd47 100644
--- a/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/libs/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -8,6 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
@@ -74,7 +76,7 @@ class VpxEncoderParmsGetToDecoder
 
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
       encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs);
       encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range);
       encoder->Control(VP9E_SET_LOSSLESS, encode_parms.lossless);
@@ -138,7 +140,7 @@ class VpxEncoderParmsGetToDecoder
 TEST_P(VpxEncoderParmsGetToDecoder, BitstreamParms) {
   init_flags_ = VPX_CODEC_USE_PSNR;
 
-  testing::internal::scoped_ptr<libvpx_test::Y4mVideoSource> video(
+  std::unique_ptr<libvpx_test::Y4mVideoSource> video(
       new libvpx_test::Y4mVideoSource(test_video_.name, 0, test_video_.frames));
   ASSERT_TRUE(video.get() != NULL);
 
diff --git a/libs/libvpx/test/vp9_end_to_end_test.cc b/libs/libvpx/test/vp9_end_to_end_test.cc
index 955f567ce24..7cb716f2262 100644
--- a/libs/libvpx/test/vp9_end_to_end_test.cc
+++ b/libs/libvpx/test/vp9_end_to_end_test.cc
@@ -8,10 +8,13 @@
  * be found in the AUTHORS file in the root of the source tree.
 */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
 #include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
 #include "test/yuv_video_source.h"
@@ -21,14 +24,14 @@ namespace {
 const unsigned int kWidth = 160;
 const unsigned int kHeight = 90;
 const unsigned int kFramerate = 50;
-const unsigned int kFrames = 10;
+const unsigned int kFrames = 20;
 const int kBitrate = 500;
 // List of psnr thresholds for speed settings 0-7 and 5 encoding modes
 const double kPsnrThreshold[][5] = {
   { 36.0, 37.0, 37.0, 37.0, 37.0 }, { 35.0, 36.0, 36.0, 36.0, 36.0 },
   { 34.0, 35.0, 35.0, 35.0, 35.0 }, { 33.0, 34.0, 34.0, 34.0, 34.0 },
-  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 31.0, 32.0, 32.0, 32.0, 32.0 },
-  { 30.0, 31.0, 31.0, 31.0, 31.0 }, { 29.0, 30.0, 30.0, 30.0, 30.0 },
+  { 32.0, 33.0, 33.0, 33.0, 33.0 }, { 28.0, 32.0, 32.0, 32.0, 32.0 },
+  { 28.5, 31.0, 31.0, 31.0, 31.0 }, { 27.5, 30.0, 30.0, 30.0, 30.0 },
 };
 
 typedef struct {
@@ -45,13 +48,13 @@ const TestVideoParam kTestVectors[] = {
   { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, VPX_BITS_8, 1 },
   { "park_joy_90p_8_440.yuv", 8, VPX_IMG_FMT_I440, VPX_BITS_8, 1 },
 #if CONFIG_VP9_HIGHBITDEPTH
-  { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 },
-  { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 },
-  { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 },
+  { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, VPX_BITS_10, 2 },
+  { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, VPX_BITS_10, 3 },
+  { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, VPX_BITS_10, 3 },
   { "park_joy_90p_10_440.yuv", 10, VPX_IMG_FMT_I44016, VPX_BITS_10, 3 },
-  { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 },
-  { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 },
-  { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 },
+  { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, VPX_BITS_12, 2 },
+  { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, VPX_BITS_12, 3 },
+  { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, VPX_BITS_12, 3 },
   { "park_joy_90p_12_440.yuv", 12, VPX_IMG_FMT_I44016, VPX_BITS_12, 3 },
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 };
@@ -59,11 +62,11 @@ const TestVideoParam kTestVectors[] = {
 // Encoding modes tested
 const libvpx_test::TestMode kEncodingModeVectors[] = {
   ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };
 
 // Speed settings tested
-const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6 };
+const int kCpuUsedVectors[] = { 1, 2, 3, 5, 6, 7 };
 
 int is_extension_y4m(const char *filename) {
   const char *dot = strrchr(filename, '.');
@@ -74,6 +77,43 @@ int is_extension_y4m(const char *filename) {
   }
 }
 
+class EndToEndTestAdaptiveRDThresh
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, int> {
+ protected:
+  EndToEndTestAdaptiveRDThresh()
+      : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)),
+        cpu_used_end_(GET_PARAM(2)) {}
+
+  virtual ~EndToEndTestAdaptiveRDThresh() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 600;
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_);
+      encoder->Control(VP9E_SET_ROW_MT, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+    }
+    if (video->frame() == 100)
+      encoder->Control(VP8E_SET_CPUUSED, cpu_used_end_);
+  }
+
+ private:
+  int cpu_used_start_;
+  int cpu_used_end_;
+};
+
 class EndToEndTestLarge
     : public ::libvpx_test::EncoderTest,
      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode,
                                                 TestVideoParam, int> {
...
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
      encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
@@ -123,6 +166,9 @@ class EndToEndTestLarge
      encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
      encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
      encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+    } else {
+      encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_);
+      encoder->Control(VP9E_SET_AQ_MODE, cyclic_refresh_);
    }
   }
 }
@@ -138,6 +184,8 @@ class EndToEndTestLarge
 
  TestVideoParam test_video_param_;
  int cpu_used_;
+  int cyclic_refresh_;
+  int denoiser_on_;
 
 private:
  double psnr_;
@@ -145,6 +193,50 @@ class EndToEndTestLarge
  libvpx_test::TestMode encoding_mode_;
 };
 
+#if CONFIG_VP9_DECODER
+// The test parameters control VP9D_SET_LOOP_FILTER_OPT and the number of
+// decoder threads.
+class EndToEndTestLoopFilterThreading
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<bool, int> {
+ protected:
+  EndToEndTestLoopFilterThreading()
+      : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {}
+
+  virtual ~EndToEndTestLoopFilterThreading() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    cfg_.g_threads = 2;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.kf_min_dist = 1;
+    cfg_.kf_max_dist = 1;
+    dec_cfg_.threads = GET_PARAM(2);
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 8);
+    }
+    encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5);
+  }
+
+  virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Decoder *decoder) {
+    if (video->frame() == 0) {
+      decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 1 : 0);
+    }
+  }
+
+ private:
+  const bool use_loop_filter_opt_;
+};
+#endif  // CONFIG_VP9_DECODER
+
 TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
  cfg_.rc_target_bitrate = kBitrate;
  cfg_.g_error_resilient = 0;
@@ -154,7 +246,7 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
  init_flags_ = VPX_CODEC_USE_PSNR;
  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  std::unique_ptr<libvpx_test::VideoSource> video;
  if (is_extension_y4m(test_video_param_.filename)) {
    video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
                                                kFrames));
@@ -170,8 +262,63 @@ TEST_P(EndToEndTestLarge, EndtoEndPSNRTest) {
  EXPECT_GT(psnr, GetPsnrThreshold());
 }
 
+TEST_P(EndToEndTestLarge, EndtoEndPSNRDenoiserAQTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  cyclic_refresh_ = 3;
+  denoiser_on_ = 1;
+  if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  std::unique_ptr<libvpx_test::VideoSource> video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video.reset(new libvpx_test::Y4mVideoSource(test_video_param_.filename, 0,
+                                                kFrames));
+  } else {
+    video.reset(new libvpx_test::YUVVideoSource(
+        test_video_param_.filename, test_video_param_.fmt, kWidth, kHeight,
+        kFramerate, 1, 0, kFrames));
+  }
+  ASSERT_TRUE(video.get() != NULL);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+  const double psnr = GetAveragePsnr();
+  EXPECT_GT(psnr, GetPsnrThreshold());
+}
+
+TEST_P(EndToEndTestAdaptiveRDThresh, EndtoEndAdaptiveRDThreshRowMT) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_threads = 2;
+  ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+                                       0, 400);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+#if CONFIG_VP9_DECODER
+TEST_P(EndToEndTestLoopFilterThreading, TileCountChange) {
+  ::libvpx_test::RandomVideoSource video;
+  video.SetSize(4096, 2160);
+  video.set_limit(10);
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+#endif  // CONFIG_VP9_DECODER
+
 VP9_INSTANTIATE_TEST_CASE(EndToEndTestLarge,
                          ::testing::ValuesIn(kEncodingModeVectors),
                          ::testing::ValuesIn(kTestVectors),
                          ::testing::ValuesIn(kCpuUsedVectors));
+
+VP9_INSTANTIATE_TEST_CASE(EndToEndTestAdaptiveRDThresh,
+                          ::testing::Values(5, 6, 7), ::testing::Values(8, 9));
+
+#if CONFIG_VP9_DECODER
+VP9_INSTANTIATE_TEST_CASE(EndToEndTestLoopFilterThreading, ::testing::Bool(),
+                          ::testing::Range(2, 6));
+#endif  // CONFIG_VP9_DECODER
 }  // namespace
diff --git a/libs/libvpx/test/vp9_ethread_test.cc b/libs/libvpx/test/vp9_ethread_test.cc
index 6b7e512116c..6de76e9e555 100644
--- a/libs/libvpx/test/vp9_ethread_test.cc
+++ b/libs/libvpx/test/vp9_ethread_test.cc
@@ -387,7 +387,7 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
  const double multi_thr_psnr = GetAveragePsnr();
 
-  EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.1);
+  EXPECT_NEAR(single_thr_psnr, multi_thr_psnr, 0.2);
 }
 
 INSTANTIATE_TEST_CASE_P(
@@ -409,7 +409,7 @@ INSTANTIATE_TEST_CASE_P(
    ::testing::Values(::libvpx_test::kTwoPassGood,
                      ::libvpx_test::kOnePassGood,
                      ::libvpx_test::kRealTime),
-    ::testing::Range(3, 9),    // cpu_used
+    ::testing::Range(3, 10),   // cpu_used
    ::testing::Range(0, 3),    // tile_columns
    ::testing::Range(2, 5)));  // threads
 
diff --git a/libs/libvpx/test/vp9_intrapred_test.cc b/libs/libvpx/test/vp9_intrapred_test.cc
index
39c5e79ebd9..58091f875b1 100644 --- a/libs/libvpx/test/vp9_intrapred_test.cc +++ b/libs/libvpx/test/vp9_intrapred_test.cc @@ -130,6 +130,12 @@ TEST_P(VP9IntraPredTest, IntraPredTests) { RunTest(left_col, above_data, dst, ref_dst); } +// Instantiate a token test to avoid -Wuninitialized warnings when none of the +// other tests are enabled. +INSTANTIATE_TEST_CASE_P( + C, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_4x4_c, + &vpx_d45_predictor_4x4_c, 4, 8))); #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, VP9IntraPredTest, @@ -378,58 +384,61 @@ INSTANTIATE_TEST_CASE_P( 8))); #endif // HAVE_MSA -#if HAVE_VSX -INSTANTIATE_TEST_CASE_P( - VSX, VP9IntraPredTest, - ::testing::Values( +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 IntraPredParam(&vpx_d45_predictor_8x8_vsx, &vpx_d45_predictor_8x8_c, 8, 8), - IntraPredParam(&vpx_d45_predictor_16x16_vsx, &vpx_d45_predictor_16x16_c, - 16, 8), - IntraPredParam(&vpx_d45_predictor_32x32_vsx, &vpx_d45_predictor_32x32_c, - 32, 8), IntraPredParam(&vpx_d63_predictor_8x8_vsx, &vpx_d63_predictor_8x8_c, 8, 8), - IntraPredParam(&vpx_d63_predictor_16x16_vsx, &vpx_d63_predictor_16x16_c, - 16, 8), - IntraPredParam(&vpx_d63_predictor_32x32_vsx, &vpx_d63_predictor_32x32_c, - 32, 8), - IntraPredParam(&vpx_dc_128_predictor_16x16_vsx, - &vpx_dc_128_predictor_16x16_c, 16, 8), - IntraPredParam(&vpx_dc_128_predictor_32x32_vsx, - &vpx_dc_128_predictor_32x32_c, 32, 8), - IntraPredParam(&vpx_dc_left_predictor_16x16_vsx, - &vpx_dc_left_predictor_16x16_c, 16, 8), - IntraPredParam(&vpx_dc_left_predictor_32x32_vsx, - &vpx_dc_left_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_predictor_8x8_vsx, &vpx_dc_predictor_8x8_c, 8, 8), - IntraPredParam(&vpx_dc_predictor_16x16_vsx, &vpx_dc_predictor_16x16_c, - 16, 8), - IntraPredParam(&vpx_dc_predictor_32x32_vsx, &vpx_dc_predictor_32x32_c, - 32, 8), - IntraPredParam(&vpx_dc_top_predictor_16x16_vsx, - &vpx_dc_top_predictor_16x16_c, 16, 8), - IntraPredParam(&vpx_dc_top_predictor_32x32_vsx, - &vpx_dc_top_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_h_predictor_4x4_vsx, &vpx_h_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_h_predictor_8x8_vsx, &vpx_h_predictor_8x8_c, 8, 8), - IntraPredParam(&vpx_h_predictor_16x16_vsx, &vpx_h_predictor_16x16_c, 16, - 8), - IntraPredParam(&vpx_h_predictor_32x32_vsx, &vpx_h_predictor_32x32_c, 32, - 8), IntraPredParam(&vpx_tm_predictor_4x4_vsx, &vpx_tm_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_tm_predictor_8x8_vsx, &vpx_tm_predictor_8x8_c, 8, 8), - IntraPredParam(&vpx_tm_predictor_16x16_vsx, &vpx_tm_predictor_16x16_c, - 16, 8), - IntraPredParam(&vpx_tm_predictor_32x32_vsx, &vpx_tm_predictor_32x32_c, - 32, 8), - IntraPredParam(&vpx_v_predictor_16x16_vsx, &vpx_v_predictor_16x16_c, 16, - 8), - IntraPredParam(&vpx_v_predictor_32x32_vsx, &vpx_v_predictor_32x32_c, 32, - 8))); +#endif + +#if HAVE_VSX +INSTANTIATE_TEST_CASE_P( + VSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_d45_predictor_16x16_vsx, + &vpx_d45_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d45_predictor_32x32_vsx, + &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_16x16_vsx, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_vsx, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_128_predictor_16x16_vsx, + &vpx_dc_128_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_128_predictor_32x32_vsx, + &vpx_dc_128_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_left_predictor_16x16_vsx, + &vpx_dc_left_predictor_16x16_c, 16, 8), + 
IntraPredParam(&vpx_dc_left_predictor_32x32_vsx, + &vpx_dc_left_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_predictor_16x16_vsx, + &vpx_dc_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_predictor_32x32_vsx, + &vpx_dc_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_dc_top_predictor_16x16_vsx, + &vpx_dc_top_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_dc_top_predictor_32x32_vsx, + &vpx_dc_top_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_h_predictor_16x16_vsx, + &vpx_h_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_h_predictor_32x32_vsx, + &vpx_h_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_tm_predictor_16x16_vsx, + &vpx_tm_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_tm_predictor_32x32_vsx, + &vpx_tm_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_v_predictor_16x16_vsx, + &vpx_v_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_v_predictor_32x32_vsx, + &vpx_v_predictor_32x32_c, 32, 8))); #endif // HAVE_VSX #if CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/test/vp9_lossless_test.cc b/libs/libvpx/test/vp9_lossless_test.cc index 703b55e9bdd..5cf0a41da4d 100644 --- a/libs/libvpx/test/vp9_lossless_test.cc +++ b/libs/libvpx/test/vp9_lossless_test.cc @@ -38,7 +38,7 @@ class LosslessTest virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) { - if (video->frame() == 1) { + if (video->frame() == 0) { // Only call Control if quantizer > 0 to verify that using quantizer // alone will activate lossless if (cfg_.rc_max_quantizer > 0 || cfg_.rc_min_quantizer > 0) { diff --git a/libs/libvpx/test/vp9_motion_vector_test.cc b/libs/libvpx/test/vp9_motion_vector_test.cc index 1030204ae32..b556a1c3784 100644 --- a/libs/libvpx/test/vp9_motion_vector_test.cc +++ b/libs/libvpx/test/vp9_motion_vector_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
 */
 
+#include <memory>
+
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "test/codec_factory.h"
@@ -22,7 +24,7 @@ namespace {
 // Encoding modes
 const libvpx_test::TestMode kEncodingModeVectors[] = {
  ::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood,
-  ::libvpx_test::kRealTime,
+  ::libvpx_test::kRealTime
 };
 
 // Encoding speeds
@@ -59,7 +61,7 @@ class MotionVectorTestLarge
 
  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                  ::libvpx_test::Encoder *encoder) {
-    if (video->frame() == 1) {
+    if (video->frame() == 0) {
      encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
      encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_);
      if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -81,7 +83,7 @@ TEST_P(MotionVectorTestLarge, OverallTest) {
  cfg_.g_profile = 0;
  init_flags_ = VPX_CODEC_USE_PSNR;
 
-  testing::internal::scoped_ptr<libvpx_test::VideoSource> video;
+  std::unique_ptr<libvpx_test::VideoSource> video;
  video.reset(new libvpx_test::YUVVideoSource(
      "niklas_640_480_30.yuv", VPX_IMG_FMT_I420, 3840, 2160,  // 2048, 1080,
      30, 1, 0, 5));
diff --git a/libs/libvpx/test/vp9_quantize_test.cc b/libs/libvpx/test/vp9_quantize_test.cc
index b18d4522ceb..cce6b6f1981 100644
--- a/libs/libvpx/test/vp9_quantize_test.cc
+++ b/libs/libvpx/test/vp9_quantize_test.cc
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
+#include <tuple>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
@@ -18,6 +19,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/buffer.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
@@ -26,6 +28,7 @@
 #include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"
 #include "vpx_ports/vpx_timer.h"
 
 using libvpx_test::ACMRandom;
@@ -41,8 +44,8 @@ typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
                             tran_low_t *dqcoeff, const int16_t *dequant,
                             uint16_t *eob, const int16_t *scan,
                             const int16_t *iscan);
-typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
-    QuantizeParam;
+typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int, bool>
+    QuantizeParam;
 
 // Wrapper for FP version which does not use zbin or quant_shift.
@@ -67,10 +70,13 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
                    scan, iscan);
 }
 
-class VP9QuantizeBase {
+class VP9QuantizeBase : public AbstractBench {
 public:
  VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
-      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+      : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp),
+        coeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 16)),
+        qcoeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)),
+        dqcoeff_(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)) {
    max_value_ = (1 << bit_depth_) - 1;
    zbin_ptr_ =
        reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
@@ -86,6 +92,9 @@ class VP9QuantizeBase {
        vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
    dequant_ptr_ = reinterpret_cast<int16_t *>(
        vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
+
+    r_ptr_ = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+    q_ptr_ = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
+  }
 
  ~VP9QuantizeBase() {
@@ -118,6 +127,15 @@
  int max_value_;
  const int max_size_;
  const bool is_fp_;
+  Buffer<tran_low_t> coeff_;
+  Buffer<tran_low_t> qcoeff_;
+  Buffer<tran_low_t> dqcoeff_;
+  int16_t *r_ptr_;
+  int16_t *q_ptr_;
+  int count_;
+  int skip_block_;
+  const scan_order *scan_;
+  uint16_t eob_;
 };
 
 class VP9QuantizeTest : public VP9QuantizeBase,
@@ -128,21 +146,29 @@
        quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
 
 protected:
+  virtual void Run();
  const QuantizeFunc quantize_op_;
  const QuantizeFunc ref_quantize_op_;
 };
 
+void VP9QuantizeTest::Run() {
+  quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_,
+               q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+               dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
+               scan_->iscan);
+}
+
 // This quantizer compares the AC coefficients to the quantization step size to
 // determine if further multiplication operations are needed.
 // Based on vp9_quantize_fp_sse2().
-void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
+inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                        int skip_block, const int16_t *round_ptr,
+                        const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan, int is_32x32) {
  int i, eob = -1;
-  const int thr = dequant_ptr[1] >> 1;
+  const int thr = dequant_ptr[1] >> (1 + is_32x32);
  (void)iscan;
  (void)skip_block;
  assert(!skip_block);
@@ -172,11 +198,24 @@ void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      // If all of the AC coeffs in a row has magnitude less than the
      // quantization step_size/2, quantize to zero.
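      // Illustrative aside (numbers assumed, not from the patch): with
      // dequant_ptr[1] = 32, thr is 32 >> 1 = 16 for regular blocks and
      // 32 >> 2 = 8 when is_32x32 is set, so the nzflag_cnt check below
      // takes the multiply path only when at least one coefficient in the
      // row of 16 exceeds that threshold.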
      if (nzflag_cnt < 16) {
-        int tmp =
-            clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-        tmp = (tmp * quant_ptr[rc != 0]) >> 16;
+        int tmp;
+        int _round;
+
+        if (is_32x32) {
+          _round = ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+        } else {
+          _round = round_ptr[rc != 0];
+        }
+        tmp = clamp(abs_coeff[y] + _round, INT16_MIN, INT16_MAX);
+        tmp = (tmp * quant_ptr[rc != 0]) >> (16 - is_32x32);
        qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y];
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+        if (is_32x32) {
+          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+        } else {
+          dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+        }
      } else {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
@@ -195,6 +234,26 @@
  *eob_ptr = eob + 1;
 }
 
+void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                      int skip_block, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                      uint16_t *eob_ptr, const int16_t *scan,
+                      const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
+}
+
+void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            int skip_block, const int16_t *round_ptr,
+                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                            uint16_t *eob_ptr, const int16_t *scan,
+                            const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+              dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
+}
+
 void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
                          int16_t *quant, int16_t *quant_shift,
                          int16_t *dequant, int16_t *round_fp,
@@ -236,19 +295,17 @@
 TEST_P(VP9QuantizeTest, OperationCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
-  ASSERT_TRUE(coeff.Init());
-  Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
-  ASSERT_TRUE(qcoeff.Init());
-  Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
-  ASSERT_TRUE(dqcoeff.Init());
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
  Buffer<tran_low_t> ref_qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
  ASSERT_TRUE(ref_qcoeff.Init());
  Buffer<tran_low_t> ref_dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
  ASSERT_TRUE(ref_dqcoeff.Init());
-  uint16_t eob, ref_eob;
+  uint16_t ref_eob = 0;
+  eob_ = 0;
 
  for (int i = 0; i < number_of_iterations; ++i) {
    // Test skip block for the first three iterations to catch all the different
@@ -261,33 +318,31 @@
      sz = TX_32X32;
    }
    const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    const int count = (4 << sz) * (4 << sz);
-    coeff.Set(&rnd, -max_value_, max_value_);
+    scan_ = &vp9_scan_orders[sz][tx_type];
+    count_ = (4 << sz) * (4 << sz);
+    coeff_.Set(&rnd, -max_value_, max_value_);
    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                         quant_fp_ptr_);
-    int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
-    int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
-    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
-                     q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
-                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
-                     scan_order->scan, scan_order->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_,
+                     r_ptr_, q_ptr_, quant_shift_ptr_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
-        quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
-        dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+        coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, r_ptr_, q_ptr_,
+        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
 
-    EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
-    EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
+    EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
+    EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
 
-    EXPECT_EQ(eob, ref_eob);
+    EXPECT_EQ(eob_, ref_eob);
 
    if (HasFailure()) {
      printf("Failure on iteration %d.\n", i);
-      qcoeff.PrintDifference(ref_qcoeff);
-      dqcoeff.PrintDifference(ref_dqcoeff);
+      qcoeff_.PrintDifference(ref_qcoeff);
+      dqcoeff_.PrintDifference(ref_dqcoeff);
      return;
    }
  }
@@ -295,22 +350,21 @@
 TEST_P(VP9QuantizeTest, EOBCheck) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
-  ASSERT_TRUE(coeff.Init());
-  Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
-  ASSERT_TRUE(qcoeff.Init());
-  Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
-  ASSERT_TRUE(dqcoeff.Init());
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
  Buffer<tran_low_t> ref_qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
  ASSERT_TRUE(ref_qcoeff.Init());
  Buffer<tran_low_t> ref_dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
  ASSERT_TRUE(ref_dqcoeff.Init());
-  uint16_t eob, ref_eob;
+  uint16_t ref_eob = 0;
+  eob_ = 0;
+
+  const uint32_t max_index = max_size_ * max_size_ - 1;
 
  for (int i = 0; i < number_of_iterations; ++i) {
-    const int skip_block = 0;
+    skip_block_ = 0;
    TX_SIZE sz;
    if (max_size_ == 16) {
      sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
@@ -318,38 +372,36 @@
      sz = TX_32X32;
    }
    const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
-    const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-    int count = (4 << sz) * (4 << sz);
+    scan_ = &vp9_scan_orders[sz][tx_type];
+    count_ = (4 << sz) * (4 << sz);
    // Two random entries
-    coeff.Set(0);
-    coeff.TopLeftPixel()[rnd(count)] =
+    coeff_.Set(0);
+    coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] =
        static_cast<tran_low_t>(rnd.RandRange(max_value_ * 2)) - max_value_;
-    coeff.TopLeftPixel()[rnd(count)] =
+    coeff_.TopLeftPixel()[rnd.RandRange(count_) & max_index] =
        static_cast<tran_low_t>(rnd.RandRange(max_value_ * 2)) - max_value_;
    GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                         quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                         quant_fp_ptr_);
-    int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
-    int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
-    ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
-                     q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
-                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
-                     scan_order->scan, scan_order->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_,
+                     r_ptr_, q_ptr_, quant_shift_ptr_,
+                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
+                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
    ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
-        quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
-        dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+        coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, q_ptr_,
+        quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
+        dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
 
-    EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
-    EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
+    EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff));
+    EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff));
 
-    EXPECT_EQ(eob, ref_eob);
+    EXPECT_EQ(eob_, ref_eob);
 
    if (HasFailure()) {
      printf("Failure on iteration %d.\n", i);
-      qcoeff.PrintDifference(ref_qcoeff);
-      dqcoeff.PrintDifference(ref_dqcoeff);
+      qcoeff_.PrintDifference(ref_qcoeff);
+      dqcoeff_.PrintDifference(ref_dqcoeff);
      return;
    }
  }
@@ -357,13 +409,9 @@
 TEST_P(VP9QuantizeTest, DISABLED_Speed) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
-  ASSERT_TRUE(coeff.Init());
-  Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
-  ASSERT_TRUE(qcoeff.Init());
-  Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
-  ASSERT_TRUE(dqcoeff.Init());
-  uint16_t eob;
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
 
  TX_SIZE starting_sz, ending_sz;
  if (max_size_ == 16) {
@@ -377,18 +425,16 @@
  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
    // zbin > coeff, zbin < coeff.
    for (int i = 0; i < 2; ++i) {
-      const int skip_block = 0;
+      skip_block_ = 0;
      // TX_TYPE defines the scan order. That is not relevant to the speed test.
      // Pick the first one.
      const TX_TYPE tx_type = DCT_DCT;
-      const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
-      const int count = (4 << sz) * (4 << sz);
+      count_ = (4 << sz) * (4 << sz);
+      scan_ = &vp9_scan_orders[sz][tx_type];
      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                           quant_fp_ptr_);
-      int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
-      int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
 
      if (i == 0) {
        // When |coeff values| are less than zbin the results are 0.
@@ -399,40 +445,33 @@ TEST_P(VP9QuantizeTest, DISABLED_Speed) {
          threshold = 200;
        }
        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
-        coeff.Set(&rnd, -99, 99);
+        coeff_.Set(&rnd, -99, 99);
      } else if (i == 1) {
        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
-        coeff.Set(&rnd, -500, 500);
+        coeff_.Set(&rnd, -500, 500);
      }
 
-      vpx_usec_timer timer;
-      vpx_usec_timer_start(&timer);
-      for (int j = 0; j < 100000000 / count; ++j) {
-        quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
-                     q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(),
-                     dqcoeff.TopLeftPixel(), dequant_ptr_, &eob,
-                     scan_order->scan, scan_order->iscan);
-      }
-      vpx_usec_timer_mark(&timer);
-      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
-      if (i == 0) printf("Bypass calculations.\n");
-      if (i == 1) printf("Full calculations.\n");
-      printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz,
-             elapsed_time / 1000);
+      RunNTimes(10000000 / count_);
+      const char *type =
+          (i == 0) ? "Bypass calculations " : "Full calculations ";
+      char block_size[16];
+      snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+      char title[100];
+      snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+      PrintMedian(title);
    }
-    printf("\n");
  }
 }
 
-using std::tr1::make_tuple;
+using std::make_tuple;
 
 #if HAVE_SSE2
 #if CONFIG_VP9_HIGHBITDEPTH
-// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds.
-// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8),
 INSTANTIATE_TEST_CASE_P(
    SSE2, VP9QuantizeTest,
    ::testing::Values(
+        make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
                   VPX_BITS_8, 16, false),
        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
@@ -457,51 +496,52 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_SSSE3
 #if ARCH_X86_64
 INSTANTIATE_TEST_CASE_P(
    SSSE3, VP9QuantizeTest,
    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_ssse3,
+                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+                                 false),
                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>,
                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
-                                 16, true)));
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true)));
 #else
-INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest,
-                        ::testing::Values(make_tuple(&vpx_quantize_b_ssse3,
-                                                     &vpx_quantize_b_c,
-                                                     VPX_BITS_8, 16, false)));
-#endif
-
-#if ARCH_X86_64
-// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test.
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3,
+    SSSE3, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_ssse3,
                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false),
-                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
-                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
-                                 VPX_BITS_8, 32, true)));
+                                 false)));
+
 #endif  // ARCH_X86_64
-#endif  // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSSE3
 
-// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or
-// highbitdepth configurations.
-#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_AVX
+INSTANTIATE_TEST_CASE_P(AVX, VP9QuantizeTest,
+                        ::testing::Values(make_tuple(&vpx_quantize_b_avx,
+                                                     &vpx_quantize_b_c,
+                                                     VPX_BITS_8, 16, false),
+                                          make_tuple(&vpx_quantize_b_32x32_avx,
+                                                     &vpx_quantize_b_32x32_c,
+                                                     VPX_BITS_8, 32, false)));
+#endif  // HAVE_AVX
+
+#if ARCH_X86_64 && HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(
-    AVX, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      // Even though SSSE3 and AVX do not match the reference
-                      // code, we can keep them in sync with each other.
-                      make_tuple(&vpx_quantize_b_32x32_avx,
-                                 &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32,
-                                 false)));
-#endif  // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                                 &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
+                                 16, true)));
+#endif  // HAVE_AVX2
 
-// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds.
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(
    NEON, VP9QuantizeTest,
    ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
@@ -515,7 +555,23 @@
                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
                                 VPX_BITS_8, 32, true)));
-#endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_NEON
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    VSX, VP9QuantizeTest,
+    ::testing::Values(make_tuple(&vpx_quantize_b_vsx, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_vsx,
+                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+                                 false),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_vsx>,
+                                 &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8,
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_vsx>,
+                                 &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
+                                 VPX_BITS_8, 32, true)));
+#endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
 
 // Only useful to compare "Speed" test results.
 INSTANTIATE_TEST_CASE_P(
@@ -528,6 +584,9 @@
                &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
     make_tuple(&QuantFPWrapper<quantize_fp_nz_c>,
                &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+    make_tuple(&QuantFPWrapper<quantize_fp_32x32_nz_c>,
+               &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32,
+               true),
     make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
                &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
                true)));
diff --git a/libs/libvpx/test/vp9_scale_test.cc b/libs/libvpx/test/vp9_scale_test.cc
index 5d7d38e89ad..f3e7f0a0e2b 100644
--- a/libs/libvpx/test/vp9_scale_test.cc
+++ b/libs/libvpx/test/vp9_scale_test.cc
@@ -47,7 +47,7 @@ class ScaleTest : public VpxScaleBase,
        scale_fn_(&img_, &dst_img_, filter_type, phase_scaler));
  }
 
-  void RunTest() {
+  void RunTest(INTERP_FILTER filter_type) {
    static const int kNumSizesToTest = 20;
    static const int kNumScaleFactorsToTest = 4;
    static const int kSizesToTest[] = {
@@ -55,50 +55,48 @@
      22, 24, 26, 28, 30, 32, 34, 68, 128, 134
    };
    static const int kScaleFactors[] = { 1, 2, 3, 4 };
-    for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
-      for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
-        for (int h = 0; h < kNumSizesToTest; ++h) {
-          const int src_height = kSizesToTest[h];
-          for (int w = 0; w < kNumSizesToTest; ++w) {
-            const int src_width = kSizesToTest[w];
-            for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
-                 ++sf_up_idx) {
-              const int sf_up = kScaleFactors[sf_up_idx];
-              for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
-                   ++sf_down_idx) {
-                const int sf_down = kScaleFactors[sf_down_idx];
-                const int dst_width = src_width * sf_up / sf_down;
-                const int dst_height = src_height * sf_up / sf_down;
-                if (sf_up == sf_down && sf_up != 1) {
-                  continue;
-                }
-                // I420 frame width and height must be even.
-                if (!dst_width || !dst_height || dst_width & 1 ||
-                    dst_height & 1) {
-                  continue;
-                }
-                // vpx_convolve8_c() has restriction on the step which cannot
-                // exceed 64 (ratio 1 to 4).
-                if (src_width > 4 * dst_width || src_height > 4 * dst_height) {
-                  continue;
-                }
-                ASSERT_NO_FATAL_FAILURE(ResetScaleImages(
-                    src_width, src_height, dst_width, dst_height));
-                ReferenceScaleFrame(filter_type, phase_scaler);
-                ScaleFrame(filter_type, phase_scaler);
-                if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc,
-                           ref_img_.frame_size)) {
-                  printf(
-                      "filter_type = %d, phase_scaler = %d, src_width = %4d, "
-                      "src_height = %4d, dst_width = %4d, dst_height = %4d, "
-                      "scale factor = %d:%d\n",
-                      filter_type, phase_scaler, src_width, src_height,
-                      dst_width, dst_height, sf_down, sf_up);
-                  PrintDiff();
-                }
-                CompareImages(dst_img_);
-                DeallocScaleImages();
+    for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
+      for (int h = 0; h < kNumSizesToTest; ++h) {
+        const int src_height = kSizesToTest[h];
+        for (int w = 0; w < kNumSizesToTest; ++w) {
+          const int src_width = kSizesToTest[w];
+          for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest;
+               ++sf_up_idx) {
+            const int sf_up = kScaleFactors[sf_up_idx];
+            for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest;
+                 ++sf_down_idx) {
+              const int sf_down = kScaleFactors[sf_down_idx];
+              const int dst_width = src_width * sf_up / sf_down;
+              const int dst_height = src_height * sf_up / sf_down;
+              if (sf_up == sf_down && sf_up != 1) {
+                continue;
              }
+              // I420 frame width and height must be even.
+              if (!dst_width || !dst_height || dst_width & 1 ||
+                  dst_height & 1) {
+                continue;
+              }
+              // vpx_convolve8_c() has restriction on the step which cannot
+              // exceed 64 (ratio 1 to 4).
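+              // Illustrative aside (sizes taken from kSizesToTest, results
+              // computed by hand): src 134x68 at 1:4 gives dst 33x17, skipped
+              // for odd dimensions; at 1:2 it gives 67x34, skipped for the
+              // odd width; at 2:1 it gives 268x136 and is tested. A 1:4
+              // downscale such as 512 -> 128 sits exactly at the 4x cap
+              // checked below.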
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages(src_width, src_height, + dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); } } } @@ -145,7 +143,10 @@ class ScaleTest : public VpxScaleBase, ScaleFrameFunc scale_fn_; }; -TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } +TEST_P(ScaleTest, ScaleFrame_EightTap) { RunTest(EIGHTTAP); } +TEST_P(ScaleTest, ScaleFrame_EightTapSmooth) { RunTest(EIGHTTAP_SMOOTH); } +TEST_P(ScaleTest, ScaleFrame_EightTapSharp) { RunTest(EIGHTTAP_SHARP); } +TEST_P(ScaleTest, ScaleFrame_Bilinear) { RunTest(BILINEAR); } TEST_P(ScaleTest, DISABLED_Speed) { static const int kCountSpeedTestBlock = 100; diff --git a/libs/libvpx/test/vp9_spatial_svc_encoder.sh b/libs/libvpx/test/vp9_spatial_svc_encoder.sh deleted file mode 100755 index 65031073f87..00000000000 --- a/libs/libvpx/test/vp9_spatial_svc_encoder.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/sh -## -## Copyright (c) 2014 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## -## This file tests the libvpx vp9_spatial_svc_encoder example. To add new -## tests to to this file, do the following: -## 1. Write a shell function (this is your test). -## 2. Add the function to vp9_spatial_svc_tests (on a new line). -## -. $(dirname $0)/tools_common.sh - -# Environment check: $YUV_RAW_INPUT is required. -vp9_spatial_svc_encoder_verify_environment() { - if [ ! -e "${YUV_RAW_INPUT}" ]; then - echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." - return 1 - fi -} - -# Runs vp9_spatial_svc_encoder. $1 is the test name. -vp9_spatial_svc_encoder() { - local readonly \ - encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}" - local readonly test_name="$1" - local readonly \ - output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" - local readonly frames_to_encode=10 - local readonly max_kf=9999 - - shift - - if [ ! -x "${encoder}" ]; then - elog "${encoder} does not exist or is not executable." - return 1 - fi - - eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \ - -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ - "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} - - [ -e "${output_file}" ] || return 1 -} - -# Each test is run with layer count 1-$vp9_ssvc_test_layers. 
-vp9_ssvc_test_layers=5
-
-vp9_spatial_svc() {
-  if [ "$(vp9_encode_available)" = "yes" ]; then
-    local readonly test_name="vp9_spatial_svc"
-    for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
-      vp9_spatial_svc_encoder "${test_name}" -sl ${layers}
-    done
-  fi
-}
-
-readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i
-                                DISABLED_vp9_spatial_svc_mode_altip
-                                DISABLED_vp9_spatial_svc_mode_ip
-                                DISABLED_vp9_spatial_svc_mode_gf
-                                vp9_spatial_svc"
-
-if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then
-  run_tests \
-    vp9_spatial_svc_encoder_verify_environment \
-    "${vp9_spatial_svc_tests}"
-fi
diff --git a/libs/libvpx/test/vp9_subtract_test.cc b/libs/libvpx/test/vp9_subtract_test.cc
index 62845ad6154..67e8de6c74b 100644
--- a/libs/libvpx/test/vp9_subtract_test.cc
+++ b/libs/libvpx/test/vp9_subtract_test.cc
@@ -14,9 +14,11 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
+#include "test/bench.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "vp9/common/vp9_blockd.h"
+#include "vpx_ports/msvc.h"
 #include "vpx_mem/vpx_mem.h"
 
 typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
@@ -26,62 +28,101 @@
 
 namespace vp9 {
 
-class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+class VP9SubtractBlockTest : public AbstractBench,
+                             public ::testing::TestWithParam<SubtractFunc> {
 public:
  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  virtual void Run() {
+    GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+               block_width_, pred_, block_width_);
+  }
+
+  void SetupBlocks(BLOCK_SIZE bsize) {
+    block_width_ = 4 * num_4x4_blocks_wide_lookup[bsize];
+    block_height_ = 4 * num_4x4_blocks_high_lookup[bsize];
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(*diff_) * block_width_ * block_height_ * 2));
+    pred_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width_ * block_height_ * 2));
+    src_ = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width_ * block_height_ * 2));
+  }
+
+  int block_width_;
+  int block_height_;
+  int16_t *diff_;
+  uint8_t *pred_;
+  uint8_t *src_;
 };
 
 using libvpx_test::ACMRandom;
 
+TEST_P(VP9SubtractBlockTest, DISABLED_Speed) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+    SetupBlocks(bsize);
+
+    RunNTimes(100000000 / (block_height_ * block_width_));
+    char block_size[16];
+    snprintf(block_size, sizeof(block_size), "%dx%d", block_height_,
+             block_width_);
+    char title[100];
+    snprintf(title, sizeof(title), "%8s ", block_size);
+    PrintMedian(title);
+
+    vpx_free(diff_);
+    vpx_free(pred_);
+    vpx_free(src_);
+  }
+}
+
 TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  // FIXME(rbultje) split in its own file
 
  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
-    const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
-    const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
-    int16_t *diff = reinterpret_cast<int16_t *>(
-        vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
-    uint8_t *pred = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
-    uint8_t *src = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
+    SetupBlocks(bsize);
 
    for (int n = 0; n < 100; n++) {
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width * 2; ++c) {
-          src[r * block_width * 2 + c] = rnd.Rand8();
-          pred[r * block_width * 2 + c] = rnd.Rand8();
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_ * 2; ++c) {
+          src_[r * block_width_ * 2 + c] = rnd.Rand8();
+          pred_[r * block_width_ * 2 + c] = rnd.Rand8();
        }
      }
 
-      GetParam()(block_height, block_width, diff, block_width, src, block_width,
-                 pred, block_width);
+      GetParam()(block_height_, block_width_, diff_, block_width_, src_,
+                 block_width_, pred_, block_width_);
 
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(diff[r * block_width + c],
-                    (src[r * block_width + c] - pred[r * block_width + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_; ++c) {
+          EXPECT_EQ(diff_[r * block_width_ + c],
+                    (src_[r * block_width_ + c] - pred_[r * block_width_ + c]))
+              << "r = " << r << ", c = " << c
+              << ", bs = " << static_cast<int>(bsize);
        }
      }
 
-      GetParam()(block_height, block_width, diff, block_width * 2, src,
-                 block_width * 2, pred, block_width * 2);
+      GetParam()(block_height_, block_width_, diff_, block_width_ * 2, src_,
+                 block_width_ * 2, pred_, block_width_ * 2);
 
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(
-              diff[r * block_width * 2 + c],
-              (src[r * block_width * 2 + c] - pred[r * block_width * 2 + c]))
-              << "r = " << r << ", c = " << c << ", bs = " << bsize;
+      for (int r = 0; r < block_height_; ++r) {
+        for (int c = 0; c < block_width_; ++c) {
+          EXPECT_EQ(diff_[r * block_width_ * 2 + c],
+                    (src_[r * block_width_ * 2 + c] -
+                     pred_[r * block_width_ * 2 + c]))
+              << "r = " << r << ", c = " << c
+              << ", bs = " << static_cast<int>(bsize);
        }
      }
    }
-    vpx_free(diff);
-    vpx_free(pred);
-    vpx_free(src);
+    vpx_free(diff_);
+    vpx_free(pred_);
+    vpx_free(src_);
  }
 }
 
@@ -106,4 +147,9 @@
 INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest,
                        ::testing::Values(vpx_subtract_block_mmi));
 #endif
 
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_vsx));
+#endif
+
 }  // namespace vp9
diff --git a/libs/libvpx/test/vp9_thread_test.cc b/libs/libvpx/test/vp9_thread_test.cc
index 576f5e906b0..31b6fe57b4a 100644
--- a/libs/libvpx/test/vp9_thread_test.cc
+++ b/libs/libvpx/test/vp9_thread_test.cc
@@ -147,7 +147,6 @@ TEST(VPxWorkerThreadTest, TestInterfaceAPI) {
 
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests
-
 #if CONFIG_WEBM_IO
 struct FileList {
  const char *name;
@@ -197,6 +196,7 @@ void DecodeFiles(const FileList files[]) {
 
 // Note any worker that requires synchronization between other workers will
 // hang.
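// Illustrative aside (accessor and field names assumed from
// vpx_util/vpx_thread.h, not shown in this patch): a test can install the
// serial implementation below roughly like
//   VPxWorkerInterface winterface = *vpx_get_worker_interface();
//   winterface.init = impl::Init;
//   winterface.reset = impl::Reset;
//   winterface.execute = impl::Execute;
//   winterface.launch = impl::Launch;
//   winterface.end = impl::End;
//   vpx_set_worker_interface(&winterface);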
namespace impl { +namespace { void Init(VPxWorker *const worker) { memset(worker, 0, sizeof(*worker)); } int Reset(VPxWorker *const /*worker*/) { return 1; } @@ -209,6 +209,7 @@ void Execute(VPxWorker *const worker) { void Launch(VPxWorker *const worker) { Execute(worker); } void End(VPxWorker *const /*worker*/) {} +} // namespace } // namespace impl TEST(VPxWorkerThreadTest, TestSerialInterface) { diff --git a/libs/libvpx/test/vpx_scale_test.cc b/libs/libvpx/test/vpx_scale_test.cc index ac75dceb230..4fad3069afd 100644 --- a/libs/libvpx/test/vpx_scale_test.cc +++ b/libs/libvpx/test/vpx_scale_test.cc @@ -20,6 +20,15 @@ #include "vpx_scale/yv12config.h" namespace libvpx_test { +namespace { + +#if ARCH_ARM || (ARCH_MIPS && !HAVE_MIPS64) || ARCH_X86 +// Avoid OOM failures on 32-bit platforms. +const int kNumSizesToTest = 7; +#else +const int kNumSizesToTest = 8; +#endif +const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 3840, 16383 }; typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, @@ -37,13 +46,6 @@ class ExtendBorderTest void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } void RunTest() { -#if ARCH_ARM - // Some arm devices OOM when trying to allocate the largest buffers. - static const int kNumSizesToTest = 6; -#else - static const int kNumSizesToTest = 7; -#endif - static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); @@ -76,13 +78,6 @@ class CopyFrameTest : public VpxScaleBase, } void RunTest() { -#if ARCH_ARM - // Some arm devices OOM when trying to allocate the largest buffers. - static const int kNumSizesToTest = 6; -#else - static const int kNumSizesToTest = 7; -#endif - static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); @@ -102,4 +97,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, ::testing::Values(vp8_yv12_copy_frame_c)); +} // namespace } // namespace libvpx_test diff --git a/libs/libvpx/test/vpx_scale_test.h b/libs/libvpx/test/vpx_scale_test.h index dcbd02b91fa..11c259ae807 100644 --- a/libs/libvpx/test/vpx_scale_test.h +++ b/libs/libvpx/test/vpx_scale_test.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef TEST_VPX_SCALE_TEST_H_ -#define TEST_VPX_SCALE_TEST_H_ +#ifndef VPX_TEST_VPX_SCALE_TEST_H_ +#define VPX_TEST_VPX_SCALE_TEST_H_ #include "third_party/googletest/src/include/gtest/gtest.h" @@ -33,7 +33,8 @@ class VpxScaleBase { const int height) { memset(img, 0, sizeof(*img)); ASSERT_EQ( - 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)); + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)) + << "for width: " << width << " height: " << height; memset(img->buffer_alloc, kBufFiller, img->frame_size); } @@ -197,4 +198,4 @@ class VpxScaleBase { } // namespace libvpx_test -#endif // TEST_VPX_SCALE_TEST_H_ +#endif // VPX_TEST_VPX_SCALE_TEST_H_ diff --git a/libs/libvpx/test/vpx_temporal_svc_encoder.sh b/libs/libvpx/test/vpx_temporal_svc_encoder.sh index 56a7902f4f2..5e5bac8fa6a 100755 --- a/libs/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/libs/libvpx/test/vpx_temporal_svc_encoder.sh @@ -38,6 +38,7 @@ vpx_tsvc_encoder() { local output_file="${VPX_TEST_OUTPUT_DIR}/${output_file_base}" local timebase_num="1" local timebase_den="1000" + local timebase_den_y4m="30" local speed="6" local frame_drop_thresh="30" local max_threads="4" @@ -58,6 +59,12 @@ vpx_tsvc_encoder() { "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ "$@" ${devnull} + # Test for y4m input. + eval "${VPX_TEST_PREFIX}" "${encoder}" "${Y4M_720P_INPUT}" \ + "${output_file}" "${codec}" "${Y4M_720P_INPUT_WIDTH}" \ + "${Y4M_720P_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den_y4m}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} else eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ @@ -85,7 +92,7 @@ files_exist() { vpx_tsvc_encoder_vp8_mode_0() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_0" + local output_basename="vpx_tsvc_encoder_vp8_mode_0" vpx_tsvc_encoder vp8 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream files_exist "${output_basename}" 1 || return 1 @@ -94,7 +101,7 @@ vpx_tsvc_encoder_vp8_mode_0() { vpx_tsvc_encoder_vp8_mode_1() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_1" + local output_basename="vpx_tsvc_encoder_vp8_mode_1" vpx_tsvc_encoder vp8 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -103,7 +110,7 @@ vpx_tsvc_encoder_vp8_mode_1() { vpx_tsvc_encoder_vp8_mode_2() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_2" + local output_basename="vpx_tsvc_encoder_vp8_mode_2" vpx_tsvc_encoder vp8 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -112,7 +119,7 @@ vpx_tsvc_encoder_vp8_mode_2() { vpx_tsvc_encoder_vp8_mode_3() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_3" + local output_basename="vpx_tsvc_encoder_vp8_mode_3" vpx_tsvc_encoder vp8 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -121,7 +128,7 @@ vpx_tsvc_encoder_vp8_mode_3() { vpx_tsvc_encoder_vp8_mode_4() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_4" + local 
output_basename="vpx_tsvc_encoder_vp8_mode_4" vpx_tsvc_encoder vp8 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -130,7 +137,7 @@ vpx_tsvc_encoder_vp8_mode_4() { vpx_tsvc_encoder_vp8_mode_5() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_5" + local output_basename="vpx_tsvc_encoder_vp8_mode_5" vpx_tsvc_encoder vp8 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -139,7 +146,7 @@ vpx_tsvc_encoder_vp8_mode_5() { vpx_tsvc_encoder_vp8_mode_6() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_6" + local output_basename="vpx_tsvc_encoder_vp8_mode_6" vpx_tsvc_encoder vp8 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -148,7 +155,7 @@ vpx_tsvc_encoder_vp8_mode_6() { vpx_tsvc_encoder_vp8_mode_7() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_7" + local output_basename="vpx_tsvc_encoder_vp8_mode_7" vpx_tsvc_encoder vp8 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams files_exist "${output_basename}" 5 || return 1 @@ -157,7 +164,7 @@ vpx_tsvc_encoder_vp8_mode_7() { vpx_tsvc_encoder_vp8_mode_8() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_8" + local output_basename="vpx_tsvc_encoder_vp8_mode_8" vpx_tsvc_encoder vp8 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -166,7 +173,7 @@ vpx_tsvc_encoder_vp8_mode_8() { vpx_tsvc_encoder_vp8_mode_9() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_9" + local output_basename="vpx_tsvc_encoder_vp8_mode_9" vpx_tsvc_encoder vp8 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -175,7 +182,7 @@ vpx_tsvc_encoder_vp8_mode_9() { vpx_tsvc_encoder_vp8_mode_10() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_10" + local output_basename="vpx_tsvc_encoder_vp8_mode_10" vpx_tsvc_encoder vp8 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -184,7 +191,7 @@ vpx_tsvc_encoder_vp8_mode_10() { vpx_tsvc_encoder_vp8_mode_11() { if [ "$(vp8_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp8_mode_11" + local output_basename="vpx_tsvc_encoder_vp8_mode_11" vpx_tsvc_encoder vp8 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -193,7 +200,7 @@ vpx_tsvc_encoder_vp8_mode_11() { vpx_tsvc_encoder_vp9_mode_0() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_0" + local output_basename="vpx_tsvc_encoder_vp9_mode_0" vpx_tsvc_encoder vp9 "${output_basename}" 0 200 || return 1 # Mode 0 produces 1 stream files_exist "${output_basename}" 1 || return 1 @@ -202,7 +209,7 @@ vpx_tsvc_encoder_vp9_mode_0() { vpx_tsvc_encoder_vp9_mode_1() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_1" + local 
output_basename="vpx_tsvc_encoder_vp9_mode_1" vpx_tsvc_encoder vp9 "${output_basename}" 1 200 400 || return 1 # Mode 1 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -211,7 +218,7 @@ vpx_tsvc_encoder_vp9_mode_1() { vpx_tsvc_encoder_vp9_mode_2() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_2" + local output_basename="vpx_tsvc_encoder_vp9_mode_2" vpx_tsvc_encoder vp9 "${output_basename}" 2 200 400 || return 1 # Mode 2 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -220,7 +227,7 @@ vpx_tsvc_encoder_vp9_mode_2() { vpx_tsvc_encoder_vp9_mode_3() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_3" + local output_basename="vpx_tsvc_encoder_vp9_mode_3" vpx_tsvc_encoder vp9 "${output_basename}" 3 200 400 600 || return 1 # Mode 3 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -229,7 +236,7 @@ vpx_tsvc_encoder_vp9_mode_3() { vpx_tsvc_encoder_vp9_mode_4() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_4" + local output_basename="vpx_tsvc_encoder_vp9_mode_4" vpx_tsvc_encoder vp9 "${output_basename}" 4 200 400 600 || return 1 # Mode 4 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -238,7 +245,7 @@ vpx_tsvc_encoder_vp9_mode_4() { vpx_tsvc_encoder_vp9_mode_5() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_5" + local output_basename="vpx_tsvc_encoder_vp9_mode_5" vpx_tsvc_encoder vp9 "${output_basename}" 5 200 400 600 || return 1 # Mode 5 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -247,7 +254,7 @@ vpx_tsvc_encoder_vp9_mode_5() { vpx_tsvc_encoder_vp9_mode_6() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_6" + local output_basename="vpx_tsvc_encoder_vp9_mode_6" vpx_tsvc_encoder vp9 "${output_basename}" 6 200 400 600 || return 1 # Mode 6 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -256,7 +263,7 @@ vpx_tsvc_encoder_vp9_mode_6() { vpx_tsvc_encoder_vp9_mode_7() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_7" + local output_basename="vpx_tsvc_encoder_vp9_mode_7" vpx_tsvc_encoder vp9 "${output_basename}" 7 200 400 600 800 1000 || return 1 # Mode 7 produces 5 streams files_exist "${output_basename}" 5 || return 1 @@ -265,7 +272,7 @@ vpx_tsvc_encoder_vp9_mode_7() { vpx_tsvc_encoder_vp9_mode_8() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_8" + local output_basename="vpx_tsvc_encoder_vp9_mode_8" vpx_tsvc_encoder vp9 "${output_basename}" 8 200 400 || return 1 # Mode 8 produces 2 streams files_exist "${output_basename}" 2 || return 1 @@ -274,7 +281,7 @@ vpx_tsvc_encoder_vp9_mode_8() { vpx_tsvc_encoder_vp9_mode_9() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_9" + local output_basename="vpx_tsvc_encoder_vp9_mode_9" vpx_tsvc_encoder vp9 "${output_basename}" 9 200 400 600 || return 1 # Mode 9 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -283,7 +290,7 @@ vpx_tsvc_encoder_vp9_mode_9() { vpx_tsvc_encoder_vp9_mode_10() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_10" + local 
output_basename="vpx_tsvc_encoder_vp9_mode_10" vpx_tsvc_encoder vp9 "${output_basename}" 10 200 400 600 || return 1 # Mode 10 produces 3 streams files_exist "${output_basename}" 3 || return 1 @@ -292,7 +299,7 @@ vpx_tsvc_encoder_vp9_mode_10() { vpx_tsvc_encoder_vp9_mode_11() { if [ "$(vp9_encode_available)" = "yes" ]; then - local readonly output_basename="vpx_tsvc_encoder_vp9_mode_11" + local output_basename="vpx_tsvc_encoder_vp9_mode_11" vpx_tsvc_encoder vp9 "${output_basename}" 11 200 400 600 || return 1 # Mode 11 produces 3 streams files_exist "${output_basename}" 3 || return 1 diff --git a/libs/libvpx/test/vpxdec.sh b/libs/libvpx/test/vpxdec.sh index de51c8004ed..044aa7e16d0 100755 --- a/libs/libvpx/test/vpxdec.sh +++ b/libs/libvpx/test/vpxdec.sh @@ -18,7 +18,8 @@ vpxdec_verify_environment() { if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \ [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \ - [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then + [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] || \ + [ ! -e "${VP9_RAW_FILE}" ]; then elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH." return 1 fi @@ -33,8 +34,8 @@ vpxdec_verify_environment() { # input file path and shifted away. All remaining parameters are passed through # to vpxdec. vpxdec_pipe() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly input="$1" + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" shift cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull} } @@ -43,8 +44,8 @@ vpxdec_pipe() { # the directory containing vpxdec. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to vpxdec. vpxdec() { - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly input="$1" + local decoder="$(vpx_tool_path vpxdec)" + local input="$1" shift eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull} } @@ -95,9 +96,9 @@ vpxdec_vp9_webm_less_than_50_frames() { # frames in actual webm_read_frame calls. if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly decoder="$(vpx_tool_path vpxdec)" - local readonly expected=10 - local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ + local decoder="$(vpx_tool_path vpxdec)" + local expected=10 + local num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \ "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \ | awk '/^[0-9]+ decoded frames/ { print $1 }') if [ "$num_frames" -ne "$expected" ]; then @@ -107,10 +108,28 @@ vpxdec_vp9_webm_less_than_50_frames() { fi } +# Ensures VP9_RAW_FILE correctly produces 1 frame instead of causing a hang. +vpxdec_vp9_raw_file() { + # Ensure a raw file properly reports eof and doesn't cause a hang. 
+ if [ "$(vpxdec_can_decode_vp9)" = "yes" ]; then + local decoder="$(vpx_tool_path vpxdec)" + local expected=1 + [ -x /usr/bin/timeout ] && local TIMEOUT="/usr/bin/timeout 30s" + local num_frames=$(${TIMEOUT} ${VPX_TEST_PREFIX} "${decoder}" \ + "${VP9_RAW_FILE}" --summary --noblit 2>&1 \ + | awk '/^[0-9]+ decoded frames/ { print $1 }') + if [ -z "$num_frames" ] || [ "$num_frames" -ne "$expected" ]; then + elog "Output frames ($num_frames) != expected ($expected)" + return 1 + fi + fi +} + vpxdec_tests="vpxdec_vp8_ivf vpxdec_vp8_ivf_pipe_input vpxdec_vp9_webm vpxdec_vp9_webm_frame_parallel - vpxdec_vp9_webm_less_than_50_frames" + vpxdec_vp9_webm_less_than_50_frames + vpxdec_vp9_raw_file" run_tests vpxdec_verify_environment "${vpxdec_tests}" diff --git a/libs/libvpx/test/vpxenc.sh b/libs/libvpx/test/vpxenc.sh index 0c160dafc03..f94e2e094a3 100755 --- a/libs/libvpx/test/vpxenc.sh +++ b/libs/libvpx/test/vpxenc.sh @@ -67,7 +67,7 @@ y4m_input_720p() { # Echo default vpxenc real time encoding params. $1 is the codec, which defaults # to vp8 if unspecified. vpxenc_rt_params() { - local readonly codec="${1:-vp8}" + local codec="${1:-vp8}" echo "--codec=${codec} --buf-initial-sz=500 --buf-optimal-sz=600 @@ -104,8 +104,8 @@ vpxenc_passes_param() { # input file path and shifted away. All remaining parameters are passed through # to vpxenc. vpxenc_pipe() { - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="$1" + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" shift cat "${input}" | eval "${VPX_TEST_PREFIX}" "${encoder}" - \ --test-decode=fatal \ @@ -116,8 +116,8 @@ vpxenc_pipe() { # the directory containing vpxenc. $1 one is used as the input file path and # shifted away. All remaining parameters are passed through to vpxenc. vpxenc() { - local readonly encoder="$(vpx_tool_path vpxenc)" - local readonly input="$1" + local encoder="$(vpx_tool_path vpxenc)" + local input="$1" shift eval "${VPX_TEST_PREFIX}" "${encoder}" "${input}" \ --test-decode=fatal \ @@ -126,7 +126,7 @@ vpxenc() { vpxenc_vp8_ivf() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.ivf" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -143,7 +143,7 @@ vpxenc_vp8_ivf() { vpxenc_vp8_webm() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -159,7 +159,7 @@ vpxenc_vp8_webm() { vpxenc_vp8_webm_rt() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8_rt.webm" vpxenc $(yuv_input_hantro_collage) \ $(vpxenc_rt_params vp8) \ --output="${output}" @@ -173,7 +173,7 @@ vpxenc_vp8_webm_rt() { vpxenc_vp8_webm_2pass() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp8.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -190,9 +190,9 @@ vpxenc_vp8_webm_2pass() { vpxenc_vp8_webm_lag10_frames20() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=20 - local readonly lag_frames=10 - 
local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp8_lag10_frames20.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${lag_total_frames}" \ @@ -210,7 +210,7 @@ vpxenc_vp8_webm_lag10_frames20() { vpxenc_vp8_ivf_piped_input() { if [ "$(vpxenc_can_encode_vp8)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" + local output="${VPX_TEST_OUTPUT_DIR}/vp8_piped_input.ivf" vpxenc_pipe $(yuv_input_hantro_collage) \ --codec=vp8 \ --limit="${TEST_FRAMES}" \ @@ -226,8 +226,8 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -245,8 +245,8 @@ vpxenc_vp9_ivf() { vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -263,7 +263,7 @@ vpxenc_vp9_webm() { vpxenc_vp9_webm_rt() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt.webm" vpxenc $(yuv_input_hantro_collage) \ $(vpxenc_rt_params vp9) \ --output="${output}" @@ -278,11 +278,11 @@ vpxenc_vp9_webm_rt() { vpxenc_vp9_webm_rt_multithread_tiled() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" - local readonly tilethread_min=2 - local readonly tilethread_max=4 - local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})" - local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_multithread_tiled.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" for threads in ${num_threads}; do for tile_cols in ${num_tile_cols}; do @@ -291,26 +291,25 @@ vpxenc_vp9_webm_rt_multithread_tiled() { --threads=${threads} \ --tile-columns=${tile_cols} \ --output="${output}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" done done - - if [ ! -e "${output}" ]; then - elog "Output file does not exist." 
- return 1 - fi - - rm "${output}" fi } vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" - local readonly tilethread_min=2 - local readonly tilethread_max=4 - local readonly num_threads="$(seq ${tilethread_min} ${tilethread_max})" - local readonly num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" + local output="${VPX_TEST_OUTPUT_DIR}/vp9_rt_mt_t_fp.webm" + local tilethread_min=2 + local tilethread_max=4 + local num_threads="$(seq ${tilethread_min} ${tilethread_max})" + local num_tile_cols="$(seq ${tilethread_min} ${tilethread_max})" for threads in ${num_threads}; do for tile_cols in ${num_tile_cols}; do @@ -320,22 +319,20 @@ vpxenc_vp9_webm_rt_multithread_tiled_frameparallel() { --tile-columns=${tile_cols} \ --frame-parallel=1 \ --output="${output}" + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + rm "${output}" done done - - if [ ! -e "${output}" ]; then - elog "Output file does not exist." - return 1 - fi - - rm "${output}" fi } vpxenc_vp9_webm_2pass() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -351,8 +348,8 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -370,8 +367,8 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -391,10 +388,10 @@ vpxenc_vp9_ivf_minq0_maxq0() { vpxenc_vp9_webm_lag10_frames20() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly lag_total_frames=20 - local readonly lag_frames=10 - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" - local readonly passes=$(vpxenc_passes_param) + local lag_total_frames=20 + local lag_frames=10 + local output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ @@ -414,8 +411,8 @@ vpxenc_vp9_webm_lag10_frames20() { vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" - local readonly passes=$(vpxenc_passes_param) + local output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ @@ -429,6 +426,42 @@ vpxenc_vp9_webm_non_square_par() { fi } +vpxenc_vp9_webm_sharpness() { + if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then + local sharpnesses="0 1 2 3 4 5 6 
7" + local output="${VPX_TEST_OUTPUT_DIR}/vpxenc_vp9_webm_sharpness.ivf" + local last_size=0 + local this_size=0 + + for sharpness in ${sharpnesses}; do + + vpxenc $(yuv_input_hantro_collage) \ + --sharpness="${sharpness}" \ + --codec=vp9 \ + --limit=1 \ + --cpu-used=2 \ + --end-usage=q \ + --cq-level=40 \ + --output="${output}" \ + "${passes}" + + if [ ! -e "${output}" ]; then + elog "Output file does not exist." + return 1 + fi + + this_size=$(stat -c '%s' "${output}") + if [ "${this_size}" -lt "${last_size}" ]; then + elog "Higher sharpness value yielded lower file size." + echo "${this_size}" " < " "${last_size}" + return 1 + fi + last_size="${this_size}" + + done + fi +} + vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt @@ -441,7 +474,9 @@ vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 - vpxenc_vp9_webm_non_square_par" + vpxenc_vp9_webm_non_square_par + vpxenc_vp9_webm_sharpness" + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then vpxenc_tests="$vpxenc_tests vpxenc_vp8_webm_2pass diff --git a/libs/libvpx/test/webm_video_source.h b/libs/libvpx/test/webm_video_source.h index 09c007a3f3f..6f55f7db7c2 100644 --- a/libs/libvpx/test/webm_video_source.h +++ b/libs/libvpx/test/webm_video_source.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TEST_WEBM_VIDEO_SOURCE_H_ -#define TEST_WEBM_VIDEO_SOURCE_H_ +#ifndef VPX_TEST_WEBM_VIDEO_SOURCE_H_ +#define VPX_TEST_WEBM_VIDEO_SOURCE_H_ #include #include #include @@ -90,4 +90,4 @@ class WebMVideoSource : public CompressedVideoSource { } // namespace libvpx_test -#endif // TEST_WEBM_VIDEO_SOURCE_H_ +#endif // VPX_TEST_WEBM_VIDEO_SOURCE_H_ diff --git a/libs/libvpx/test/y4m_test.cc b/libs/libvpx/test/y4m_test.cc index ced717a7c19..76d033d52a7 100644 --- a/libs/libvpx/test/y4m_test.cc +++ b/libs/libvpx/test/y4m_test.cc @@ -40,18 +40,18 @@ const Y4mTestParam kY4mTestVectors[] = { "284a47a47133b12884ec3a14e959a0b6" }, { "park_joy_90p_8_444.y4m", 8, VPX_IMG_FMT_I444, "90517ff33843d85de712fd4fe60dbed0" }, - { "park_joy_90p_10_420.y4m", 10, VPX_IMG_FMT_I42016, - "63f21f9f717d8b8631bd2288ee87137b" }, - { "park_joy_90p_10_422.y4m", 10, VPX_IMG_FMT_I42216, - "48ab51fb540aed07f7ff5af130c9b605" }, - { "park_joy_90p_10_444.y4m", 10, VPX_IMG_FMT_I44416, - "067bfd75aa85ff9bae91fa3e0edd1e3e" }, - { "park_joy_90p_12_420.y4m", 12, VPX_IMG_FMT_I42016, - "9e6d8f6508c6e55625f6b697bc461cef" }, - { "park_joy_90p_12_422.y4m", 12, VPX_IMG_FMT_I42216, - "b239c6b301c0b835485be349ca83a7e3" }, - { "park_joy_90p_12_444.y4m", 12, VPX_IMG_FMT_I44416, - "5a6481a550821dab6d0192f5c63845e9" }, + { "park_joy_90p_10_420_20f.y4m", 10, VPX_IMG_FMT_I42016, + "2f56ab9809269f074df7e3daf1ce0be6" }, + { "park_joy_90p_10_422_20f.y4m", 10, VPX_IMG_FMT_I42216, + "1b5c73d2e8e8c4e02dc4889ecac41c83" }, + { "park_joy_90p_10_444_20f.y4m", 10, VPX_IMG_FMT_I44416, + "ec4ab5be53195c5b838d1d19e1bc2674" }, + { "park_joy_90p_12_420_20f.y4m", 12, VPX_IMG_FMT_I42016, + "3370856c8ddebbd1f9bb2e66f97677f4" }, + { "park_joy_90p_12_422_20f.y4m", 12, VPX_IMG_FMT_I42216, + "4eab364318dd8201acbb182e43bd4966" }, + { "park_joy_90p_12_444_20f.y4m", 12, VPX_IMG_FMT_I44416, + "f189dfbbd92119fc8e5f211a550166be" }, }; static void write_image_file(const vpx_image_t *img, FILE *file) { diff --git a/libs/libvpx/test/y4m_video_source.h b/libs/libvpx/test/y4m_video_source.h index 1301f69703b..89aa2a44fc9 100644 --- 
a/libs/libvpx/test/y4m_video_source.h
+++ b/libs/libvpx/test/y4m_video_source.h
@@ -7,9 +7,10 @@
  * in the file PATENTS.  All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_Y4M_VIDEO_SOURCE_H_
-#define TEST_Y4M_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_Y4M_VIDEO_SOURCE_H_
+#define VPX_TEST_Y4M_VIDEO_SOURCE_H_
 #include <algorithm>
+#include <memory>
 #include <string>
 
 #include "test/video_source.h"
@@ -108,7 +109,7 @@ class Y4mVideoSource : public VideoSource {
   std::string file_name_;
   FILE *input_file_;
-  testing::internal::scoped_ptr<vpx_image_t> img_;
+  std::unique_ptr<vpx_image_t> img_;
   unsigned int start_;
   unsigned int limit_;
   unsigned int frame_;
@@ -119,4 +120,4 @@
 }  // namespace libvpx_test
-#endif  // TEST_Y4M_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_Y4M_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/test/yuv_temporal_filter_test.cc b/libs/libvpx/test/yuv_temporal_filter_test.cc new file mode 100644 index 00000000000..8f3c58b0383
--- /dev/null
+++ b/libs/libvpx/test/yuv_temporal_filter_test.cc
@@ -0,0 +1,708 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp9_rtcd.h"
+#include "test/acm_random.h"
+#include "test/buffer.h"
+#include "test/register_state_check.h"
+#include "vpx_ports/vpx_timer.h"
+
+namespace {
+
+using ::libvpx_test::ACMRandom;
+using ::libvpx_test::Buffer;
+
+typedef void (*YUVTemporalFilterFunc)(
+    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
+    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
+    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
+    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
+    int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32,
+    uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator,
+    uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
+struct TemporalFilterWithBd {
+  TemporalFilterWithBd(YUVTemporalFilterFunc func, int bitdepth)
+      : temporal_filter(func), bd(bitdepth) {}
+
+  YUVTemporalFilterFunc temporal_filter;
+  int bd;
+};
+
+std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
+  return os << "Bitdepth: " << tf.bd;
+}
+
+int GetFilterWeight(unsigned int row, unsigned int col,
+                    unsigned int block_height, unsigned int block_width,
+                    const int *const blk_fw, int use_32x32) {
+  if (use_32x32) {
+    return blk_fw[0];
+  }
+
+  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
+}
+
+template <typename PixelType>
+int GetModIndex(int sum_dist, int index, int rounding, int strength,
+                int filter_weight) {
+  int mod = sum_dist * 3 / index;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <>
+int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
+                         int filter_weight) {
+  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
+                                  39322, 32768, 28087, 24576, 21846,
+                                  19661, 17874, 0,     15124 };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
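The index_mult table in the 8-bit specialization above is a Q16 fixed-point reciprocal: entry i holds ceil((3 << 16) / i), so (sum_dist * index_mult[i]) >> 16 reproduces sum_dist * 3 / i without a division, and clamping sum_dist to UINT16_MAX keeps the product within 32 bits. The zero entries mark neighborhood sizes that can never occur. A standalone check of that relationship (an editorial sketch, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Reproduce the non-zero entries of index_mult[]: ceil((3 << 16) / i).
      for (int i = 4; i <= 13; ++i) {
        if (i == 12) continue;  // a 12-sample neighborhood never occurs; entry stays 0
        const uint32_t q16 = ((3u << 16) + i - 1) / i;
        std::printf("index_mult[%2d] = %u\n", i, q16);  // 49152, 39322, 32768, ...
      }
      return 0;
    }

The 64-bit table in the high-bitdepth specialization that follows is built the same way at Q32, i.e. ceil((3 << 32) / i), which is why it needs the wider multiply and the >> 32.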
+
+template <>
+int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
+                          int filter_weight) {
+  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
+                             3221225472U, 2576980378U, 2147483648U, 1840700270U,
+                             1610612736U, 1431655766U, 1288490189U, 1171354718U,
+                             0U,          991146300U };
+
+  assert(index >= 0 && index <= 13);
+  assert(index_mult[index] != 0);
+
+  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
+  mod += rounding;
+  mod >>= strength;
+
+  mod = VPXMIN(16, mod);
+
+  mod = 16 - mod;
+  mod *= filter_weight;
+
+  return mod;
+}
+
+template <typename PixelType>
+void ApplyReferenceFilter(
+    const Buffer<PixelType> &y_src, const Buffer<PixelType> &y_pre,
+    const Buffer<PixelType> &u_src, const Buffer<PixelType> &v_src,
+    const Buffer<PixelType> &u_pre, const Buffer<PixelType> &v_pre,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *const blk_fw, int use_32x32,
+    Buffer<uint32_t> *y_accumulator, Buffer<uint16_t> *y_counter,
+    Buffer<uint32_t> *u_accumulator, Buffer<uint16_t> *u_counter,
+    Buffer<uint32_t> *v_accumulator, Buffer<uint16_t> *v_counter) {
+  const PixelType *y_src_ptr = y_src.TopLeftPixel();
+  const PixelType *y_pre_ptr = y_pre.TopLeftPixel();
+  const PixelType *u_src_ptr = u_src.TopLeftPixel();
+  const PixelType *u_pre_ptr = u_pre.TopLeftPixel();
+  const PixelType *v_src_ptr = v_src.TopLeftPixel();
+  const PixelType *v_pre_ptr = v_pre.TopLeftPixel();
+
+  const int uv_block_width = block_width >> ss_x,
+            uv_block_height = block_height >> ss_y;
+  const int y_src_stride = y_src.stride(), y_pre_stride = y_pre.stride();
+  const int uv_src_stride = u_src.stride(), uv_pre_stride = u_pre.stride();
+  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
+
+  Buffer<int> y_dif = Buffer<int>(block_width, block_height, 0);
+  Buffer<int> u_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+  Buffer<int> v_dif = Buffer<int>(uv_block_width, uv_block_height, 0);
+
+  ASSERT_TRUE(y_dif.Init());
+  ASSERT_TRUE(u_dif.Init());
+  ASSERT_TRUE(v_dif.Init());
+  y_dif.Set(0);
+  u_dif.Set(0);
+  v_dif.Set(0);
+
+  int *y_diff_ptr = y_dif.TopLeftPixel();
+  int *u_diff_ptr = u_dif.TopLeftPixel();
+  int *v_diff_ptr = v_dif.TopLeftPixel();
+
+  uint32_t *y_accum = y_accumulator->TopLeftPixel();
+  uint32_t *u_accum = u_accumulator->TopLeftPixel();
+  uint32_t *v_accum = v_accumulator->TopLeftPixel();
+  uint16_t *y_count = y_counter->TopLeftPixel();
+  uint16_t *u_count = u_counter->TopLeftPixel();
+  uint16_t *v_count = v_counter->TopLeftPixel();
+
+  const int y_accum_stride = y_accumulator->stride();
+  const int u_accum_stride = u_accumulator->stride();
+  const int v_accum_stride = v_accumulator->stride();
+  const int y_count_stride = y_counter->stride();
+  const int u_count_stride = u_counter->stride();
+  const int v_count_stride = v_counter->stride();
+
+  const int rounding = (1 << strength) >> 1;
+
+  // Get the square diffs
+  for (int row = 0; row < static_cast<int>(block_height); row++) {
+    for (int col = 0; col < static_cast<int>(block_width); col++) {
+      const int diff = y_src_ptr[row * y_src_stride + col] -
+                       y_pre_ptr[row * y_pre_stride + col];
+      y_diff_ptr[row * y_diff_stride + col] = diff * diff;
+    }
+  }
+
+  for (int row = 0; row < uv_block_height; row++) {
+    for (int col = 0; col < uv_block_width; col++) {
+      const int u_diff = u_src_ptr[row * uv_src_stride + col] -
+                         u_pre_ptr[row * uv_pre_stride + col];
+      const int v_diff = v_src_ptr[row * uv_src_stride + col] -
+                         v_pre_ptr[row * uv_pre_stride + col];
+      u_diff_ptr[row * uv_diff_stride + col] = u_diff * u_diff;
+      v_diff_ptr[row * uv_diff_stride + col] = v_diff * v_diff;
+    }
+  }
+
+  // Apply the filter to luma
+  for (int row = 0; row < static_cast<int>(block_height); row++) {
+    for (int col = 0; col < static_cast<int>(block_width); col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
+      const int filter_weight = GetFilterWeight(row, col, block_height,
+                                                block_width, blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre_ptr[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < static_cast<int>(block_height) &&
+              sub_col >= 0 && sub_col < static_cast<int>(block_width)) {
+            y_mod += y_diff_ptr[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
+          }
+        }
+      }
+
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_diff_ptr[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_ptr[uv_row * uv_diff_stride + uv_col];
+
+      y_num_used += 2;
+
+      // Set the modifier
+      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      y_count[row * y_count_stride + col] += y_mod;
+      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
+    }
+  }
+
+  // Apply the filter to chroma
+  for (int uv_row = 0; uv_row < uv_block_height; uv_row++) {
+    for (int uv_col = 0; uv_col < uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = GetFilterWeight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre_ptr[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre_ptr[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
+      for (int row_step = -1; row_step <= 1; row_step++) {
+        for (int col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_ptr[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
+
+      // Sum all the luma pixels associated with the current luma pixel
+      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_ptr[sub_row * y_diff_stride + sub_col];
+
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
+
+      // Set the modifier
+      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
+                                     filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * u_count_stride + uv_col] += u_mod;
+      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
+      v_count[uv_row * v_count_stride + uv_col] += v_mod;
+      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
+    }
+  }
+}
+
+class YUVTemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithBd> {
+ public:
+  virtual void SetUp() {
+    filter_func_ = GetParam().temporal_filter;
+    bd_ = GetParam().bd;
+    use_highbd_ = (bd_ != 8);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+    saturate_test_ = 0;
+    num_repeats_ = 10;
+
+    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
+  }
+
+ protected:
+  template <typename PixelType>
+  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
+                            int filter_strength, int use_32x32,
+                            const int *filter_weight);
+  template <typename PixelType>
+  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
+                              int filter_strength, int use_32x32,
+                              const int *filter_weight);
+  YUVTemporalFilterFunc filter_func_;
+  ACMRandom rnd_;
+  int saturate_test_;
+  int num_repeats_;
+  int use_highbd_;
+  int bd_;
+};
+
+template <typename PixelType>
+void YUVTemporalFilterTest::CompareTestWithParam(int width, int height,
+                                                 int ss_x, int ss_y,
+                                                 int filter_strength,
+                                                 int use_32x32,
+                                                 const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count_ref = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_ref = Buffer<uint32_t>(width, height, 0);
+  Buffer<uint16_t> y_count_tst = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum_tst = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_ref = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_ref = Buffer<uint32_t>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count_tst = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum_tst = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count_ref.Init());
+  ASSERT_TRUE(y_accum_ref.Init());
+  ASSERT_TRUE(y_count_tst.Init());
+  ASSERT_TRUE(y_accum_tst.Init());
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count_ref.Init());
+  ASSERT_TRUE(u_accum_ref.Init());
+  ASSERT_TRUE(u_count_tst.Init());
+  ASSERT_TRUE(u_accum_tst.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count_ref.Init());
+  ASSERT_TRUE(v_accum_ref.Init());
+  ASSERT_TRUE(v_count_tst.Init());
+  ASSERT_TRUE(v_accum_tst.Init());
+
+  y_accum_ref.Set(0);
+  y_accum_tst.Set(0);
+  y_count_ref.Set(0);
+  y_count_tst.Set(0);
+  u_accum_ref.Set(0);
+  u_accum_tst.Set(0);
+  u_count_ref.Set(0);
+  u_count_tst.Set(0);
+  v_accum_ref.Set(0);
+  v_accum_tst.Set(0);
+  v_count_ref.Set(0);
+  v_count_tst.Set(0);
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    if (saturate_test_) {
+      const int max_val = (1 << bd_) - 1;
+      y_src.Set(max_val);
+      y_pre.Set(0);
+      u_src.Set(max_val);
+      u_pre.Set(0);
+      v_src.Set(max_val);
+      v_pre.Set(0);
+    } else {
+      y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+      v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+    }
+
+    ApplyReferenceFilter(
+        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
+        filter_strength, filter_weight, use_32x32, &y_accum_ref, &y_count_ref,
+        &u_accum_ref, &u_count_ref, &v_accum_ref, &v_count_ref);
+
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum_tst.TopLeftPixel(), y_count_tst.TopLeftPixel(),
+        u_accum_tst.TopLeftPixel(), u_count_tst.TopLeftPixel(),
+        v_accum_tst.TopLeftPixel(), v_count_tst.TopLeftPixel()));
+
+    EXPECT_TRUE(y_accum_tst.CheckValues(y_accum_ref));
+    EXPECT_TRUE(y_count_tst.CheckValues(y_count_ref));
+    EXPECT_TRUE(u_accum_tst.CheckValues(u_accum_ref));
+    EXPECT_TRUE(u_count_tst.CheckValues(u_count_ref));
+    EXPECT_TRUE(v_accum_tst.CheckValues(v_accum_ref));
+    EXPECT_TRUE(v_count_tst.CheckValues(v_count_ref));
+
+    if (HasFailure()) {
+      if (use_32x32) {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
+               filter_strength, *filter_weight);
+      } else {
+        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
+               ss_y, filter_strength, filter_weight[0], filter_weight[1],
+               filter_weight[2], filter_weight[3]);
+      }
+      y_accum_tst.PrintDifference(y_accum_ref);
+      y_count_tst.PrintDifference(y_count_ref);
+      u_accum_tst.PrintDifference(u_accum_ref);
+      u_count_tst.PrintDifference(u_count_ref);
+      v_accum_tst.PrintDifference(v_accum_ref);
+      v_count_tst.PrintDifference(v_count_ref);
+
+      return;
+    }
+  }
+}
+
+template <typename PixelType>
+void YUVTemporalFilterTest::RunTestFilterWithParam(int width, int height,
+                                                   int ss_x, int ss_y,
+                                                   int filter_strength,
+                                                   int use_32x32,
+                                                   const int *filter_weight) {
+  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
+
+  Buffer<PixelType> y_src = Buffer<PixelType>(width, height, 0);
+  Buffer<PixelType> y_pre = Buffer<PixelType>(width, height, 0);
+  Buffer<uint16_t> y_count = Buffer<uint16_t>(width, height, 0);
+  Buffer<uint32_t> y_accum = Buffer<uint32_t>(width, height, 0);
+
+  Buffer<PixelType> u_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> u_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> u_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> u_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  Buffer<PixelType> v_src = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<PixelType> v_pre = Buffer<PixelType>(uv_width, uv_height, 0);
+  Buffer<uint16_t> v_count = Buffer<uint16_t>(uv_width, uv_height, 0);
+  Buffer<uint32_t> v_accum = Buffer<uint32_t>(uv_width, uv_height, 0);
+
+  ASSERT_TRUE(y_src.Init());
+  ASSERT_TRUE(y_pre.Init());
+  ASSERT_TRUE(y_count.Init());
+  ASSERT_TRUE(y_accum.Init());
+
+  ASSERT_TRUE(u_src.Init());
+  ASSERT_TRUE(u_pre.Init());
+  ASSERT_TRUE(u_count.Init());
+  ASSERT_TRUE(u_accum.Init());
+
+  ASSERT_TRUE(v_src.Init());
+  ASSERT_TRUE(v_pre.Init());
+  ASSERT_TRUE(v_count.Init());
+  ASSERT_TRUE(v_accum.Init());
+
+  y_accum.Set(0);
+  y_count.Set(0);
+
+  u_accum.Set(0);
+  u_count.Set(0);
+
+  v_accum.Set(0);
+  v_count.Set(0);
+
+  y_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  y_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  u_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_src.Set(&rnd_, 0, 7 << (bd_ - 8));
+  v_pre.Set(&rnd_, 0, 7 << (bd_ - 8));
+
+  for (int repeats = 0; repeats < num_repeats_; repeats++) {
+    ASM_REGISTER_STATE_CHECK(filter_func_(
+        reinterpret_cast<const uint8_t *>(y_src.TopLeftPixel()), y_src.stride(),
+        reinterpret_cast<const uint8_t *>(y_pre.TopLeftPixel()), y_pre.stride(),
+        reinterpret_cast<const uint8_t *>(u_src.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_src.TopLeftPixel()), u_src.stride(),
+        reinterpret_cast<const uint8_t *>(u_pre.TopLeftPixel()),
+        reinterpret_cast<const uint8_t *>(v_pre.TopLeftPixel()), u_pre.stride(),
+        width, height, ss_x, ss_y, filter_strength, filter_weight, use_32x32,
+        y_accum.TopLeftPixel(), y_count.TopLeftPixel(), u_accum.TopLeftPixel(),
+        u_count.TopLeftPixel(), v_accum.TopLeftPixel(),
+        v_count.TopLeftPixel()));
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use32x32) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           &filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          &filter_weight);
+          }
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, Use16x16) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 0;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
+        // Set up the filter
+        int filter_weight[4];
+        int filter_idx_cp = filter_idx;
+        for (int idx = 0; idx < 4; idx++) {
+          filter_weight[idx] = filter_idx_cp % 3;
+          filter_idx_cp /= 3;
+        }
+
+        // Test each parameter
+        for (int filter_strength = 0; filter_strength <= 6;
+             filter_strength += 2) {
+          if (use_highbd_) {
+            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                           adjusted_strength, use_32x32,
+                                           filter_weight);
+          } else {
+            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                          filter_strength, use_32x32,
+                                          filter_weight);
+          }
+
+          ASSERT_FALSE(HasFailure());
+        }
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, SaturationTest) {
+  const int width = 32, height = 32;
+  const int use_32x32 = 1;
+  const int filter_weight = 1;
+  saturate_test_ = 1;
+
+  for (int ss_x = 0; ss_x <= 1; ss_x++) {
+    for (int ss_y = 0; ss_y <= 1; ss_y++) {
+      for (int filter_strength = 0; filter_strength <= 6;
+           filter_strength += 2) {
+        if (use_highbd_) {
+          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
+          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                         adjusted_strength, use_32x32,
+                                         &filter_weight);
+        } else {
+          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                        filter_strength, use_32x32,
+                                        &filter_weight);
+        }
+
+        ASSERT_FALSE(HasFailure());
+      }
+    }
+  }
+}
+
+TEST_P(YUVTemporalFilterTest, DISABLED_Speed) {
+  const int width = 32, height = 32;
+  num_repeats_ = 1000;
+
+  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
+    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
+    for (int ss_x = 0; ss_x <= 1; ss_x++) {
+      for (int ss_y = 0; ss_y <= 1; ss_y++) {
+        for (int filter_idx = 0; filter_idx < num_filter_weights;
+             filter_idx++) {
+          // Set up the filter
+          int filter_weight[4];
+          int filter_idx_cp = filter_idx;
+          for (int idx = 0; idx < 4; idx++) {
+            filter_weight[idx] = filter_idx_cp % 3;
+            filter_idx_cp /= 3;
+          }
+
+          // Test each parameter
+          for (int filter_strength = 0; filter_strength <= 6;
+               filter_strength += 2) {
+            vpx_usec_timer timer;
+            vpx_usec_timer_start(&timer);
+
+            if (use_highbd_) {
+              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
+                                               filter_strength, use_32x32,
+                                               filter_weight);
+            } else {
+              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
+                                              filter_strength, use_32x32,
+                                              filter_weight);
+            }
+
+            vpx_usec_timer_mark(&timer);
+            const int elapsed_time =
+                static_cast<int>(vpx_usec_timer_elapsed(&timer));
+
+            printf(
+                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
+                "%d, Strength: %d, Time: %5d\n",
+                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
+                elapsed_time);
+          }
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#define WRAP_HIGHBD_FUNC(func, bd)                                           \
+  void wrap_##func##_##bd(                                                   \
+      const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,          \
+      int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,          \
+      int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,         \
+      int uv_pre_stride, unsigned int block_width, unsigned int block_height, \
+      int ss_x, int ss_y, int strength, const int *const blk_fw,             \
+      int use_32x32, uint32_t *y_accumulator, uint16_t *y_count,             \
+      uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator,   \
+      uint16_t *v_count) {                                                   \
+    func(reinterpret_cast<const uint16_t *>(y_src), y_src_stride,            \
+         reinterpret_cast<const uint16_t *>(y_pre), y_pre_stride,            \
+         reinterpret_cast<const uint16_t *>(u_src),                          \
+         reinterpret_cast<const uint16_t *>(v_src), uv_src_stride,           \
+         reinterpret_cast<const uint16_t *>(u_pre),                          \
+         reinterpret_cast<const uint16_t *>(v_pre), uv_pre_stride,           \
+         block_width, block_height, ss_x, ss_y, strength, blk_fw, use_32x32, \
+         y_accumulator, y_count, u_accumulator, u_count, v_accumulator,      \
+         v_count);                                                           \
+  }
+
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12);
+
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12)));
+#if HAVE_SSE4_1
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12);
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, YUVTemporalFilterTest,
+    ::testing::Values(
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_10,
+                             10),
+        TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12,
+                             12)));
+#endif  // HAVE_SSE4_1
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, YUVTemporalFilterTest,
+    ::testing::Values(TemporalFilterWithBd(&vp9_apply_temporal_filter_c, 8)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(SSE4_1, YUVTemporalFilterTest,
+                        ::testing::Values(TemporalFilterWithBd(
+                            &vp9_apply_temporal_filter_sse4_1, 8)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
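Condensed, the reference implementation in this new test computes one small formula per pixel: take the summed squared difference SSE over the N samples in the pixel's 3x3 neighborhood (plus the co-located samples from the other planes), then blend with weight 16 - min(16, (SSE * 3 / N + r) >> strength), where r = (1 << strength) >> 1, scaled by the per-block filter weight. A compact restatement (an editorial sketch mirroring GetModIndex and ApplyReferenceFilter above, not part of the patch):

    #include <algorithm>

    // Per-pixel temporal-filter weight as computed by the reference code above.
    int TemporalFilterWeight(int sum_sq_diff, int num_samples, int strength,
                             int block_weight /* 0, 1 or 2 */) {
      const int rounding = (1 << strength) >> 1;
      int mod = (sum_sq_diff * 3 / num_samples + rounding) >> strength;
      mod = std::min(16, mod);           // very dissimilar pixels get weight 0
      return (16 - mod) * block_weight;  // identical pixels get 16 * block_weight
    }

The test accumulates count += weight and weight * pixel into the accumulator planes, so a filtered pixel can later be recovered as accumulator / count; the SaturationTest above drives the squared differences to their maximum precisely to pin mod at 16 and the resulting weight at zero.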
diff --git a/libs/libvpx/test/yuv_video_source.h b/libs/libvpx/test/yuv_video_source.h index aee6b2ffbb8..020ce801d93 100644
--- a/libs/libvpx/test/yuv_video_source.h
+++ b/libs/libvpx/test/yuv_video_source.h
@@ -7,8 +7,8 @@
  * in the file PATENTS.  All contributing project authors may
  * be found in the AUTHORS file in the root of the source tree.
  */
-#ifndef TEST_YUV_VIDEO_SOURCE_H_
-#define TEST_YUV_VIDEO_SOURCE_H_
+#ifndef VPX_TEST_YUV_VIDEO_SOURCE_H_
+#define VPX_TEST_YUV_VIDEO_SOURCE_H_
 #include <cstdio>
 #include <string>
@@ -122,4 +122,4 @@ class YUVVideoSource : public VideoSource {
 }  // namespace libvpx_test
-#endif  // TEST_YUV_VIDEO_SOURCE_H_
+#endif  // VPX_TEST_YUV_VIDEO_SOURCE_H_
diff --git a/libs/libvpx/third_party/googletest/README.libvpx b/libs/libvpx/third_party/googletest/README.libvpx index 2cd6910b410..49005ddac98 100644
--- a/libs/libvpx/third_party/googletest/README.libvpx
+++ b/libs/libvpx/third_party/googletest/README.libvpx
@@ -1,5 +1,5 @@
-URL: https://github.com/google/googletest
-Version: 1.8.0
+URL: https://github.com/google/googletest.git
+Version: release-1.8.1
 License: BSD
 License File: LICENSE
@@ -13,12 +13,16 @@ generation.
 Local Modifications:
 - Remove everything but:
-  googletest-release-1.8.0/googletest/
+  googletest-release-1.8.1/googletest/
    CHANGES
    CONTRIBUTORS
    include
    LICENSE
    README.md
    src
-- Suppress unsigned overflow instrumentation in the LCG
-  https://github.com/google/googletest/pull/1066
+
+- Make WithParamInterface<T>::GetParam static in order to avoid
+  initialization issues
+  https://github.com/google/googletest/pull/1830
+- Use wcslen() instead of std::wcslen()
+  https://github.com/google/googletest/pull/1899
diff --git a/libs/libvpx/third_party/googletest/src/README.md b/libs/libvpx/third_party/googletest/src/README.md index edd4408054b..e30fe804712 100644
--- a/libs/libvpx/third_party/googletest/src/README.md
+++ b/libs/libvpx/third_party/googletest/src/README.md
@@ -1,23 +1,21 @@
+### Generic Build Instructions
-### Generic Build Instructions ###
+#### Setup
-#### Setup ####
+To build Google Test and your tests that use it, you need to tell your build
+system where to find its headers and source files. The exact way to do it
+depends on which build system you use, and is usually straightforward.
-To build Google Test and your tests that use it, you need to tell your
-build system where to find its headers and source files. The exact
-way to do it depends on which build system you use, and is usually
-straightforward.
+#### Build
-#### Build ####
-
-Suppose you put Google Test in directory `${GTEST_DIR}`. To build it,
-create a library build target (or a project as called by Visual Studio
-and Xcode) to compile
+Suppose you put Google Test in directory `${GTEST_DIR}`. To build it, create a
+library build target (or a project as called by Visual Studio and Xcode) to
+compile
     ${GTEST_DIR}/src/gtest-all.cc
 with `${GTEST_DIR}/include` in the system header search path and `${GTEST_DIR}`
-in the normal header search path. Assuming a Linux-like system and gcc,
+in the normal header search path. Assuming a Linux-like system and gcc,
 something like the following will do:
     g++ -isystem ${GTEST_DIR}/include -I${GTEST_DIR} \
@@ -26,136 +24,239 @@ something like the following will do:
 (We need `-pthread` as Google Test uses threads.)
-Next, you should compile your test source file with
-`${GTEST_DIR}/include` in the system header search path, and link it
-with gtest and any other necessary libraries:
+Next, you should compile your test source file with `${GTEST_DIR}/include` in
+the system header search path, and link it with gtest and any other necessary
+libraries:
     g++ -isystem ${GTEST_DIR}/include -pthread path/to/your_test.cc libgtest.a \
         -o your_test
-As an example, the make/ directory contains a Makefile that you can
-use to build Google Test on systems where GNU make is available
-(e.g. Linux, Mac OS X, and Cygwin). It doesn't try to build Google
-Test's own tests. Instead, it just builds the Google Test library and
-a sample test. You can use it as a starting point for your own build
-script.
+As an example, the make/ directory contains a Makefile that you can use to build
+Google Test on systems where GNU make is available (e.g. Linux, Mac OS X, and
+Cygwin). It doesn't try to build Google Test's own tests. Instead, it just
+builds the Google Test library and a sample test. You can use it as a starting
+point for your own build script.
-If the default settings are correct for your environment, the
-following commands should succeed:
+If the default settings are correct for your environment, the following commands
+should succeed:
     cd ${GTEST_DIR}/make
    make
    ./sample1_unittest
-If you see errors, try to tweak the contents of `make/Makefile` to make
-them go away. There are instructions in `make/Makefile` on how to do
-it.
+If you see errors, try to tweak the contents of `make/Makefile` to make them go
+away. There are instructions in `make/Makefile` on how to do it.
-### Using CMake ###
+### Using CMake
 Google Test comes with a CMake build script (
-[CMakeLists.txt](CMakeLists.txt)) that can be used on a wide range of platforms ("C" stands for
-cross-platform.). If you don't have CMake installed already, you can
-download it for free from <http://www.cmake.org/>.
+[CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
+that can be used on a wide range of platforms ("C" stands for cross-platform.).
+If you don't have CMake installed already, you can download it for free from
+<https://cmake.org/>.
+
+CMake works by generating native makefiles or build projects that can be used in
+the compiler environment of your choice. You can either build Google Test as a
+standalone project or it can be incorporated into an existing CMake build for
+another project.
-CMake works by generating native makefiles or build projects that can
-be used in the compiler environment of your choice. The typical
-workflow starts with:
+#### Standalone CMake Project
+
+When building Google Test as a standalone project, the typical workflow starts
+with:
     mkdir mybuild       # Create a directory to hold the build output.
    cd mybuild
    cmake ${GTEST_DIR}  # Generate native build scripts.
-If you want to build Google Test's samples, you should replace the
-last command with
+If you want to build Google Test's samples, you should replace the last command
+with
     cmake -Dgtest_build_samples=ON ${GTEST_DIR}
-If you are on a \*nix system, you should now see a Makefile in the
-current directory. Just type 'make' to build gtest.
+If you are on a \*nix system, you should now see a Makefile in the current
+directory. Just type 'make' to build gtest.
-If you use Windows and have Visual Studio installed, a `gtest.sln` file
-and several `.vcproj` files will be created. You can then build them
-using Visual Studio.
+If you use Windows and have Visual Studio installed, a `gtest.sln` file and +several `.vcproj` files will be created. You can then build them using Visual +Studio. On Mac OS X with Xcode installed, a `.xcodeproj` file will be generated. -### Legacy Build Scripts ### +#### Incorporating Into An Existing CMake Project + +If you want to use gtest in a project which already uses CMake, then a more +robust and flexible approach is to build gtest as part of that project directly. +This is done by making the GoogleTest source code available to the main build +and adding it using CMake's `add_subdirectory()` command. This has the +significant advantage that the same compiler and linker settings are used +between gtest and the rest of your project, so issues associated with using +incompatible libraries (eg debug/release), etc. are avoided. This is +particularly useful on Windows. Making GoogleTest's source code available to the +main build can be done a few different ways: + +* Download the GoogleTest source code manually and place it at a known + location. This is the least flexible approach and can make it more difficult + to use with continuous integration systems, etc. +* Embed the GoogleTest source code as a direct copy in the main project's + source tree. This is often the simplest approach, but is also the hardest to + keep up to date. Some organizations may not permit this method. +* Add GoogleTest as a git submodule or equivalent. This may not always be + possible or appropriate. Git submodules, for example, have their own set of + advantages and drawbacks. +* Use CMake to download GoogleTest as part of the build's configure step. This + is just a little more complex, but doesn't have the limitations of the other + methods. + +The last of the above methods is implemented with a small piece of CMake code in +a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and +then invoked as a sub-build _during the CMake stage_. That directory is then +pulled into the main build with `add_subdirectory()`. For example: + +New file `CMakeLists.txt.in`: + + cmake_minimum_required(VERSION 2.8.2) + + project(googletest-download NONE) + + include(ExternalProject) + ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + +Existing build's `CMakeLists.txt`: + + # Download and unpack googletest at configure time + configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() + + # Prevent overriding the parent project's compiler/linker + # settings on Windows + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + # Add googletest directly to our build. This defines + # the gtest and gtest_main targets. 
+ add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + + # The gtest/gtest_main targets carry header search path + # dependencies automatically when using CMake 2.8.11 or + # later. Otherwise we have to add them here ourselves. + if (CMAKE_VERSION VERSION_LESS 2.8.11) + include_directories("${gtest_SOURCE_DIR}/include") + endif() + + # Now simply link against gtest or gtest_main as needed. Eg + add_executable(example example.cpp) + target_link_libraries(example gtest_main) + add_test(NAME example_test COMMAND example) + +Note that this approach requires CMake 2.8.2 or later due to its use of the +`ExternalProject_Add()` command. The above technique is discussed in more detail +in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which +also contains a link to a fully generalized implementation of the technique. + +##### Visual Studio Dynamic vs Static Runtimes + +By default, new Visual Studio projects link the C runtimes dynamically but +Google Test links them statically. This will generate an error that looks +something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch +detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value +'MDd_DynamicDebug' in main.obj + +Google Test already has a CMake option for this: `gtest_force_shared_crt` + +Enabling this option will make gtest link the runtimes dynamically too, and +match the project in which it is included. + +### Legacy Build Scripts Before settling on CMake, we have been providing hand-maintained build -projects/scripts for Visual Studio, Xcode, and Autotools. While we -continue to provide them for convenience, they are not actively -maintained any more. We highly recommend that you follow the -instructions in the previous two sections to integrate Google Test -with your existing build system. +projects/scripts for Visual Studio, Xcode, and Autotools. While we continue to +provide them for convenience, they are not actively maintained any more. We +highly recommend that you follow the instructions in the above sections to +integrate Google Test with your existing build system. If you still need to use the legacy build scripts, here's how: -The msvc\ folder contains two solutions with Visual C++ projects. -Open the `gtest.sln` or `gtest-md.sln` file using Visual Studio, and you -are ready to build Google Test the same way you build any Visual -Studio project. Files that have names ending with -md use DLL -versions of Microsoft runtime libraries (the /MD or the /MDd compiler -option). Files without that suffix use static versions of the runtime -libraries (the /MT or the /MTd option). Please note that one must use -the same option to compile both gtest and the test code. If you use -Visual Studio 2005 or above, we recommend the -md version as /MD is -the default for new projects in these versions of Visual Studio. - -On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using -Xcode. Build the "gtest" target. The universal binary framework will -end up in your selected build directory (selected in the Xcode -"Preferences..." -> "Building" pane and defaults to xcode/build). -Alternatively, at the command line, enter: +The msvc\ folder contains two solutions with Visual C++ projects. Open the +`gtest.sln` or `gtest-md.sln` file using Visual Studio, and you are ready to +build Google Test the same way you build any Visual Studio project. 
Files that +have names ending with -md use DLL versions of Microsoft runtime libraries (the +/MD or the /MDd compiler option). Files without that suffix use static versions +of the runtime libraries (the /MT or the /MTd option). Please note that one must +use the same option to compile both gtest and the test code. If you use Visual +Studio 2005 or above, we recommend the -md version as /MD is the default for new +projects in these versions of Visual Studio. + +On Mac OS X, open the `gtest.xcodeproj` in the `xcode/` folder using Xcode. +Build the "gtest" target. The universal binary framework will end up in your +selected build directory (selected in the Xcode "Preferences..." -> "Building" +pane and defaults to xcode/build). Alternatively, at the command line, enter: xcodebuild -This will build the "Release" configuration of gtest.framework in your -default build location. See the "xcodebuild" man page for more -information about building different configurations and building in -different locations. +This will build the "Release" configuration of gtest.framework in your default +build location. See the "xcodebuild" man page for more information about +building different configurations and building in different locations. -If you wish to use the Google Test Xcode project with Xcode 4.x and -above, you need to either: +If you wish to use the Google Test Xcode project with Xcode 4.x and above, you +need to either: - * update the SDK configuration options in xcode/Config/General.xconfig. - Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If - you choose this route you lose the ability to target earlier versions - of MacOS X. - * Install an SDK for an earlier version. This doesn't appear to be - supported by Apple, but has been reported to work - (http://stackoverflow.com/questions/5378518). +* update the SDK configuration options in xcode/Config/General.xconfig. + Comment options `SDKROOT`, `MACOS_DEPLOYMENT_TARGET`, and `GCC_VERSION`. If + you choose this route you lose the ability to target earlier versions of + MacOS X. +* Install an SDK for an earlier version. This doesn't appear to be supported + by Apple, but has been reported to work + (http://stackoverflow.com/questions/5378518). -### Tweaking Google Test ### +### Tweaking Google Test -Google Test can be used in diverse environments. The default -configuration may not work (or may not work well) out of the box in -some environments. However, you can easily tweak Google Test by -defining control macros on the compiler command line. Generally, -these macros are named like `GTEST_XYZ` and you define them to either 1 -or 0 to enable or disable a certain feature. +Google Test can be used in diverse environments. The default configuration may +not work (or may not work well) out of the box in some environments. However, +you can easily tweak Google Test by defining control macros on the compiler +command line. Generally, these macros are named like `GTEST_XYZ` and you define +them to either 1 or 0 to enable or disable a certain feature. -We list the most frequently used macros below. For a complete list, -see file [include/gtest/internal/gtest-port.h](include/gtest/internal/gtest-port.h). +We list the most frequently used macros below. For a complete list, see file +[include/gtest/internal/gtest-port.h](https://github.com/google/googletest/blob/master/include/gtest/internal/gtest-port.h). 
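As a hedged illustration of how these GTEST_XYZ settings surface in test code, assuming a toolchain where pthread detection applies, a test can branch at compile time on the feature macro that gtest-port.h derives from them:

    // Sketch: consuming a control macro from test code. GTEST_IS_THREADSAFE
    // is the macro gtest-port.h derives from settings such as
    // -DGTEST_HAS_PTHREAD=1 on the compiler command line.
    #include "gtest/gtest.h"

    TEST(PortabilityTest, ReportsThreadSafety) {
    #if GTEST_IS_THREADSAFE
      SUCCEED() << "Google Test was built with thread-safe assertions.";
    #else
      SUCCEED() << "Google Test was built without pthread support.";
    #endif
    }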
-### Choosing a TR1 Tuple Library ### +### Choosing a TR1 Tuple Library -Some Google Test features require the C++ Technical Report 1 (TR1) -tuple library, which is not yet available with all compilers. The -good news is that Google Test implements a subset of TR1 tuple that's -enough for its own need, and will automatically use this when the -compiler doesn't provide TR1 tuple. +Some Google Test features require the C++ Technical Report 1 (TR1) tuple +library, which is not yet available with all compilers. The good news is that +Google Test implements a subset of TR1 tuple that's enough for its own need, and +will automatically use this when the compiler doesn't provide TR1 tuple. -Usually you don't need to care about which tuple library Google Test -uses. However, if your project already uses TR1 tuple, you need to -tell Google Test to use the same TR1 tuple library the rest of your -project uses, or the two tuple implementations will clash. To do -that, add +Usually you don't need to care about which tuple library Google Test uses. +However, if your project already uses TR1 tuple, you need to tell Google Test to +use the same TR1 tuple library the rest of your project uses, or the two tuple +implementations will clash. To do that, add -DGTEST_USE_OWN_TR1_TUPLE=0 -to the compiler flags while compiling Google Test and your tests. If -you want to force Google Test to use its own tuple library, just add +to the compiler flags while compiling Google Test and your tests. If you want to +force Google Test to use its own tuple library, just add -DGTEST_USE_OWN_TR1_TUPLE=1 @@ -167,15 +268,15 @@ If you don't want Google Test to use tuple at all, add and all features using tuple will be disabled. -### Multi-threaded Tests ### +### Multi-threaded Tests -Google Test is thread-safe where the pthread library is available. -After `#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` -macro to see whether this is the case (yes if the macro is `#defined` to -1, no if it's undefined.). +Google Test is thread-safe where the pthread library is available. After +`#include "gtest/gtest.h"`, you can check the `GTEST_IS_THREADSAFE` macro to see +whether this is the case (yes if the macro is `#defined` to 1, no if it's +undefined.). -If Google Test doesn't correctly detect whether pthread is available -in your environment, you can force it with +If Google Test doesn't correctly detect whether pthread is available in your +environment, you can force it with -DGTEST_HAS_PTHREAD=1 @@ -183,26 +284,24 @@ or -DGTEST_HAS_PTHREAD=0 -When Google Test uses pthread, you may need to add flags to your -compiler and/or linker to select the pthread library, or you'll get -link errors. If you use the CMake script or the deprecated Autotools -script, this is taken care of for you. If you use your own build -script, you'll need to read your compiler and linker's manual to -figure out what flags to add. +When Google Test uses pthread, you may need to add flags to your compiler and/or +linker to select the pthread library, or you'll get link errors. If you use the +CMake script or the deprecated Autotools script, this is taken care of for you. +If you use your own build script, you'll need to read your compiler and linker's +manual to figure out what flags to add. -### As a Shared Library (DLL) ### +### As a Shared Library (DLL) -Google Test is compact, so most users can build and link it as a -static library for the simplicity. 
You can choose to use Google Test -as a shared library (known as a DLL on Windows) if you prefer. +Google Test is compact, so most users can build and link it as a static library +for the simplicity. You can choose to use Google Test as a shared library (known +as a DLL on Windows) if you prefer. To compile *gtest* as a shared library, add -DGTEST_CREATE_SHARED_LIBRARY=1 -to the compiler flags. You'll also need to tell the linker to produce -a shared library instead - consult your linker's manual for how to do -it. +to the compiler flags. You'll also need to tell the linker to produce a shared +library instead - consult your linker's manual for how to do it. To compile your *tests* that use the gtest shared library, add @@ -210,31 +309,28 @@ To compile your *tests* that use the gtest shared library, add to the compiler flags. -Note: while the above steps aren't technically necessary today when -using some compilers (e.g. GCC), they may become necessary in the -future, if we decide to improve the speed of loading the library (see - for details). Therefore you are -recommended to always add the above flags when using Google Test as a -shared library. Otherwise a future release of Google Test may break -your build script. +Note: while the above steps aren't technically necessary today when using some +compilers (e.g. GCC), they may become necessary in the future, if we decide to +improve the speed of loading the library (see + for details). Therefore you are recommended +to always add the above flags when using Google Test as a shared library. +Otherwise a future release of Google Test may break your build script. -### Avoiding Macro Name Clashes ### +### Avoiding Macro Name Clashes -In C++, macros don't obey namespaces. Therefore two libraries that -both define a macro of the same name will clash if you `#include` both -definitions. In case a Google Test macro clashes with another -library, you can force Google Test to rename its macro to avoid the -conflict. +In C++, macros don't obey namespaces. Therefore two libraries that both define a +macro of the same name will clash if you `#include` both definitions. In case a +Google Test macro clashes with another library, you can force Google Test to +rename its macro to avoid the conflict. -Specifically, if both Google Test and some other code define macro -FOO, you can add +Specifically, if both Google Test and some other code define macro FOO, you can +add -DGTEST_DONT_DEFINE_FOO=1 -to the compiler flags to tell Google Test to change the macro's name -from `FOO` to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, -or `TEST`. For example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll -need to write +to the compiler flags to tell Google Test to change the macro's name from `FOO` +to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For +example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write GTEST_TEST(SomeTest, DoesThis) { ... } @@ -243,38 +339,3 @@ instead of TEST(SomeTest, DoesThis) { ... } in order to define a test. - -## Developing Google Test ## - -This section discusses how to make your own changes to Google Test. - -### Testing Google Test Itself ### - -To make sure your changes work as intended and don't break existing -functionality, you'll want to compile and run Google Test's own tests. -For that you can use CMake: - - mkdir mybuild - cd mybuild - cmake -Dgtest_build_tests=ON ${GTEST_DIR} - -Make sure you have Python installed, as some of Google Test's tests -are written in Python. 
If the cmake command complains about not being -able to find Python (`Could NOT find PythonInterp (missing: -PYTHON_EXECUTABLE)`), try telling it explicitly where your Python -executable can be found: - - cmake -DPYTHON_EXECUTABLE=path/to/python -Dgtest_build_tests=ON ${GTEST_DIR} - -Next, you can build Google Test and all of its own tests. On \*nix, -this is usually done by 'make'. To run the tests, do - - make test - -All tests should pass. - -Normally you don't need to worry about regenerating the source files, -unless you need to modify them. In that case, you should modify the -corresponding .pump files instead and run the pump.py Python script to -regenerate them. You can find pump.py in the [scripts/](scripts/) directory. -Read the [Pump manual](docs/PumpManual.md) for how to use it. diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h index 957a69c6a9e..20c54d86951 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h @@ -26,14 +26,14 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for death tests. It is // #included by gtest.h so a user doesn't need to include this // directly. +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ @@ -99,10 +99,11 @@ GTEST_API_ bool InDeathTestChild(); // // On the regular expressions used in death tests: // +// GOOGLETEST_CM0005 DO NOT DELETE // On POSIX-compliant systems (*nix), we use the library, // which uses the POSIX extended regex syntax. // -// On other platforms (e.g. Windows), we only support a simple regex +// On other platforms (e.g. Windows or Mac), we only support a simple regex // syntax implemented as part of Google Test. This limited // implementation should be enough most of the time when writing // death tests; though it lacks many features you can find in PCRE @@ -160,7 +161,7 @@ GTEST_API_ bool InDeathTestChild(); // is rarely a problem as people usually don't put the test binary // directory in PATH. // -// TODO(wan@google.com): make thread-safe death tests search the PATH. +// FIXME: make thread-safe death tests search the PATH. // Asserts that a given statement causes the program to exit, with an // integer exit status that satisfies predicate, and emitting error output @@ -198,9 +199,10 @@ class GTEST_API_ ExitedWithCode { const int exit_code_; }; -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Tests that an exit code describes an exit due to termination by a // given signal. +// GOOGLETEST_CM0006 DO NOT DELETE class GTEST_API_ KilledBySignal { public: explicit KilledBySignal(int signum); @@ -272,6 +274,54 @@ class GTEST_API_ KilledBySignal { # endif // NDEBUG for EXPECT_DEBUG_DEATH #endif // GTEST_HAS_DEATH_TEST +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. 
Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. It is exposed publicly so that +// systems that have death-tests with stricter requirements than +// GTEST_HAS_DEATH_TEST can write their own equivalent of +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + // EXPECT_DEATH_IF_SUPPORTED(statement, regex) and // ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if // death tests are supported; otherwise they just issue a warning. 
This is @@ -284,9 +334,9 @@ class GTEST_API_ KilledBySignal { ASSERT_DEATH(statement, regex) #else # define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, ) # define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ - GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) + GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return) #endif } // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h index fe879bca792..5ca041614cb 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-message.h @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the Message class. // @@ -43,6 +42,8 @@ // to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user // program! +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ #define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ @@ -50,6 +51,9 @@ #include "gtest/internal/gtest-port.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // Ensures that there is at least one operator<< in the global namespace. // See Message& operator<<(...) below for why. void operator<<(const testing::internal::Secret&, int); @@ -196,7 +200,6 @@ class GTEST_API_ Message { std::string GetString() const; private: - #if GTEST_OS_SYMBIAN // These are needed as the Nokia Symbian Compiler cannot decide between // const T& and const T* in a function template. The Nokia compiler _can_ @@ -247,4 +250,6 @@ std::string StreamableToString(const T& streamable) { } // namespace internal } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h index 038f9ba79eb..3e95e4390e0 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h @@ -31,13 +31,12 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: vladl@google.com (Vlad Losev) -// // Macros and functions for implementing parameterized tests -// in Google C++ Testing Framework (Google Test) +// in Google C++ Testing and Mocking Framework (Google Test) // // This file is generated by a SCRIPT. DO NOT EDIT BY HAND! // +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ @@ -79,7 +78,7 @@ TEST_P(FooTest, HasBlahBlah) { // Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test // case with any set of parameters you want. Google Test defines a number // of functions for generating test parameters. They return what we call -// (surprise!) parameter generators. 
Here is a summary of them, which +// (surprise!) parameter generators. Here is a summary of them, which // are all in the testing namespace: // // @@ -185,15 +184,10 @@ TEST_P(DerivedTest, DoesBlah) { # include #endif -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-param-util-generated.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Functions producing parameter generators. @@ -273,7 +267,7 @@ internal::ParamGenerator Range(T start, T end) { // each with C-string values of "foo", "bar", and "baz": // // const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings)); // // This instantiates tests from test case StlStringTest // each with STL strings with values "a" and "b": @@ -1375,8 +1369,6 @@ internal::CartesianProductHolder10AddTestPattern(\ - #test_case_name, \ - #test_name, \ + GTEST_STRINGIFY_(test_case_name), \ + GTEST_STRINGIFY_(test_name), \ new ::testing::internal::TestMetaFactory< \ GTEST_TEST_CLASS_NAME_(\ test_case_name, test_name)>()); \ @@ -1412,21 +1404,21 @@ internal::CartesianProductHolder10, and return std::string. // // testing::PrintToStringParamName is a builtin test suffix generator that -// returns the value of testing::PrintToString(GetParam()). It does not work -// for std::string or C strings. +// returns the value of testing::PrintToString(GetParam()). // // Note: test names must be non-empty, unique, and may only contain ASCII -// alphanumeric characters or underscore. +// alphanumeric characters or underscore. Because PrintToString adds quotes +// to std::string and C strings, it won't work for these types. # define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ - ::testing::internal::ParamGenerator \ + static ::testing::internal::ParamGenerator \ gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ const ::testing::TestParamInfo& info) { \ return ::testing::internal::GetParamNameGen \ (__VA_ARGS__)(info); \ } \ - int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ GetTestCasePatternHolder(\ #test_case_name, \ @@ -1439,6 +1431,4 @@ internal::CartesianProductHolder10 #endif -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-param-util-generated.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Functions producing parameter generators. 
@@ -272,7 +266,7 @@ internal::ParamGenerator Range(T start, T end) { // each with C-string values of "foo", "bar", and "baz": // // const char* strings[] = {"foo", "bar", "baz"}; -// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// INSTANTIATE_TEST_CASE_P(StringSequence, StringTest, ValuesIn(strings)); // // This instantiates tests from test case StlStringTest // each with STL strings with values "a" and "b": @@ -441,8 +435,6 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( ]] # endif // GTEST_HAS_COMBINE - - # define TEST_P(test_case_name, test_name) \ class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ : public test_case_name { \ @@ -456,8 +448,8 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( #test_case_name, \ ::testing::internal::CodeLocation(\ __FILE__, __LINE__))->AddTestPattern(\ - #test_case_name, \ - #test_name, \ + GTEST_STRINGIFY_(test_case_name), \ + GTEST_STRINGIFY_(test_name), \ new ::testing::internal::TestMetaFactory< \ GTEST_TEST_CLASS_NAME_(\ test_case_name, test_name)>()); \ @@ -485,14 +477,14 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( // to std::string and C strings, it won't work for these types. # define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator, ...) \ - ::testing::internal::ParamGenerator \ + static ::testing::internal::ParamGenerator \ gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ - ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ + static ::std::string gtest_##prefix##test_case_name##_EvalGenerateName_( \ const ::testing::TestParamInfo& info) { \ return ::testing::internal::GetParamNameGen \ (__VA_ARGS__)(info); \ } \ - int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ + static int gtest_##prefix##test_case_name##_dummy_ GTEST_ATTRIBUTE_UNUSED_ = \ ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ GetTestCasePatternHolder(\ #test_case_name, \ @@ -505,6 +497,4 @@ internal::CartesianProductHolder$i<$for j, [[Generator$j]]> Combine( } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h index 8a33164cb38..51865f84e6f 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// Google Test - The Google C++ Testing Framework + +// Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a // value of any type T: @@ -46,6 +45,10 @@ // 2. operator<<(ostream&, const T&) defined in either foo or the // global namespace. // +// However if T is an STL-style container then it is printed element-wise +// unless foo::PrintTo(const T&, ostream*) is defined. Note that +// operator<<() is ignored for container types. +// // If none of the above is defined, it will print the debug string of // the value if it is a protocol buffer, or print the raw bytes in the // value otherwise. 
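To make the dispatch order described in that comment concrete, here is a small sketch of the two customization points, using invented types (foo::Point and foo::Box are illustrative only):

    // Sketch: the two customization points described above. A PrintTo()
    // overload wins over operator<<, and is the only hook honored for
    // STL-style containers of a type.
    #include <ostream>

    namespace foo {

    struct Point { int x, y; };

    // Customization 2: operator<< in the value's namespace (found via ADL).
    inline ::std::ostream& operator<<(::std::ostream& os, const Point& p) {
      return os << "(" << p.x << ", " << p.y << ")";
    }

    struct Box { int w, h; };

    // Customization 1: PrintTo() in the value's namespace; preferred when
    // both are defined.
    inline void PrintTo(const Box& b, ::std::ostream* os) {
      *os << b.w << "x" << b.h;
    }

    }  // namespace foo

    // A failing assertion on foo::Point values would then print e.g. (1, 2),
    // and a std::vector<foo::Point> is printed element-wise as
    // { (1, 2), (3, 4) }, since operator<< is ignored for containers.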
@@ -92,6 +95,8 @@ // being defined as many user-defined container types don't have // value_type. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ #define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ @@ -107,6 +112,12 @@ # include #endif +#if GTEST_HAS_ABSL +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#endif // GTEST_HAS_ABSL + namespace testing { // Definitions in the 'internal' and 'internal2' name spaces are @@ -125,7 +136,11 @@ enum TypeKind { kProtobuf, // a protobuf type kConvertibleToInteger, // a type implicitly convertible to BiggestInt // (e.g. a named or unnamed enum type) - kOtherType // anything else +#if GTEST_HAS_ABSL + kConvertibleToStringView, // a type implicitly convertible to + // absl::string_view +#endif + kOtherType // anything else }; // TypeWithoutFormatter::PrintValue(value, os) is called @@ -137,7 +152,8 @@ class TypeWithoutFormatter { public: // This default version is called when kTypeKind is kOtherType. static void PrintValue(const T& value, ::std::ostream* os) { - PrintBytesInObjectTo(reinterpret_cast(&value), + PrintBytesInObjectTo(static_cast( + reinterpret_cast(&value)), sizeof(value), os); } }; @@ -151,10 +167,10 @@ template class TypeWithoutFormatter { public: static void PrintValue(const T& value, ::std::ostream* os) { - const ::testing::internal::string short_str = value.ShortDebugString(); - const ::testing::internal::string pretty_str = - short_str.length() <= kProtobufOneLinerMaxLength ? - short_str : ("\n" + value.DebugString()); + std::string pretty_str = value.ShortDebugString(); + if (pretty_str.length() > kProtobufOneLinerMaxLength) { + pretty_str = "\n" + value.DebugString(); + } *os << ("<" + pretty_str + ">"); } }; @@ -175,6 +191,19 @@ class TypeWithoutFormatter { } }; +#if GTEST_HAS_ABSL +template +class TypeWithoutFormatter { + public: + // Since T has neither operator<< nor PrintTo() but can be implicitly + // converted to absl::string_view, we print it as a absl::string_view. + // + // Note: the implementation is further below, as it depends on + // internal::PrintTo symbol which is defined later in the file. + static void PrintValue(const T& value, ::std::ostream* os); +}; +#endif + // Prints the given value to the given ostream. If the value is a // protocol message, its debug string is printed; if it's an enum or // of a type implicitly convertible to BiggestInt, it's printed as an @@ -202,10 +231,19 @@ class TypeWithoutFormatter { template ::std::basic_ostream& operator<<( ::std::basic_ostream& os, const T& x) { - TypeWithoutFormatter::value ? kProtobuf : - internal::ImplicitlyConvertible::value ? - kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); + TypeWithoutFormatter::value + ? kProtobuf + : internal::ImplicitlyConvertible< + const T&, internal::BiggestInt>::value + ? kConvertibleToInteger + : +#if GTEST_HAS_ABSL + internal::ImplicitlyConvertible< + const T&, absl::string_view>::value + ? kConvertibleToStringView + : +#endif + kOtherType)>::PrintValue(x, &os); return os; } @@ -364,11 +402,18 @@ class UniversalPrinter; template void UniversalPrint(const T& value, ::std::ostream* os); +enum DefaultPrinterType { + kPrintContainer, + kPrintPointer, + kPrintFunctionPointer, + kPrintOther, +}; +template struct WrapPrinterType {}; + // Used to print an STL-style container when the user doesn't define // a PrintTo() for it. 
template -void DefaultPrintTo(IsContainer /* dummy */, - false_type /* is not a pointer */, +void DefaultPrintTo(WrapPrinterType /* dummy */, const C& container, ::std::ostream* os) { const size_t kMaxCount = 32; // The maximum number of elements to print. *os << '{'; @@ -401,40 +446,34 @@ void DefaultPrintTo(IsContainer /* dummy */, // implementation-defined. Therefore they will be printed as raw // bytes.) template -void DefaultPrintTo(IsNotContainer /* dummy */, - true_type /* is a pointer */, +void DefaultPrintTo(WrapPrinterType /* dummy */, T* p, ::std::ostream* os) { if (p == NULL) { *os << "NULL"; } else { - // C++ doesn't allow casting from a function pointer to any object - // pointer. - // - // IsTrue() silences warnings: "Condition is always true", - // "unreachable code". - if (IsTrue(ImplicitlyConvertible::value)) { - // T is not a function type. We just call << to print p, - // relying on ADL to pick up user-defined << for their pointer - // types, if any. - *os << p; - } else { - // T is a function type, so '*os << p' doesn't do what we want - // (it just prints p as bool). We want to print p as a const - // void*. However, we cannot cast it to const void* directly, - // even using reinterpret_cast, as earlier versions of gcc - // (e.g. 3.4.5) cannot compile the cast when p is a function - // pointer. Casting to UInt64 first solves the problem. - *os << reinterpret_cast( - reinterpret_cast(p)); - } + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } +} +template +void DefaultPrintTo(WrapPrinterType /* dummy */, + T* p, ::std::ostream* os) { + if (p == NULL) { + *os << "NULL"; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. + *os << reinterpret_cast(p); } } // Used to print a non-container, non-pointer value when the user // doesn't define PrintTo() for it. template -void DefaultPrintTo(IsNotContainer /* dummy */, - false_type /* is not a pointer */, +void DefaultPrintTo(WrapPrinterType /* dummy */, const T& value, ::std::ostream* os) { ::testing_internal::DefaultPrintNonContainerTo(value, os); } @@ -452,11 +491,8 @@ void DefaultPrintTo(IsNotContainer /* dummy */, // wants). template void PrintTo(const T& value, ::std::ostream* os) { - // DefaultPrintTo() is overloaded. The type of its first two - // arguments determine which version will be picked. If T is an - // STL-style container, the version for container will be called; if - // T is a pointer, the pointer version will be called; otherwise the - // generic version will be called. + // DefaultPrintTo() is overloaded. The type of its first argument + // determines which version will be picked. // // Note that we check for container types here, prior to we check // for protocol message types in our operator<<. The rationale is: @@ -468,13 +504,27 @@ void PrintTo(const T& value, ::std::ostream* os) { // elements; therefore we check for container types here to ensure // that our format is used. 
// - // The second argument of DefaultPrintTo() is needed to bypass a bug - // in Symbian's C++ compiler that prevents it from picking the right - // overload between: - // - // PrintTo(const T& x, ...); - // PrintTo(T* x, ...); - DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); + // Note that MSVC and clang-cl do allow an implicit conversion from + // pointer-to-function to pointer-to-object, but clang-cl warns on it. + // So don't use ImplicitlyConvertible if it can be helped since it will + // cause this warning, and use a separate overload of DefaultPrintTo for + // function pointers so that the `*os << p` in the object pointer overload + // doesn't cause that warning either. + DefaultPrintTo( + WrapPrinterType < + (sizeof(IsContainerTest(0)) == sizeof(IsContainer)) && + !IsRecursiveContainer::value + ? kPrintContainer + : !is_pointer::value + ? kPrintOther +#if GTEST_LANG_CXX11 + : std::is_function::type>::value +#else + : !internal::ImplicitlyConvertible::value +#endif + ? kPrintFunctionPointer + : kPrintPointer > (), + value, os); } // The following list of PrintTo() overloads tells @@ -581,6 +631,17 @@ inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { } #endif // GTEST_HAS_STD_WSTRING +#if GTEST_HAS_ABSL +// Overload for absl::string_view. +inline void PrintTo(absl::string_view sp, ::std::ostream* os) { + PrintTo(::std::string(sp), os); +} +#endif // GTEST_HAS_ABSL + +#if GTEST_LANG_CXX11 +inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; } +#endif // GTEST_LANG_CXX11 + #if GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_ // Helper function for printing a tuple. T must be instantiated with // a tuple type. @@ -710,6 +771,48 @@ class UniversalPrinter { GTEST_DISABLE_MSC_WARNINGS_POP_() }; +#if GTEST_HAS_ABSL + +// Printer for absl::optional + +template +class UniversalPrinter<::absl::optional> { + public: + static void Print(const ::absl::optional& value, ::std::ostream* os) { + *os << '('; + if (!value) { + *os << "nullopt"; + } else { + UniversalPrint(*value, os); + } + *os << ')'; + } +}; + +// Printer for absl::variant + +template +class UniversalPrinter<::absl::variant> { + public: + static void Print(const ::absl::variant& value, ::std::ostream* os) { + *os << '('; + absl::visit(Visitor{os}, value); + *os << ')'; + } + + private: + struct Visitor { + template + void operator()(const U& u) const { + *os << "'" << GetTypeName() << "' with value "; + UniversalPrint(u, os); + } + ::std::ostream* os; + }; +}; + +#endif // GTEST_HAS_ABSL + // UniversalPrintArray(begin, len, os) prints an array of 'len' // elements, starting at address 'begin'. template @@ -723,7 +826,7 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { // If the array has more than kThreshold elements, we'll have to // omit some details by printing only the first and the last // kChunkSize elements. - // TODO(wan@google.com): let the user control the threshold using a flag. + // FIXME: let the user control the threshold using a flag. 
if (len <= kThreshold) { PrintRawArrayTo(begin, len, os); } else { @@ -805,7 +908,7 @@ class UniversalTersePrinter { if (str == NULL) { *os << "NULL"; } else { - UniversalPrint(string(str), os); + UniversalPrint(std::string(str), os); } } }; @@ -856,7 +959,7 @@ void UniversalPrint(const T& value, ::std::ostream* os) { UniversalPrinter::Print(value, os); } -typedef ::std::vector Strings; +typedef ::std::vector< ::std::string> Strings; // TuplePolicy must provide: // - tuple_size @@ -875,12 +978,13 @@ struct TuplePolicy { static const size_t tuple_size = ::std::tr1::tuple_size::value; template - struct tuple_element : ::std::tr1::tuple_element {}; + struct tuple_element : ::std::tr1::tuple_element(I), Tuple> { + }; template - static typename AddReference< - const typename ::std::tr1::tuple_element::type>::type get( - const Tuple& tuple) { + static typename AddReference(I), Tuple>::type>::type + get(const Tuple& tuple) { return ::std::tr1::get(tuple); } }; @@ -976,6 +1080,16 @@ Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { } // namespace internal +#if GTEST_HAS_ABSL +namespace internal2 { +template +void TypeWithoutFormatter::PrintValue( + const T& value, ::std::ostream* os) { + internal::PrintTo(absl::string_view(value), os); +} +} // namespace internal2 +#endif + template ::std::string PrintToString(const T& value) { ::std::stringstream ss; diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h index f63fa9a1b2a..1e8983938ea 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h @@ -26,17 +26,21 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // // Utilities for testing Google Test itself and code that uses Google Test // (e.g. frameworks built on top of Google Test). +// GOOGLETEST_CM0004 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ #define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ #include "gtest/gtest.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { // This helper class can be used to mock out Google Test failure reporting @@ -97,13 +101,12 @@ class GTEST_API_ SingleFailureChecker { public: // The constructor remembers the arguments. SingleFailureChecker(const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr); + TestPartResult::Type type, const std::string& substr); ~SingleFailureChecker(); private: const TestPartResultArray* const results_; const TestPartResult::Type type_; - const string substr_; + const std::string substr_; GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); }; @@ -112,6 +115,8 @@ class GTEST_API_ SingleFailureChecker { } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + // A set of macros for testing Google Test assertions or code that's expected // to generate Google Test fatal failures. 
It verifies that the given // statement will cause exactly one fatal Google Test failure with 'substr' diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h index 77eb844839d..1c7b89e0879 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h @@ -27,8 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Author: mheule@google.com (Markus Heule) -// +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ #define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ @@ -38,6 +37,9 @@ #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { // A copyable object representing the result of a test part (i.e. an @@ -143,7 +145,7 @@ class GTEST_API_ TestPartResultArray { }; // This interface knows how to report a test part result. -class TestPartResultReporterInterface { +class GTEST_API_ TestPartResultReporterInterface { public: virtual ~TestPartResultReporterInterface() {} @@ -176,4 +178,6 @@ class GTEST_API_ HasNewFatalFailureHelper } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h index 5f69d5678ea..74bce46bdc5 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h @@ -26,8 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + + +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ @@ -82,6 +83,24 @@ TYPED_TEST(FooTest, DoesBlah) { TYPED_TEST(FooTest, HasPropertyA) { ... } +// TYPED_TEST_CASE takes an optional third argument which allows to specify a +// class that generates custom test name suffixes based on the type. This should +// be a class which has a static template function GetName(int index) returning +// a string for each type. The provided integer index equals the index of the +// type in the provided type list. In many cases the index can be ignored. 
+// +// For example: +// class MyTypeNames { +// public: +// template +// static std::string GetName(int) { +// if (std::is_same()) return "char"; +// if (std::is_same()) return "int"; +// if (std::is_same()) return "unsignedInt"; +// } +// }; +// TYPED_TEST_CASE(FooTest, MyTypes, MyTypeNames); + #endif // 0 // Type-parameterized tests are abstract test patterns parameterized @@ -143,6 +162,11 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); // If the type list contains only one type, you can write that type // directly without Types<...>: // INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); +// +// Similar to the optional argument of TYPED_TEST_CASE above, +// INSTANTIATE_TEST_CASE_P takes an optional fourth argument which allows to +// generate custom names. +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes, MyTypeNames); #endif // 0 @@ -159,32 +183,46 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); // given test case. # define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ +// Expands to the name of the typedef for the NameGenerator, responsible for +// creating the suffixes of the name. +#define GTEST_NAME_GENERATOR_(TestCaseName) \ + gtest_type_params_##TestCaseName##_NameGenerator + // The 'Types' template argument below must have spaces around it // since some compilers may choke on '>>' when passing a template // instance (e.g. Types) -# define TYPED_TEST_CASE(CaseName, Types) \ - typedef ::testing::internal::TypeList< Types >::type \ - GTEST_TYPE_PARAMS_(CaseName) - -# define TYPED_TEST(CaseName, TestName) \ - template \ - class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ - : public CaseName { \ - private: \ - typedef CaseName TestFixture; \ - typedef gtest_TypeParam_ TypeParam; \ - virtual void TestBody(); \ - }; \ - bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTest< \ - CaseName, \ - ::testing::internal::TemplateSel< \ - GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ - GTEST_TYPE_PARAMS_(CaseName)>::Register(\ - "", ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - #CaseName, #TestName, 0); \ - template \ - void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() +# define TYPED_TEST_CASE(CaseName, Types, ...) 
\ + typedef ::testing::internal::TypeList< Types >::type GTEST_TYPE_PARAMS_( \ + CaseName); \ + typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \ + GTEST_NAME_GENERATOR_(CaseName) + +# define TYPED_TEST(CaseName, TestName) \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##CaseName##_##TestName##_registered_ \ + GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel, \ + GTEST_TYPE_PARAMS_( \ + CaseName)>::Register("", \ + ::testing::internal::CodeLocation( \ + __FILE__, __LINE__), \ + #CaseName, #TestName, 0, \ + ::testing::internal::GenerateNames< \ + GTEST_NAME_GENERATOR_(CaseName), \ + GTEST_TYPE_PARAMS_(CaseName)>()); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, \ + TestName)::TestBody() #endif // GTEST_HAS_TYPED_TEST @@ -241,22 +279,27 @@ INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); namespace GTEST_CASE_NAMESPACE_(CaseName) { \ typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ } \ - static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ - GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ - __FILE__, __LINE__, #__VA_ARGS__) + static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) \ + GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames( \ + __FILE__, __LINE__, #__VA_ARGS__) // The 'Types' template argument below must have spaces around it // since some compilers may choke on '>>' when passing a template // instance (e.g. Types) -# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ - bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ - ::testing::internal::TypeParameterizedTestCase::type>::Register(\ - #Prefix, \ - ::testing::internal::CodeLocation(__FILE__, __LINE__), \ - >EST_TYPED_TEST_CASE_P_STATE_(CaseName), \ - #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) +# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types, ...) \ + static bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestCase< \ + CaseName, GTEST_CASE_NAMESPACE_(CaseName)::gtest_AllTests_, \ + ::testing::internal::TypeList< Types >::type>:: \ + Register(#Prefix, \ + ::testing::internal::CodeLocation(__FILE__, __LINE__), \ + >EST_TYPED_TEST_CASE_P_STATE_(CaseName), #CaseName, \ + GTEST_REGISTERED_TEST_NAMES_(CaseName), \ + ::testing::internal::GenerateNames< \ + ::testing::internal::NameGeneratorSelector< \ + __VA_ARGS__>::type, \ + ::testing::internal::TypeList< Types >::type>()) #endif // GTEST_HAS_TYPED_TEST_P diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h index f846c5bd669..3b4bb1ee902 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest.h @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
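Tying together the typed-test changes above, a hedged sketch of the optional name-generator argument this release adds to TYPED_TEST_CASE; FooTest, MyTypes, and MyTypeNames are illustrative names following the header comment's own example:

    // Sketch: TYPED_TEST_CASE with the optional name-generator argument.
    // All user-side names here are hypothetical.
    #include <string>
    #include <type_traits>
    #include "gtest/gtest.h"

    template <typename T>
    class FooTest : public ::testing::Test {};

    // Generates the per-type test name suffixes, as described in the
    // gtest-typed-test.h comments above.
    class MyTypeNames {
     public:
      template <typename T>
      static std::string GetName(int /*index*/) {
        if (std::is_same<T, char>()) return "char";
        if (std::is_same<T, int>()) return "int";
        return "other";
      }
    };

    typedef ::testing::Types<char, int> MyTypes;
    TYPED_TEST_CASE(FooTest, MyTypes, MyTypeNames);

    TYPED_TEST(FooTest, IsValueInitializable) {
      TypeParam value = TypeParam();
      EXPECT_EQ(TypeParam(), value);
    }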
+ // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the public API for Google Test. It should be // included by any test program that uses Google Test. @@ -48,6 +47,8 @@ // registration from Barthelemy Dagenais' (barthelemy@prologique.com) // easyUnit framework. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_H_ #define GTEST_INCLUDE_GTEST_GTEST_H_ @@ -65,6 +66,9 @@ #include "gtest/gtest-test-part.h" #include "gtest/gtest-typed-test.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // Depending on the platform, different string classes are available. // On Linux, in addition to ::std::string, Google also makes use of // class ::string, which has the same interface as ::std::string, but @@ -82,6 +86,15 @@ namespace testing { +// Silence C4100 (unreferenced formal parameter) and 4805 +// unsafe mix of type 'const int' and type 'const bool' +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4805) +# pragma warning(disable:4100) +#endif + + // Declares the flags. // This flag temporary enables the disabled tests. @@ -103,6 +116,10 @@ GTEST_DECLARE_string_(color); // the tests to run. If the filter is not given all tests are executed. GTEST_DECLARE_string_(filter); +// This flag controls whether Google Test installs a signal handler that dumps +// debugging information when fatal signals are raised. +GTEST_DECLARE_bool_(install_failure_signal_handler); + // This flag causes the Google Test to list tests. None of the tests listed // are actually run if the flag is provided. GTEST_DECLARE_bool_(list_tests); @@ -115,6 +132,9 @@ GTEST_DECLARE_string_(output); // test. GTEST_DECLARE_bool_(print_time); +// This flags control whether Google Test prints UTF8 characters as text. +GTEST_DECLARE_bool_(print_utf8); + // This flag specifies the random number seed. GTEST_DECLARE_int32_(random_seed); @@ -135,7 +155,7 @@ GTEST_DECLARE_int32_(stack_trace_depth); // When this flag is specified, a failed assertion will throw an // exception if exceptions are enabled, or exit the program with a -// non-zero code otherwise. +// non-zero code otherwise. For use with an external test framework. GTEST_DECLARE_bool_(throw_on_failure); // When this flag is set with a "host:port" string, on supported @@ -143,6 +163,10 @@ GTEST_DECLARE_bool_(throw_on_failure); // the specified host machine. GTEST_DECLARE_string_(stream_result_to); +#if GTEST_USE_OWN_FLAGFILE_FLAG_ +GTEST_DECLARE_string_(flagfile); +#endif // GTEST_USE_OWN_FLAGFILE_FLAG_ + // The upper limit for valid stack trace depths. const int kMaxStackTraceDepth = 100; @@ -160,6 +184,7 @@ class TestEventListenersAccessor; class TestEventRepeater; class UnitTestRecordPropertyTestHelper; class WindowsDeathTest; +class FuchsiaDeathTest; class UnitTestImpl* GetUnitTestImpl(); void ReportFailureInUnknownLocation(TestPartResult::Type result_type, const std::string& message); @@ -259,7 +284,9 @@ class GTEST_API_ AssertionResult { // Used in EXPECT_TRUE/FALSE(assertion_result). AssertionResult(const AssertionResult& other); +#if defined(_MSC_VER) && _MSC_VER < 1910 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */) +#endif // Used in the EXPECT_TRUE/FALSE(bool_expression). 
// @@ -276,7 +303,9 @@ class GTEST_API_ AssertionResult { /*enabler*/ = NULL) : success_(success) {} +#if defined(_MSC_VER) && _MSC_VER < 1910 GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif // Assignment operator. AssertionResult& operator=(AssertionResult other) { @@ -297,7 +326,7 @@ class GTEST_API_ AssertionResult { const char* message() const { return message_.get() != NULL ? message_->c_str() : ""; } - // TODO(vladl@google.com): Remove this after making sure no clients use it. + // FIXME: Remove this after making sure no clients use it. // Deprecated; please use message() instead. const char* failure_message() const { return message(); } @@ -345,6 +374,15 @@ GTEST_API_ AssertionResult AssertionFailure(); // Deprecated; use AssertionFailure() << msg. GTEST_API_ AssertionResult AssertionFailure(const Message& msg); +} // namespace testing + +// Includes the auto-generated header that implements a family of generic +// predicate assertion macros. This include comes late because it relies on +// APIs declared above. +#include "gtest/gtest_pred_impl.h" + +namespace testing { + // The abstract class that all tests inherit from. // // In Google Test, a unit test program contains one or many TestCases, and @@ -355,7 +393,7 @@ GTEST_API_ AssertionResult AssertionFailure(const Message& msg); // this for you. // // The only time you derive from Test is when defining a test fixture -// to be used a TEST_F. For example: +// to be used in a TEST_F. For example: // // class FooTest : public testing::Test { // protected: @@ -550,9 +588,8 @@ class GTEST_API_ TestResult { // Returns the elapsed time, in milliseconds. TimeInMillis elapsed_time() const { return elapsed_time_; } - // Returns the i-th test part result among all the results. i can range - // from 0 to test_property_count() - 1. If i is not in that range, aborts - // the program. + // Returns the i-th test part result among all the results. i can range from 0 + // to total_part_count() - 1. If i is not in that range, aborts the program. const TestPartResult& GetTestPartResult(int i) const; // Returns the i-th test property. i can range from 0 to @@ -569,6 +606,7 @@ class GTEST_API_ TestResult { friend class internal::TestResultAccessor; friend class internal::UnitTestImpl; friend class internal::WindowsDeathTest; + friend class internal::FuchsiaDeathTest; // Gets the vector of TestPartResults. const std::vector& test_part_results() const { @@ -594,7 +632,7 @@ class GTEST_API_ TestResult { // Adds a failure if the key is a reserved attribute of Google Test // testcase tags. Returns true if the property is valid. - // TODO(russr): Validate attribute names are legal and human readable. + // FIXME: Validate attribute names are legal and human readable. static bool ValidateTestProperty(const std::string& xml_element, const TestProperty& test_property); @@ -675,6 +713,9 @@ class GTEST_API_ TestInfo { // Returns the line where this test is defined. int line() const { return location_.line; } + // Return true if this test should not be run because it's in another shard. + bool is_in_another_shard() const { return is_in_another_shard_; } + // Returns true if this test should run, that is if the test is not // disabled (or it is disabled but the also_run_disabled_tests flag has // been specified) and its full name matches the user-specified filter. @@ -695,10 +736,9 @@ class GTEST_API_ TestInfo { // Returns true iff this test will appear in the XML report. bool is_reportable() const { - // For now, the XML report includes all tests matching the filter. 
- // In the future, we may trim tests that are excluded because of - // sharding. - return matches_filter_; + // The XML report includes tests matching the filter, excluding those + // run in other shards. + return matches_filter_ && !is_in_another_shard_; } // Returns the result of the test. @@ -762,6 +802,7 @@ class GTEST_API_ TestInfo { bool is_disabled_; // True iff this test is disabled bool matches_filter_; // True if this test matches the // user-specified filter. + bool is_in_another_shard_; // Will be run in another shard. internal::TestFactoryBase* const factory_; // The factory that creates // the test object @@ -986,6 +1027,18 @@ class Environment { virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } }; +#if GTEST_HAS_EXCEPTIONS + +// Exception which can be thrown from TestEventListener::OnTestPartResult. +class GTEST_API_ AssertionException + : public internal::GoogleTestFailureException { + public: + explicit AssertionException(const TestPartResult& result) + : GoogleTestFailureException(result) {} +}; + +#endif // GTEST_HAS_EXCEPTIONS + // The interface for tracing execution of tests. The methods are organized in // the order the corresponding events are fired. class TestEventListener { @@ -1014,6 +1067,8 @@ class TestEventListener { virtual void OnTestStart(const TestInfo& test_info) = 0; // Fired after a failed assertion or a SUCCEED() invocation. + // If you want to throw an exception from this function to skip to the next + // TEST, it must be AssertionException defined above, or inherited from it. virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; // Fired after the test ends. @@ -1180,14 +1235,12 @@ class GTEST_API_ UnitTest { // Returns the random seed used at the start of the current test run. int random_seed() const; -#if GTEST_HAS_PARAM_TEST // Returns the ParameterizedTestCaseRegistry object used to keep track of // value-parameterized tests and instantiate and register them. // // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. internal::ParameterizedTestCaseRegistry& parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_); -#endif // GTEST_HAS_PARAM_TEST // Gets the number of successful test cases. int successful_test_case_count() const; @@ -1287,11 +1340,11 @@ class GTEST_API_ UnitTest { internal::UnitTestImpl* impl() { return impl_; } const internal::UnitTestImpl* impl() const { return impl_; } - // These classes and funcions are friends as they need to access private + // These classes and functions are friends as they need to access private // members of UnitTest. + friend class ScopedTrace; friend class Test; friend class internal::AssertHelper; - friend class internal::ScopedTrace; friend class internal::StreamingListenerTest; friend class internal::UnitTestRecordPropertyTestHelper; friend Environment* AddGlobalTestEnvironment(Environment* env); @@ -1388,11 +1441,9 @@ AssertionResult CmpHelperEQ(const char* lhs_expression, const char* rhs_expression, const T1& lhs, const T2& rhs) { -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4389 /* signed/unsigned mismatch */) if (lhs == rhs) { return AssertionSuccess(); } -GTEST_DISABLE_MSC_WARNINGS_POP_() return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs); } @@ -1706,7 +1757,6 @@ class GTEST_API_ AssertHelper { } // namespace internal -#if GTEST_HAS_PARAM_TEST // The pure interface class that all value-parameterized tests inherit from. // A value-parameterized class must inherit from both ::testing::Test and // ::testing::WithParamInterface. 
In most cases that just means inheriting @@ -1748,11 +1798,8 @@ class WithParamInterface { virtual ~WithParamInterface() {} // The current parameter value. Is also available in the test fixture's - // constructor. This member function is non-static, even though it only - // references static data, to reduce the opportunity for incorrect uses - // like writing 'WithParamInterface::GetParam()' for a test that - // uses a fixture whose parameter type is int. - const ParamType& GetParam() const { + // constructor. + static const ParamType& GetParam() { GTEST_CHECK_(parameter_ != NULL) << "GetParam() can only be called inside a value-parameterized test " << "-- did you intend to write TEST_P instead of TEST_F?"; @@ -1783,8 +1830,6 @@ template class TestWithParam : public Test, public WithParamInterface { }; -#endif // GTEST_HAS_PARAM_TEST - // Macros for indicating success/failure in test code. // ADD_FAILURE unconditionally adds a failure to the current test. @@ -1857,22 +1902,18 @@ class TestWithParam : public Test, public WithParamInterface { // AssertionResult. For more information on how to use AssertionResult with // these macros see comments on that class. #define EXPECT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_NONFATAL_FAILURE_) #define EXPECT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_NONFATAL_FAILURE_) #define ASSERT_TRUE(condition) \ - GTEST_TEST_BOOLEAN_((condition), #condition, false, true, \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ GTEST_FATAL_FAILURE_) #define ASSERT_FALSE(condition) \ GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ GTEST_FATAL_FAILURE_) -// Includes the auto-generated header that implements a family of -// generic predicate assertion macros. -#include "gtest/gtest_pred_impl.h" - // Macros for testing equalities and inequalities. // // * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2 @@ -1914,8 +1955,8 @@ class TestWithParam : public Test, public WithParamInterface { // // Examples: // -// EXPECT_NE(5, Foo()); -// EXPECT_EQ(NULL, a_pointer); +// EXPECT_NE(Foo(), 5); +// EXPECT_EQ(a_pointer, NULL); // ASSERT_LT(i, array_size); // ASSERT_GT(records.size(), 0) << "There is no record left."; @@ -2101,6 +2142,57 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, #define EXPECT_NO_FATAL_FAILURE(statement) \ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) +// Causes a trace (including the given source file path and line number, +// and the given message) to be included in every test failure message generated +// by code in the scope of the lifetime of an instance of this class. The effect +// is undone with the destruction of the instance. +// +// The message argument can be anything streamable to std::ostream. +// +// Example: +// testing::ScopedTrace trace("file.cc", 123, "message"); +// +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + + // Template version. Uses Message() to convert the values into strings. + // Slow, but flexible. + template + ScopedTrace(const char* file, int line, const T& message) { + PushTrace(file, line, (Message() << message).GetString()); + } + + // Optimize for some known types. + ScopedTrace(const char* file, int line, const char* message) { + PushTrace(file, line, message ? 
message : "(null)"); + } + +#if GTEST_HAS_GLOBAL_STRING + ScopedTrace(const char* file, int line, const ::string& message) { + PushTrace(file, line, message); + } +#endif + + ScopedTrace(const char* file, int line, const std::string& message) { + PushTrace(file, line, message); + } + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + void PushTrace(const char* file, int line, std::string message); + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + // Causes a trace (including the source file path, the current line // number, and the given message) to be included in every test failure // message generated by code in the current scope. The effect is @@ -2112,9 +2204,14 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, // of the dummy variable name, thus allowing multiple SCOPED_TRACE()s // to appear in the same block - as long as they are on different // lines. +// +// Assuming that each thread maintains its own stack of traces. +// Therefore, a SCOPED_TRACE() would (correctly) only affect the +// assertions in its own thread. #define SCOPED_TRACE(message) \ - ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ - __FILE__, __LINE__, ::testing::Message() << (message)) + ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ + __FILE__, __LINE__, (message)) + // Compile-time assertion for type equality. // StaticAssertTypeEq() compiles iff type1 and type2 are @@ -2194,7 +2291,7 @@ bool StaticAssertTypeEq() { // name of the test within the test case. // // A test fixture class must be declared earlier. The user should put -// his test code between braces after using this macro. Example: +// the test code between braces after using this macro. Example: // // class FooTest : public testing::Test { // protected: @@ -2209,14 +2306,22 @@ bool StaticAssertTypeEq() { // } // // TEST_F(FooTest, ReturnsElementCountCorrectly) { -// EXPECT_EQ(0, a_.size()); -// EXPECT_EQ(1, b_.size()); +// EXPECT_EQ(a_.size(), 0); +// EXPECT_EQ(b_.size(), 1); // } #define TEST_F(test_fixture, test_name)\ GTEST_TEST_(test_fixture, test_name, test_fixture, \ ::testing::internal::GetTypeId()) +// Returns a path to temporary directory. +// Tries to determine an appropriate directory for the platform. +GTEST_API_ std::string TempDir(); + +#ifdef _MSC_VER +# pragma warning(pop) +#endif + } // namespace testing // Use this function in main() to run all tests. It returns 0 if all @@ -2233,4 +2338,6 @@ inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); } +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_GTEST_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h index 30ae712f50e..0c1105cb8eb 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h @@ -27,18 +27,19 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
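Since `ScopedTrace` is now a public class (moved out of `internal`, per the hunks above), a trace can be constructed directly instead of only through the `SCOPED_TRACE` macro. A small sketch with hypothetical test names:

```cpp
#include "gtest/gtest.h"

static void ExpectPositive(int value) { EXPECT_GT(value, 0); }

TEST(TraceDemo, AnnotatesFailuresPerIteration) {
  for (int i = 0; i < 3; ++i) {
    // Same effect as SCOPED_TRACE(i): every failure inside this scope is
    // annotated with this file/line and the streamed message, so the
    // output tells you which iteration failed (here, i == 0).
    testing::ScopedTrace trace(__FILE__, __LINE__, i);
    ExpectPositive(i);
  }
}
```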
-// This file is AUTOMATICALLY GENERATED on 10/31/2011 by command +// This file is AUTOMATICALLY GENERATED on 01/02/2018 by command // 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! // // Implements a family of generic predicate assertion macros. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ #define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ -// Makes sure this header is not included before gtest.h. -#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ -# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. -#endif // GTEST_INCLUDE_GTEST_GTEST_H_ +#include "gtest/gtest.h" + +namespace testing { // This header implements a family of generic predicate assertion // macros: @@ -66,8 +67,6 @@ // We also define the EXPECT_* variations. // // For now we only support predicates whose arity is at most 5. -// Please email googletestframework@googlegroups.com if you need -// support for higher arities. // GTEST_ASSERT_ is the basic statement to which all of the assertions // in this file reduce. Don't use this in your code. @@ -355,4 +354,6 @@ AssertionResult AssertPred5Helper(const char* pred_text, +} // namespace testing + #endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h index da80ddc6c70..e651671ebde 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h @@ -26,10 +26,10 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// Google C++ Testing Framework definitions useful in production code. +// Google C++ Testing and Mocking Framework definitions useful in production code. +// GOOGLETEST_CM0003 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ #define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ @@ -40,17 +40,20 @@ // // class MyClass { // private: -// void MyMethod(); -// FRIEND_TEST(MyClassTest, MyMethod); +// void PrivateMethod(); +// FRIEND_TEST(MyClassTest, PrivateMethodWorks); // }; // // class MyClassTest : public testing::Test { // // ... // }; // -// TEST_F(MyClassTest, MyMethod) { -// // Can call MyClass::MyMethod() here. +// TEST_F(MyClassTest, PrivateMethodWorks) { +// // Can call MyClass::PrivateMethod() here. // } +// +// Note: The test class must be in the same namespace as the class being tested. +// For example, putting MyClassTest in an anonymous namespace will not work. #define FRIEND_TEST(test_case_name, test_name)\ friend class test_case_name##_##test_name##_Test diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md new file mode 100644 index 00000000000..ff391fb4e2b --- /dev/null +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md @@ -0,0 +1,56 @@ +# Customization Points + +The custom directory is an injection point for custom user configurations. + +## Header `gtest.h` + +### The following macros can be defined: + +* `GTEST_OS_STACK_TRACE_GETTER_` - The name of an implementation of + `OsStackTraceGetterInterface`. +* `GTEST_CUSTOM_TEMPDIR_FUNCTION_` - An override for `testing::TempDir()`. 
See + `testing::TempDir` for semantics and signature. + +## Header `gtest-port.h` + +The following macros can be defined: + +### Flag related macros: + +* `GTEST_FLAG(flag_name)` +* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its + own flagfile flag parsing. +* `GTEST_DECLARE_bool_(name)` +* `GTEST_DECLARE_int32_(name)` +* `GTEST_DECLARE_string_(name)` +* `GTEST_DEFINE_bool_(name, default_val, doc)` +* `GTEST_DEFINE_int32_(name, default_val, doc)` +* `GTEST_DEFINE_string_(name, default_val, doc)` + +### Logging: + +* `GTEST_LOG_(severity)` +* `GTEST_CHECK_(condition)` +* Functions `LogToStderr()` and `FlushInfoLog()` have to be provided too. + +### Threading: + +* `GTEST_HAS_NOTIFICATION_` - Enabled if Notification is already provided. +* `GTEST_HAS_MUTEX_AND_THREAD_LOCAL_` - Enabled if `Mutex` and `ThreadLocal` + are already provided. Must also provide `GTEST_DECLARE_STATIC_MUTEX_(mutex)` + and `GTEST_DEFINE_STATIC_MUTEX_(mutex)` +* `GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)` +* `GTEST_LOCK_EXCLUDED_(locks)` + +### Underlying library support features + +* `GTEST_HAS_CXXABI_H_` + +### Exporting API symbols: + +* `GTEST_API_` - Specifier for exported symbols. + +## Header `gtest-printers.h` + +* See documentation at `gtest/gtest-printers.h` for details on how to define a + custom printer. diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h index 7e744bd3bb3..cd85d956d2d 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h @@ -27,39 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Injection point for custom user configurations. -// The following macros can be defined: -// -// Flag related macros: -// GTEST_FLAG(flag_name) -// GTEST_USE_OWN_FLAGFILE_FLAG_ - Define to 0 when the system provides its -// own flagfile flag parsing. -// GTEST_DECLARE_bool_(name) -// GTEST_DECLARE_int32_(name) -// GTEST_DECLARE_string_(name) -// GTEST_DEFINE_bool_(name, default_val, doc) -// GTEST_DEFINE_int32_(name, default_val, doc) -// GTEST_DEFINE_string_(name, default_val, doc) -// -// Test filtering: -// GTEST_TEST_FILTER_ENV_VAR_ - The name of an environment variable that -// will be used if --GTEST_FLAG(test_filter) -// is not provided. -// -// Logging: -// GTEST_LOG_(severity) -// GTEST_CHECK_(condition) -// Functions LogToStderr() and FlushInfoLog() have to be provided too. -// -// Threading: -// GTEST_HAS_NOTIFICATION_ - Enabled if Notification is already provided. -// GTEST_HAS_MUTEX_AND_THREAD_LOCAL_ - Enabled if Mutex and ThreadLocal are -// already provided. -// Must also provide GTEST_DECLARE_STATIC_MUTEX_(mutex) and -// GTEST_DEFINE_STATIC_MUTEX_(mutex) -// -// GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks) -// GTEST_LOCK_EXCLUDED_(locks) +// Injection point for custom user configurations. 
See README for details // // ** Custom implementation starts here ** diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h index 60c1ea050b6..eb4467abcab 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h @@ -31,8 +31,8 @@ // installation of gTest. // It will be included from gtest-printers.h and the overrides in this file // will be visible to everyone. -// See documentation at gtest/gtest-printers.h for details on how to define a -// custom printer. +// +// Injection point for custom user configurations. See README for details // // ** Custom implementation starts here ** diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h index c27412a8981..4c8e07be23f 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h @@ -27,11 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Injection point for custom user configurations. -// The following macros can be defined: -// -// GTEST_OS_STACK_TRACE_GETTER_ - The name of an implementation of -// OsStackTraceGetterInterface. +// Injection point for custom user configurations. See README for details // // ** Custom implementation starts here ** diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h index 2b3a78f5bf8..0a9b42c8a57 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h @@ -27,12 +27,11 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines internal utilities needed for implementing // death tests. They are subject to change without notice. +// GOOGLETEST_CM0001 DO NOT DELETE #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ @@ -53,6 +52,9 @@ const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; #if GTEST_HAS_DEATH_TEST +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // DeathTest is a class that hides much of the complexity of the // GTEST_DEATH_TEST_ macro. It is abstract; its static Create method // returns a concrete class that depends on the prevailing death test @@ -136,6 +138,8 @@ class GTEST_API_ DeathTest { GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); }; +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + // Factory interface for death tests. May be mocked out for testing. 
 class DeathTestFactory {
  public:
@@ -218,14 +222,18 @@ GTEST_API_ bool ExitedUnsuccessfully(int exit_status);
 // can be streamed.
 
 // This macro is for implementing ASSERT/EXPECT_DEBUG_DEATH when compiled in
-// NDEBUG mode. In this case we need the statements to be executed, the regex is
-// ignored, and the macro must accept a streamed message even though the message
-// is never printed.
-# define GTEST_EXECUTE_STATEMENT_(statement, regex) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::AlwaysTrue()) { \
-    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-  } else \
+// NDEBUG mode. In this case we need the statements to be executed and the macro
+// must accept a streamed message even though the message is never printed.
+// The regex object is not evaluated, but it is used to prevent "unused"
+// warnings and to avoid an expression that doesn't compile in debug mode.
+#define GTEST_EXECUTE_STATEMENT_(statement, regex)                \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                   \
+  if (::testing::internal::AlwaysTrue()) {                        \
+    GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);    \
+  } else if (!::testing::internal::AlwaysTrue()) {                \
+    const ::testing::internal::RE& gtest_regex = (regex);         \
+    static_cast<void>(gtest_regex);                               \
+  } else                                                          \
     ::testing::Message()
 
 // A class representing the parsed contents of the
@@ -264,53 +272,6 @@ class InternalRunDeathTestFlag {
 // the flag is specified; otherwise returns NULL.
 InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag();
 
-#else  // GTEST_HAS_DEATH_TEST
-
-// This macro is used for implementing macros such as
-// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where
-// death tests are not supported. Those macros must compile on such systems
-// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on
-// systems that support death tests. This allows one to write such a macro
-// on a system that does not support death tests and be sure that it will
-// compile on a death-test supporting system.
-//
-// Parameters:
-//   statement -  A statement that a macro such as EXPECT_DEATH would test
-//                for program termination. This macro has to make sure this
-//                statement is compiled but not executed, to ensure that
-//                EXPECT_DEATH_IF_SUPPORTED compiles with a certain
-//                parameter iff EXPECT_DEATH compiles with it.
-//   regex     -  A regex that a macro such as EXPECT_DEATH would use to test
-//                the output of statement.  This parameter has to be
-//                compiled but not evaluated by this macro, to ensure that
-//                this macro only accepts expressions that a macro such as
-//                EXPECT_DEATH would accept.
-//   terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED
-//                and a return statement for ASSERT_DEATH_IF_SUPPORTED.
-//                This ensures that ASSERT_DEATH_IF_SUPPORTED will not
-//                compile inside functions where ASSERT_DEATH doesn't
-//                compile.
-//
-// The branch that has an always false condition is used to ensure that
-// statement and regex are compiled (and thus syntactically correct) but
-// never executed. The unreachable code macro protects the terminator
-// statement from generating an 'unreachable code' warning in case
-// statement unconditionally returns or throws. The Message constructor at
-// the end allows the syntax of streaming additional messages into the
-// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
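To make the `GTEST_EXECUTE_STATEMENT_` rewrite above concrete (the removed fallback macro itself follows): this macro is the NDEBUG-mode expansion of `EXPECT_DEBUG_DEATH`, so a release build now references the regex, type-checking it, without ever matching it. A hedged sketch of the user-facing behavior, assuming a platform with death-test support; the regex is approximate because the exact `assert` output format is implementation-defined:

```cpp
#include <cassert>
#include "gtest/gtest.h"

static int global_counter = 0;

static void Allocate(int n) {
  assert(n > 0 && "n must be positive");  // compiled out under NDEBUG
  ++global_counter;
}

TEST(DeathDemo, DebugOnlyDeath) {
  // Debug build: Allocate(-1) must abort with stderr matching the regex.
  // NDEBUG build: expands to GTEST_EXECUTE_STATEMENT_, so the call simply
  // runs (incrementing global_counter) and the regex is never evaluated.
  EXPECT_DEBUG_DEATH(Allocate(-1), "must be positive");
}
```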
-# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ - GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ - if (::testing::internal::AlwaysTrue()) { \ - GTEST_LOG_(WARNING) \ - << "Death tests are not supported on this platform.\n" \ - << "Statement '" #statement "' cannot be verified."; \ - } else if (::testing::internal::AlwaysFalse()) { \ - ::testing::internal::RE::PartialMatch(".*", (regex)); \ - GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ - terminator; \ - } else \ - ::testing::Message() - #endif // GTEST_HAS_DEATH_TEST } // namespace internal diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h index 7a13b4b0de6..ae38d95bf84 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h @@ -27,21 +27,24 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Author: keith.ray@gmail.com (Keith Ray) -// // Google Test filepath utilities // // This header file declares classes and functions used internally by // Google Test. They are subject to change without notice. // -// This file is #included in . +// This file is #included in gtest/internal/gtest-internal.h. // Do not include this header file separately! +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ #include "gtest/internal/gtest-string.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { namespace internal { @@ -203,4 +206,6 @@ class GTEST_API_ FilePath { } // namespace internal } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h index ebd1cf615de..b762f61fc53 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h @@ -27,13 +27,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares functions and macros used internally by // Google Test. They are subject to change without notice. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ @@ -61,8 +61,8 @@ #include #include "gtest/gtest-message.h" -#include "gtest/internal/gtest-string.h" #include "gtest/internal/gtest-filepath.h" +#include "gtest/internal/gtest-string.h" #include "gtest/internal/gtest-type-util.h" // Due to C++ preprocessor weirdness, we need double indirection to @@ -76,6 +76,9 @@ #define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) #define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar +// Stringifies its argument. 
+#define GTEST_STRINGIFY_(name) #name + class ProtocolMessage; namespace proto2 { class Message; } @@ -96,7 +99,6 @@ ::std::string PrintToString(const T& value); namespace internal { struct TraceInfo; // Information about a trace point. -class ScopedTrace; // Implements scoped trace. class TestInfoImpl; // Opaque implementation of TestInfo class UnitTestImpl; // Opaque implementation of UnitTest @@ -139,6 +141,9 @@ GTEST_API_ std::string AppendUserMessage( #if GTEST_HAS_EXCEPTIONS +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \ +/* an exported class was derived from a class that was not exported */) + // This exception is thrown by (and only by) a failed Google Test // assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions // are enabled). We derive it from std::runtime_error, which is for @@ -150,32 +155,15 @@ class GTEST_API_ GoogleTestFailureException : public ::std::runtime_error { explicit GoogleTestFailureException(const TestPartResult& failure); }; -#endif // GTEST_HAS_EXCEPTIONS +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4275 -// A helper class for creating scoped traces in user programs. -class GTEST_API_ ScopedTrace { - public: - // The c'tor pushes the given source file location and message onto - // a trace stack maintained by Google Test. - ScopedTrace(const char* file, int line, const Message& message); - - // The d'tor pops the info pushed by the c'tor. - // - // Note that the d'tor is not virtual in order to be efficient. - // Don't inherit from ScopedTrace! - ~ScopedTrace(); - - private: - GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); -} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its - // c'tor and d'tor. Therefore it doesn't - // need to be used otherwise. +#endif // GTEST_HAS_EXCEPTIONS namespace edit_distance { // Returns the optimal edits to go from 'left' to 'right'. // All edits cost the same, with replace having lower priority than // add/remove. -// Simple implementation of the Wagner–Fischer algorithm. +// Simple implementation of the Wagner-Fischer algorithm. // See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm enum EditType { kMatch, kAdd, kRemove, kReplace }; GTEST_API_ std::vector CalculateOptimalEdits( @@ -502,9 +490,10 @@ typedef void (*SetUpTestCaseFunc)(); typedef void (*TearDownTestCaseFunc)(); struct CodeLocation { - CodeLocation(const string& a_file, int a_line) : file(a_file), line(a_line) {} + CodeLocation(const std::string& a_file, int a_line) + : file(a_file), line(a_line) {} - string file; + std::string file; int line; }; @@ -544,6 +533,9 @@ GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); #if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + // State of the definition of a type-parameterized test case. class GTEST_API_ TypedTestCasePState { public: @@ -589,6 +581,8 @@ class GTEST_API_ TypedTestCasePState { RegisteredTestsMap registered_tests_; }; +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + // Skips to the first non-space char after the first comma in 'str'; // returns NULL if no comma is found in 'str'. inline const char* SkipComma(const char* str) { @@ -612,6 +606,37 @@ inline std::string GetPrefixUntilComma(const char* str) { void SplitString(const ::std::string& str, char delimiter, ::std::vector< ::std::string>* dest); +// The default argument to the template below for the case when the user does +// not provide a name generator. 
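Before the machinery itself, a standalone sketch of the compile-time recursion pattern that the `GenerateNamesRecursively` overloads below rely on: peel `Head` off a typelist, recurse on `Tail`, and stop at the empty list through overload resolution. The typelist and names here are simplified stand-ins, not gtest's real `Types0`/`Types<...>` from gtest-type-util.h:

```cpp
#include <iostream>
#include <string>
#include <typeinfo>
#include <vector>

struct Nil {};  // empty typelist, playing the role of gtest's Types0
template <typename H, typename T>
struct Cons {   // a typelist node: a Head type plus the Tail list
  typedef H Head;
  typedef T Tail;
};

// A toy generator in the shape the machinery expects: GetName<T>(index).
struct TypeIdNames {
  template <typename T>
  static std::string GetName(int /*index*/) { return typeid(T).name(); }
};

template <typename NameGen>
void Walk(Nil, std::vector<std::string>*, int) {}  // base case: stop

template <typename NameGen, typename List>
void Walk(List, std::vector<std::string>* out, int i) {
  out->push_back(NameGen::template GetName<typename List::Head>(i));
  Walk<NameGen>(typename List::Tail(), out, i + 1);  // recurse on the tail
}

int main() {
  std::vector<std::string> names;
  Walk<TypeIdNames>(Cons<int, Cons<double, Nil> >(), &names, 0);
  for (size_t i = 0; i < names.size(); ++i) std::cout << names[i] << "\n";
  return 0;
}
```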
+struct DefaultNameGenerator {
+  template <typename T>
+  static std::string GetName(int i) {
+    return StreamableToString(i);
+  }
+};
+
+template <typename Provided = DefaultNameGenerator>
+struct NameGeneratorSelector {
+  typedef Provided type;
+};
+
+template <typename NameGenerator>
+void GenerateNamesRecursively(Types0, std::vector<std::string>*, int) {}
+
+template <typename NameGenerator, typename Types>
+void GenerateNamesRecursively(Types, std::vector<std::string>* result, int i) {
+  result->push_back(NameGenerator::template GetName<typename Types::Head>(i));
+  GenerateNamesRecursively<NameGenerator>(typename Types::Tail(), result,
+                                          i + 1);
+}
+
+template <typename NameGenerator, typename Types>
+std::vector<std::string> GenerateNames() {
+  std::vector<std::string> result;
+  GenerateNamesRecursively<NameGenerator>(Types(), &result, 0);
+  return result;
+}
+
 // TypeParameterizedTest<Fixture, TestSel, Types>::Register()
 // registers a list of type-parameterized tests with Google Test.  The
 // return value is insignificant - we just need to return something
@@ -626,10 +651,10 @@ class TypeParameterizedTest {
   // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase,
   // Types).  Valid values for 'index' are [0, N - 1] where N is the
   // length of Types.
-  static bool Register(const char* prefix,
-                       CodeLocation code_location,
-                       const char* case_name, const char* test_names,
-                       int index) {
+  static bool Register(const char* prefix, const CodeLocation& code_location,
+                       const char* case_name, const char* test_names, int index,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
     typedef typename Types::Head Type;
     typedef Fixture<Type> FixtureClass;
     typedef typename GTEST_BIND_(TestSel, Type) TestClass;
@@ -637,20 +662,23 @@ class TypeParameterizedTest {
     // First, registers the first type-parameterized test in the type
     // list.
     MakeAndRegisterTestInfo(
-        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name + "/"
-         + StreamableToString(index)).c_str(),
+        (std::string(prefix) + (prefix[0] == '\0' ? "" : "/") + case_name +
+         "/" + type_names[index])
+            .c_str(),
         StripTrailingSpaces(GetPrefixUntilComma(test_names)).c_str(),
         GetTypeName<Type>().c_str(),
        NULL,  // No value parameter.
-        code_location,
-        GetTypeId<FixtureClass>(),
-        TestClass::SetUpTestCase,
-        TestClass::TearDownTestCase,
-        new TestFactoryImpl<TestClass>);
+        code_location, GetTypeId<FixtureClass>(), TestClass::SetUpTestCase,
+        TestClass::TearDownTestCase, new TestFactoryImpl<TestClass>);
 
     // Next, recurses (at compile time) with the tail of the type list.
-    return TypeParameterizedTest<Fixture, TestSel, typename Types::Tail>
-        ::Register(prefix, code_location, case_name, test_names, index + 1);
+    return TypeParameterizedTest<Fixture, TestSel,
+                                 typename Types::Tail>::Register(prefix,
+                                                                 code_location,
+                                                                 case_name,
+                                                                 test_names,
+                                                                 index + 1,
+                                                                 type_names);
   }
 };
 
@@ -658,9 +686,11 @@ class TypeParameterizedTest {
 template <GTEST_TEMPLATE_ Fixture, class TestSel>
 class TypeParameterizedTest<Fixture, TestSel, Types0> {
  public:
-  static bool Register(const char* /*prefix*/, CodeLocation,
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
                        const char* /*case_name*/, const char* /*test_names*/,
-                       int /*index*/) {
+                       int /*index*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
@@ -673,8 +703,10 @@
 template <GTEST_TEMPLATE_ Fixture, typename Tests, typename Types>
 class TypeParameterizedTestCase {
  public:
   static bool Register(const char* prefix, CodeLocation code_location,
-                       const TypedTestCasePState* state,
-                       const char* case_name, const char* test_names) {
+                       const TypedTestCasePState* state, const char* case_name,
+                       const char* test_names,
+                       const std::vector<std::string>& type_names =
+                           GenerateNames<DefaultNameGenerator, Types>()) {
     std::string test_name = StripTrailingSpaces(
         GetPrefixUntilComma(test_names));
     if (!state->TestExists(test_name)) {
@@ -691,12 +723,14 @@ class TypeParameterizedTestCase {
     // First, register the first test in 'Test' for each type in 'Types'.
     TypeParameterizedTest<Fixture, typename Tests::Head, Types>::Register(
-        prefix, test_location, case_name, test_names, 0);
+        prefix, test_location, case_name, test_names, 0, type_names);
 
     // Next, recurses (at compile time) with the tail of the test list.
-    return TypeParameterizedTestCase<Fixture, typename Tests::Tail, Types>
-        ::Register(prefix, code_location, state,
-                   case_name, SkipComma(test_names));
+    return TypeParameterizedTestCase<Fixture, typename Tests::Tail,
+                                     Types>::Register(prefix, code_location,
+                                                      state, case_name,
+                                                      SkipComma(test_names),
+                                                      type_names);
   }
 };
 
@@ -704,9 +738,11 @@ class TypeParameterizedTestCase {
 template <GTEST_TEMPLATE_ Fixture, typename Types>
 class TypeParameterizedTestCase<Fixture, Templates0, Types> {
  public:
-  static bool Register(const char* /*prefix*/, CodeLocation,
+  static bool Register(const char* /*prefix*/, const CodeLocation&,
                        const TypedTestCasePState* /*state*/,
-                       const char* /*case_name*/, const char* /*test_names*/) {
+                       const char* /*case_name*/, const char* /*test_names*/,
+                       const std::vector<std::string>& =
+                           std::vector<std::string>() /*type_names*/) {
     return true;
   }
 };
@@ -823,31 +859,6 @@ struct RemoveConst {
 #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
     GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T))
 
-// Adds reference to a type if it is not a reference type,
-// otherwise leaves it unchanged.  This is the same as
-// tr1::add_reference, which is not widely available yet.
-template <typename T>
-struct AddReference { typedef T& type; };  // NOLINT
-template <typename T>
-struct AddReference<T&> { typedef T& type; };  // NOLINT
-
-// A handy wrapper around AddReference that works when the argument T
-// depends on template parameters.
-#define GTEST_ADD_REFERENCE_(T) \
-    typename ::testing::internal::AddReference<T>::type
-
-// Adds a reference to const on top of T as necessary.  For example,
-// it transforms
-//
-//   char         ==> const char&
-//   const char   ==> const char&
-//   char&        ==> const char&
-//   const char&  ==> const char&
-//
-// The argument T must depend on some template parameters.
-#define GTEST_REFERENCE_TO_CONST_(T) \
-    GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T))
-
 // ImplicitlyConvertible<From, To>::value is a compile-time bool
 // constant that's true iff type From can be implicitly converted to
 // type To.
@@ -917,8 +928,11 @@ struct IsAProtocolMessage
 // a container class by checking the type of IsContainerTest<C>(0).
 // The value of the expression is insignificant.
 //
-// Note that we look for both C::iterator and C::const_iterator.  The
-// reason is that C++ injects the name of a class as a member of the
+// In C++11 mode we check the existence of a const_iterator and that an
+// iterator is properly implemented for the container.
+//
+// For pre-C++11 that we look for both C::iterator and C::const_iterator.
+// The reason is that C++ injects the name of a class as a member of the
 // class itself (e.g. you can refer to class iterator as either
 // 'iterator' or 'iterator::iterator').  If we look for C::iterator
 // only, for example, we would mistakenly think that a class named
 // iterator is an STL container.
 //
 // Also note that the simpler approach of overloading
 // IsContainerTest(typename C::const_iterator*) and
 // IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++.
 typedef int IsContainer;
+#if GTEST_LANG_CXX11
+template <class C,
+          class Iterator = decltype(::std::declval<const C&>().begin()),
+          class = decltype(::std::declval<const C&>().end()),
+          class = decltype(++::std::declval<Iterator&>()),
+          class = decltype(*::std::declval<Iterator>()),
+          class = typename C::const_iterator>
+IsContainer IsContainerTest(int /* dummy */) {
+  return 0;
+}
+#else
 template <class C>
 IsContainer IsContainerTest(int /* dummy */,
                             typename C::iterator* /* it */ = NULL,
                             typename C::const_iterator* /* const_it */ = NULL) {
   return 0;
 }
+#endif  // GTEST_LANG_CXX11
 
 typedef char IsNotContainer;
 template <class C>
 IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
 
+// Trait to detect whether a type T is a hash table.
+// The heuristic used is that the type contains an inner type `hasher` and does
+// not contain an inner type `reverse_iterator`.
+// If the container is iterable in reverse, then order might actually matter.
+template <typename T>
+struct IsHashTable {
+ private:
+  template <typename U>
+  static char test(typename U::hasher*, typename U::reverse_iterator*);
+  template <typename U>
+  static int test(typename U::hasher*, ...);
+  template <typename U>
+  static char test(...);
+
+ public:
+  static const bool value = sizeof(test<T>(0, 0)) == sizeof(int);
+};
+
+template <typename T>
+const bool IsHashTable<T>::value;
+
+template <typename T>
+struct VoidT {
+  typedef void value_type;
+};
+
+template <typename T, typename = void>
+struct HasValueType : false_type {};
+template <typename T>
+struct HasValueType<T, VoidT<typename T::value_type> > : true_type {
+};
+
+template <typename C,
+          bool = sizeof(IsContainerTest<C>(0)) == sizeof(IsContainer),
+          bool = HasValueType<C>::value>
+struct IsRecursiveContainerImpl;
+
+template <typename C, bool HV>
+struct IsRecursiveContainerImpl<C, false, HV> : public false_type {};
+
+// Since the IsRecursiveContainerImpl depends on the IsContainerTest we need to
+// obey the same inconsistencies as the IsContainerTest, namely check if
+// something is a container is relying on only const_iterator in C++11 and
+// is relying on both const_iterator and iterator otherwise
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, false> : public false_type {};
+
+template <typename C>
+struct IsRecursiveContainerImpl<C, true, true> {
+  #if GTEST_LANG_CXX11
+  typedef typename IteratorTraits<typename C::const_iterator>::value_type
+      value_type;
+#else
+  typedef typename IteratorTraits<typename C::iterator>::value_type value_type;
+#endif
+  typedef is_same<value_type, C> type;
+};
+
+// IsRecursiveContainer<Type> is a unary compile-time predicate that
+// evaluates whether C is a recursive container type. A recursive container
+// type is a container type whose value_type is equal to the container type
+// itself. An example for a recursive container type is
+// boost::filesystem::path, whose iterator has a value_type that is equal to
+// boost::filesystem::path.
+template <typename C>
+struct IsRecursiveContainer : public IsRecursiveContainerImpl<C>::type {};
+
 // EnableIf<condition>::type is void when 'Cond' is true, and
 // undefined when 'Cond' is false.  To use SFINAE to make a function
 // overload only apply when a particular expression is true, add
@@ -1070,7 +1163,7 @@ class NativeArray {
  private:
   enum {
     kCheckTypeIsNotConstOrAReference = StaticAssertTypeEqHelper<
-        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value,
+        Element, GTEST_REMOVE_REFERENCE_AND_CONST_(Element)>::value
   };
 
   // Initializes this object with a copy of the input.
@@ -1115,7 +1208,7 @@ class NativeArray {
 #define GTEST_SUCCESS_(message) \
   GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess)
 
-// Suppresses MSVC warnings 4072 (unreachable code) for the code following
+// Suppress MSVC warning 4702 (unreachable code) for the code following
 // statement if it returns or throws (or doesn't return or throw in some
 // situations).
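A self-contained illustration of the hasher/`reverse_iterator` heuristic behind the `IsHashTable` trait added above; `IsHashTableSketch` is a stand-in name, not gtest's trait, and C++11 is assumed for `static_assert` and `<unordered_map>`:

```cpp
#include <map>
#include <unordered_map>

template <typename T>
struct IsHashTableSketch {
 private:
  // Best match when both inner types exist: T is reverse-iterable, so it is
  // not treated as a hash table (ordering may be meaningful).
  template <typename U>
  static char Test(typename U::hasher*, typename U::reverse_iterator*);
  // Chosen when only U::hasher exists: an unordered container.
  template <typename U>
  static int Test(typename U::hasher*, ...);
  // Fallback when there is no U::hasher at all.
  template <typename U>
  static char Test(...);

 public:
  static const bool value = sizeof(Test<T>(0, 0)) == sizeof(int);
};

static_assert(IsHashTableSketch<std::unordered_map<int, int> >::value,
              "has a hasher and no reverse_iterator");
static_assert(!IsHashTableSketch<std::map<int, int> >::value,
              "reverse-iterable, so treated as ordered");
```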
#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ @@ -1235,4 +1328,3 @@ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ - diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h index 36029422174..082b87289ae 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-linked_ptr.h @@ -27,8 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: Dan Egnor (egnor@google.com) -// // A "smart" pointer type with reference tracking. Every pointer to a // particular object is kept on a circular linked list. When the last pointer // to an object is destroyed or reassigned, the object is deleted. @@ -62,9 +60,11 @@ // raw pointer (e.g. via get()) concurrently, and // - it's safe to write to two linked_ptrs that point to the same // shared object concurrently. -// TODO(wan@google.com): rename this to safe_linked_ptr to avoid +// FIXME: rename this to safe_linked_ptr to avoid // confusion with normal linked_ptr. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h index 4d1d81d20ff..4fac8c02703 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h @@ -30,8 +30,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) + // Type and function utilities for implementing parameterized tests. // This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -43,17 +42,14 @@ // by the maximum arity of the implementation of tuple which is // currently set at 10. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-port.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Forward declarations of ValuesIn(), which is implemented in @@ -84,6 +80,8 @@ class ValueArray1 { return ValuesIn(array); } + ValueArray1(const ValueArray1& other) : v1_(other.v1_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray1& other); @@ -102,6 +100,8 @@ class ValueArray2 { return ValuesIn(array); } + ValueArray2(const ValueArray2& other) : v1_(other.v1_), v2_(other.v2_) {} + private: // No implementation - assignment is unsupported. 
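The `ValueArrayN` classes gaining explicit copy constructors here (and in the analogous hunks that follow) are what `testing::Values()` returns; the object is copied while the instantiation machinery turns it into a parameter generator, which is presumably why the member-wise copies are now spelled out next to the private, unimplemented `operator=`. A usage-level sketch with hypothetical names:

```cpp
#include "gtest/gtest.h"

class BufferSizeTest : public testing::TestWithParam<int> {};

TEST_P(BufferSizeTest, SizeIsPositive) {
  EXPECT_GT(GetParam(), 0);
}

// testing::Values(4, 16, 64) builds a ValueArray3<int, int, int>; each test
// runs once per value, reported as Sizes/BufferSizeTest.SizeIsPositive/0..2.
INSTANTIATE_TEST_CASE_P(Sizes, BufferSizeTest, testing::Values(4, 16, 64));
```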
void operator=(const ValueArray2& other); @@ -122,6 +122,9 @@ class ValueArray3 { return ValuesIn(array); } + ValueArray3(const ValueArray3& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray3& other); @@ -144,6 +147,9 @@ class ValueArray4 { return ValuesIn(array); } + ValueArray4(const ValueArray4& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray4& other); @@ -167,6 +173,9 @@ class ValueArray5 { return ValuesIn(array); } + ValueArray5(const ValueArray5& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray5& other); @@ -193,6 +202,9 @@ class ValueArray6 { return ValuesIn(array); } + ValueArray6(const ValueArray6& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray6& other); @@ -220,6 +232,10 @@ class ValueArray7 { return ValuesIn(array); } + ValueArray7(const ValueArray7& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray7& other); @@ -249,6 +265,10 @@ class ValueArray8 { return ValuesIn(array); } + ValueArray8(const ValueArray8& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray8& other); @@ -280,6 +300,10 @@ class ValueArray9 { return ValuesIn(array); } + ValueArray9(const ValueArray9& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray9& other); @@ -312,6 +336,10 @@ class ValueArray10 { return ValuesIn(array); } + ValueArray10(const ValueArray10& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray10& other); @@ -346,6 +374,11 @@ class ValueArray11 { return ValuesIn(array); } + ValueArray11(const ValueArray11& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray11& other); @@ -382,6 +415,11 @@ class ValueArray12 { return ValuesIn(array); } + ValueArray12(const ValueArray12& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray12& other); @@ -420,6 +458,11 @@ class ValueArray13 { return ValuesIn(array); } + ValueArray13(const ValueArray13& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray13& other); @@ -459,6 +502,11 @@ class ValueArray14 { return ValuesIn(array); } + ValueArray14(const ValueArray14& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray14& other); @@ -500,6 +548,12 @@ class ValueArray15 { return ValuesIn(array); } + ValueArray15(const ValueArray15& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray15& other); @@ -544,6 +598,12 @@ class ValueArray16 { return ValuesIn(array); } + ValueArray16(const ValueArray16& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray16& other); @@ -589,6 +649,12 @@ class ValueArray17 { return ValuesIn(array); } + ValueArray17(const ValueArray17& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray17& other); @@ -636,6 +702,12 @@ class ValueArray18 { return ValuesIn(array); } + ValueArray18(const ValueArray18& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray18& other); @@ -684,6 +756,13 @@ class ValueArray19 { return ValuesIn(array); } + ValueArray19(const ValueArray19& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray19& other); @@ -734,6 +813,13 @@ class ValueArray20 { return ValuesIn(array); } + ValueArray20(const ValueArray20& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray20& other); @@ -787,6 +873,13 @@ class ValueArray21 { return ValuesIn(array); } + ValueArray21(const ValueArray21& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray21& other); @@ -841,6 +934,13 @@ class ValueArray22 { return ValuesIn(array); } + ValueArray22(const ValueArray22& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray22& other); @@ -897,6 +997,14 @@ class ValueArray23 { return ValuesIn(array); } + ValueArray23(const ValueArray23& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray23& other); @@ -955,6 +1063,14 @@ class ValueArray24 { return ValuesIn(array); } + ValueArray24(const ValueArray24& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray24& other); @@ -1014,6 +1130,14 @@ class ValueArray25 { return ValuesIn(array); } + ValueArray25(const ValueArray25& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray25& other); @@ -1075,6 +1199,14 @@ class ValueArray26 { return ValuesIn(array); } + ValueArray26(const ValueArray26& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray26& other); @@ -1139,6 +1271,15 @@ class ValueArray27 { return ValuesIn(array); } + ValueArray27(const ValueArray27& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray27& other); @@ -1204,6 +1345,15 @@ class ValueArray28 { return ValuesIn(array); } + ValueArray28(const ValueArray28& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray28& other); @@ -1270,6 +1420,15 @@ class ValueArray29 { return ValuesIn(array); } + ValueArray29(const ValueArray29& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray29& other); @@ -1339,6 +1498,15 @@ class ValueArray30 { return ValuesIn(array); } + ValueArray30(const ValueArray30& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray30& other); @@ -1410,6 +1578,16 @@ class ValueArray31 { return ValuesIn(array); } + ValueArray31(const ValueArray31& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray31& other); @@ -1482,6 +1660,16 @@ class ValueArray32 { return ValuesIn(array); } + ValueArray32(const ValueArray32& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray32& other); @@ -1557,6 +1745,16 @@ class ValueArray33 { return ValuesIn(array); } + ValueArray33(const ValueArray33& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray33& other); @@ -1633,6 +1831,16 @@ class ValueArray34 { return ValuesIn(array); } + ValueArray34(const ValueArray34& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray34& other); @@ -1710,6 +1918,17 @@ class ValueArray35 { return ValuesIn(array); } + ValueArray35(const ValueArray35& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray35& other); @@ -1790,6 +2009,17 @@ class ValueArray36 { return ValuesIn(array); } + ValueArray36(const ValueArray36& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray36& other); @@ -1872,6 +2102,17 @@ class ValueArray37 { return ValuesIn(array); } + ValueArray37(const ValueArray37& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray37& other); @@ -1955,6 +2196,17 @@ class ValueArray38 { return ValuesIn(array); } + ValueArray38(const ValueArray38& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray38& other); @@ -2040,6 +2292,18 @@ class ValueArray39 { return ValuesIn(array); } + ValueArray39(const ValueArray39& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray39& other); @@ -2127,6 +2391,18 @@ class ValueArray40 { return ValuesIn(array); } + ValueArray40(const ValueArray40& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray40& other); @@ -2216,6 +2492,18 @@ class ValueArray41 { return ValuesIn(array); } + ValueArray41(const ValueArray41& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray41& other); @@ -2307,6 +2595,18 @@ class ValueArray42 { return ValuesIn(array); } + ValueArray42(const ValueArray42& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray42& other); @@ -2399,6 +2699,19 @@ class ValueArray43 { return ValuesIn(array); } + ValueArray43(const ValueArray43& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray43& other); @@ -2493,6 +2806,19 @@ class ValueArray44 { return ValuesIn(array); } + ValueArray44(const ValueArray44& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray44& other); @@ -2589,6 +2915,19 @@ class ValueArray45 { return ValuesIn(array); } + ValueArray45(const ValueArray45& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray45& other); @@ -2687,6 +3026,19 @@ class ValueArray46 { return ValuesIn(array); } + ValueArray46(const ValueArray46& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray46& other); @@ -2787,6 +3139,20 @@ class ValueArray47 { return ValuesIn(array); } + ValueArray47(const ValueArray47& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray47& other); @@ -2889,6 +3255,20 @@ class ValueArray48 { return ValuesIn(array); } + ValueArray48(const ValueArray48& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_), v48_(other.v48_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray48& other); @@ -2992,6 +3372,20 @@ class ValueArray49 { return ValuesIn(array); } + ValueArray49(const ValueArray49& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_), v48_(other.v48_), v49_(other.v49_) {} + private: // No implementation - assignment is unsupported. 
void operator=(const ValueArray49& other); @@ -3096,6 +3490,20 @@ class ValueArray50 { return ValuesIn(array); } + ValueArray50(const ValueArray50& other) : v1_(other.v1_), v2_(other.v2_), + v3_(other.v3_), v4_(other.v4_), v5_(other.v5_), v6_(other.v6_), + v7_(other.v7_), v8_(other.v8_), v9_(other.v9_), v10_(other.v10_), + v11_(other.v11_), v12_(other.v12_), v13_(other.v13_), v14_(other.v14_), + v15_(other.v15_), v16_(other.v16_), v17_(other.v17_), v18_(other.v18_), + v19_(other.v19_), v20_(other.v20_), v21_(other.v21_), v22_(other.v22_), + v23_(other.v23_), v24_(other.v24_), v25_(other.v25_), v26_(other.v26_), + v27_(other.v27_), v28_(other.v28_), v29_(other.v29_), v30_(other.v30_), + v31_(other.v31_), v32_(other.v32_), v33_(other.v33_), v34_(other.v34_), + v35_(other.v35_), v36_(other.v36_), v37_(other.v37_), v38_(other.v38_), + v39_(other.v39_), v40_(other.v40_), v41_(other.v41_), v42_(other.v42_), + v43_(other.v43_), v44_(other.v44_), v45_(other.v45_), v46_(other.v46_), + v47_(other.v47_), v48_(other.v48_), v49_(other.v49_), v50_(other.v50_) {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray50& other); @@ -3208,7 +3616,7 @@ class CartesianProductGenerator2 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -3240,7 +3648,7 @@ class CartesianProductGenerator2 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_); + current_value_.reset(new ParamType(*current1_, *current2_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3262,7 +3670,7 @@ class CartesianProductGenerator2 const typename ParamGenerator::iterator begin2_; const typename ParamGenerator::iterator end2_; typename ParamGenerator::iterator current2_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator2::Iterator // No implementation - assignment is unsupported. @@ -3331,7 +3739,7 @@ class CartesianProductGenerator3 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -3367,7 +3775,7 @@ class CartesianProductGenerator3 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3393,7 +3801,7 @@ class CartesianProductGenerator3 const typename ParamGenerator::iterator begin3_; const typename ParamGenerator::iterator end3_; typename ParamGenerator::iterator current3_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator3::Iterator // No implementation - assignment is unsupported.
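The ValueArray13 through ValueArray50 hunks above all add the same member: an explicit member-wise copy constructor. The diff does not state the motivation, but a plausible one is that each class already declares a private, unimplemented operator=, and a user-declared copy-assignment operator makes the implicitly generated copy constructor deprecated in newer C++ dialects. A minimal sketch of the pattern, with a hypothetical two-member class standing in for the generated fifty-member ones:

// Toy reduction of the generated ValueArrayN pattern. ValueArrayLike is a
// hypothetical name; only the shape of the change mirrors the diff.
template <typename T1, typename T2>
class ValueArrayLike {
 public:
  ValueArrayLike(T1 v1, T2 v2) : v1_(v1), v2_(v2) {}

  // The explicit member-wise copy constructor the hunks add.
  ValueArrayLike(const ValueArrayLike& other)
      : v1_(other.v1_), v2_(other.v2_) {}

 private:
  // No implementation - assignment is unsupported. (This user-declared
  // operator= is what deprecates the implicit copy constructor.)
  void operator=(const ValueArrayLike& other);

  const T1 v1_;
  const T2 v2_;
};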
@@ -3472,7 +3880,7 @@ class CartesianProductGenerator4 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -3512,8 +3920,8 @@ class CartesianProductGenerator4 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3543,7 +3951,7 @@ class CartesianProductGenerator4 const typename ParamGenerator::iterator begin4_; const typename ParamGenerator::iterator end4_; typename ParamGenerator::iterator current4_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator4::Iterator // No implementation - assignment is unsupported. @@ -3630,7 +4038,7 @@ class CartesianProductGenerator5 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -3674,8 +4082,8 @@ class CartesianProductGenerator5 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3709,7 +4117,7 @@ class CartesianProductGenerator5 const typename ParamGenerator::iterator begin5_; const typename ParamGenerator::iterator end5_; typename ParamGenerator::iterator current5_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator5::Iterator // No implementation - assignment is unsupported. @@ -3807,7 +4215,7 @@ class CartesianProductGenerator6 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast.
@@ -3855,8 +4263,8 @@ class CartesianProductGenerator6 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -3894,7 +4302,7 @@ class CartesianProductGenerator6 const typename ParamGenerator::iterator begin6_; const typename ParamGenerator::iterator end6_; typename ParamGenerator::iterator current6_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator6::Iterator // No implementation - assignment is unsupported. @@ -4001,7 +4409,7 @@ class CartesianProductGenerator7 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4053,8 +4461,8 @@ class CartesianProductGenerator7 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4096,7 +4504,7 @@ class CartesianProductGenerator7 const typename ParamGenerator::iterator begin7_; const typename ParamGenerator::iterator end7_; typename ParamGenerator::iterator current7_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator7::Iterator // No implementation - assignment is unsupported. @@ -4214,7 +4622,7 @@ class CartesianProductGenerator8 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4270,8 +4678,8 @@ class CartesianProductGenerator8 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, - *current4_, *current5_, *current6_, *current7_, *current8_); + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4317,7 +4725,7 @@ class CartesianProductGenerator8 const typename ParamGenerator::iterator begin8_; const typename ParamGenerator::iterator end8_; typename ParamGenerator::iterator current8_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator8::Iterator // No implementation - assignment is unsupported.
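Every CartesianProductGeneratorN iterator above, and the remaining generators in the hunks that follow, receives one coordinated change: current_value_ turns from a by-value ParamType member into a heap-allocated smart pointer that ComputeCurrentValue() resets and Current() exposes through get(). The payoff is that the tuple type no longer has to be default-constructible or copy-assignable just so the iterator can hold a slot for it. A condensed sketch of that pattern, using std::unique_ptr in place of gtest's internal linked_ptr (an assumption made only to keep the example self-contained):

#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

// Two-sequence cartesian-product iterator in the style of the patched
// CartesianProductGenerator2::Iterator. Assumes non-empty inputs.
template <typename T1, typename T2>
class ProductIterator {
 public:
  ProductIterator(std::vector<T1> a, std::vector<T2> b)
      : a_(std::move(a)), b_(std::move(b)) {
    ComputeCurrentValue();
  }

  // Like the patched Current(): hand out a pointer into heap storage, so
  // the pair type never needs the default constructor that the old
  // "ParamType current_value_;" member effectively required.
  const std::pair<T1, T2>* Current() const { return current_value_.get(); }

  void Advance() {
    if (++j_ == b_.size()) {
      j_ = 0;
      ++i_;
    }
    ComputeCurrentValue();
  }

  bool AtEnd() const { return i_ == a_.size(); }

 private:
  // Like the patched ComputeCurrentValue(): rebuild the value instead of
  // assigning over an existing one.
  void ComputeCurrentValue() {
    if (!AtEnd())
      current_value_.reset(new std::pair<T1, T2>(a_[i_], b_[j_]));
  }

  std::vector<T1> a_;
  std::vector<T2> b_;
  std::size_t i_ = 0;
  std::size_t j_ = 0;
  std::unique_ptr<std::pair<T1, T2> > current_value_;
};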
@@ -4443,7 +4851,7 @@ class CartesianProductGenerator9 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4503,9 +4911,9 @@ class CartesianProductGenerator9 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_); + *current9_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4555,7 +4963,7 @@ class CartesianProductGenerator9 const typename ParamGenerator::iterator begin9_; const typename ParamGenerator::iterator end9_; typename ParamGenerator::iterator current9_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator9::Iterator // No implementation - assignment is unsupported. @@ -4690,7 +5098,7 @@ class CartesianProductGenerator10 virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -4754,9 +5162,9 @@ class CartesianProductGenerator10 void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType(*current1_, *current2_, *current3_, + current_value_.reset(new ParamType(*current1_, *current2_, *current3_, *current4_, *current5_, *current6_, *current7_, *current8_, - *current9_, *current10_); + *current9_, *current10_)); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -4810,7 +5218,7 @@ class CartesianProductGenerator10 const typename ParamGenerator::iterator begin10_; const typename ParamGenerator::iterator end10_; typename ParamGenerator::iterator current10_; - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator10::Iterator // No implementation - assignment is unsupported. @@ -5141,6 +5549,4 @@ CartesianProductHolder10(const Generator1& g1, const Generator2& g2, } // namespace internal } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump index 5c7c47af0bb..30dffe43c3c 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util-generated.h.pump @@ -29,8 +29,7 @@ $var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// -// Author: vladl@google.com (Vlad Losev) + // Type and function utilities for implementing parameterized tests. // This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -42,17 +41,14 @@ $var maxtuple = 10 $$ Maximum number of Combine arguments we want to support. // by the maximum arity of the implementation of tuple which is // currently set at $maxtuple. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST. #include "gtest/internal/gtest-param-util.h" #include "gtest/internal/gtest-port.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Forward declarations of ValuesIn(), which is implemented in @@ -87,6 +83,8 @@ class ValueArray$i { return ValuesIn(array); } + ValueArray$i(const ValueArray$i& other) : $for j, [[v$(j)_(other.v$(j)_)]] {} + private: // No implementation - assignment is unsupported. void operator=(const ValueArray$i& other); @@ -165,7 +163,7 @@ $for k [[ virtual ParamIteratorInterface* Clone() const { return new Iterator(*this); } - virtual const ParamType* Current() const { return &current_value_; } + virtual const ParamType* Current() const { return current_value_.get(); } virtual bool Equals(const ParamIteratorInterface& other) const { // Having the same base generator guarantees that the other // iterator is of the same type and we can downcast. @@ -197,7 +195,7 @@ $for k [[ void ComputeCurrentValue() { if (!AtEnd()) - current_value_ = ParamType($for j, [[*current$(j)_]]); + current_value_.reset(new ParamType($for j, [[*current$(j)_]])); } bool AtEnd() const { // We must report iterator past the end of the range when either of the @@ -222,7 +220,7 @@ $for j [[ typename ParamGenerator::iterator current$(j)_; ]] - ParamType current_value_; + linked_ptr current_value_; }; // class CartesianProductGenerator$i::Iterator // No implementation - assignment is unsupported. @@ -281,6 +279,4 @@ $for j [[ } // namespace internal } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h index 82cab9b0201..d64f620c4c6 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h @@ -26,11 +26,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: vladl@google.com (Vlad Losev) + // Type and function utilities for implementing parameterized tests. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ @@ -41,16 +42,11 @@ #include #include -// scripts/fuse_gtest.py depends on gtest's own header being #included -// *unconditionally*. Therefore these #includes cannot be moved -// inside #if GTEST_HAS_PARAM_TEST.
#include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-linked_ptr.h" #include "gtest/internal/gtest-port.h" #include "gtest/gtest-printers.h" -#if GTEST_HAS_PARAM_TEST - namespace testing { // Input to a parameterized test name generator, describing a test parameter. @@ -472,7 +468,7 @@ class ParameterizedTestCaseInfoBase { virtual ~ParameterizedTestCaseInfoBase() {} // Base part of test case name for display purposes. - virtual const string& GetTestCaseName() const = 0; + virtual const std::string& GetTestCaseName() const = 0; // Test case id to verify identity. virtual TypeId GetTestCaseTypeId() const = 0; // UnitTest class invokes this method to register tests in this @@ -511,7 +507,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { : test_case_name_(name), code_location_(code_location) {} // Test case base name for display purposes. - virtual const string& GetTestCaseName() const { return test_case_name_; } + virtual const std::string& GetTestCaseName() const { return test_case_name_; } // Test case id to verify identity. virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } // TEST_P macro uses AddTestPattern() to record information @@ -529,11 +525,10 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { } // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information // about a generator. - int AddTestCaseInstantiation(const string& instantiation_name, + int AddTestCaseInstantiation(const std::string& instantiation_name, GeneratorCreationFunc* func, ParamNameGeneratorFunc* name_func, - const char* file, - int line) { + const char* file, int line) { instantiations_.push_back( InstantiationInfo(instantiation_name, func, name_func, file, line)); return 0; // Return value used only to run this method in namespace scope. 
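The gtest-param-util.h hunks above (continuing in the hunk below) replace the unqualified string in these signatures with an explicit std::string. In this header the unqualified name resolved to gtest's internal string typedef, which maps to the global ::string type when GTEST_HAS_GLOBAL_STRING is set; spelling out std::string pins these APIs to the standard type. A small illustration of the lookup ambiguity being removed, with a hypothetical global string class standing in for a platform-provided one:

#include <string>

class string {};  // hypothetical stand-in for a platform's global ::string

namespace testing_like {
// An unqualified "string" here would find ::string through enclosing-scope
// lookup; the explicit qualification used in the hunks is unambiguous.
std::string GetTestCaseName() { return std::string("Example"); }
}  // namespace testing_like

int main() { return testing_like::GetTestCaseName().empty() ? 1 : 0; }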
@@ -550,13 +545,13 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { for (typename InstantiationContainer::iterator gen_it = instantiations_.begin(); gen_it != instantiations_.end(); ++gen_it) { - const string& instantiation_name = gen_it->name; + const std::string& instantiation_name = gen_it->name; ParamGenerator generator((*gen_it->generator)()); ParamNameGeneratorFunc* name_func = gen_it->name_func; const char* file = gen_it->file; int line = gen_it->line; - string test_case_name; + std::string test_case_name; if ( !instantiation_name.empty() ) test_case_name = instantiation_name + "/"; test_case_name += test_info->test_case_base_name; @@ -609,8 +604,8 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { test_base_name(a_test_base_name), test_meta_factory(a_test_meta_factory) {} - const string test_case_base_name; - const string test_base_name; + const std::string test_case_base_name; + const std::string test_base_name; const scoped_ptr > test_meta_factory; }; typedef ::std::vector > TestInfoContainer; @@ -651,7 +646,7 @@ class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { return true; } - const string test_case_name_; + const std::string test_case_name_; CodeLocation code_location_; TestInfoContainer tests_; InstantiationContainer instantiations_; @@ -726,6 +721,4 @@ class ParameterizedTestCaseRegistry { } // namespace internal } // namespace testing -#endif // GTEST_HAS_PARAM_TEST - #endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h index 74ab949057c..f83700e06d9 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h @@ -27,7 +27,7 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file defines the GTEST_OS_* macro. // It is separate from gtest-port.h so that custom/gtest-port.h can include it. @@ -54,6 +54,9 @@ # define GTEST_OS_WINDOWS_PHONE 1 # elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) # define GTEST_OS_WINDOWS_RT 1 +# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE) +# define GTEST_OS_WINDOWS_PHONE 1 +# define GTEST_OS_WINDOWS_TV_TITLE 1 # else // WINAPI_FAMILY defined but no known partition matched. // Default to desktop. 
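The RegisterTests hunk above also shows how parameterized test case names are assembled: the instantiation name, when present, is prefixed to the test case base name with a slash. A self-contained sketch of just that naming step, with hypothetical helper and variable names:

#include <iostream>
#include <string>

// Mirrors the test_case_name construction in the RegisterTests hunk.
std::string MakeTestCaseName(const std::string& instantiation_name,
                             const std::string& test_case_base_name) {
  std::string test_case_name;
  if (!instantiation_name.empty())
    test_case_name = instantiation_name + "/";
  test_case_name += test_case_base_name;
  return test_case_name;
}

int main() {
  std::cout << MakeTestCaseName("MyInstance", "FooTest") << "\n";  // MyInstance/FooTest
  std::cout << MakeTestCaseName("", "FooTest") << "\n";            // FooTest
}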
@@ -69,6 +72,8 @@ # endif #elif defined __FreeBSD__ # define GTEST_OS_FREEBSD 1 +#elif defined __Fuchsia__ +# define GTEST_OS_FUCHSIA 1 #elif defined __linux__ # define GTEST_OS_LINUX 1 # if defined __ANDROID__ @@ -84,6 +89,8 @@ # define GTEST_OS_HPUX 1 #elif defined __native_client__ # define GTEST_OS_NACL 1 +#elif defined __NetBSD__ +# define GTEST_OS_NETBSD 1 #elif defined __OpenBSD__ # define GTEST_OS_OPENBSD 1 #elif defined __QNX__ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h index da57e65d338..786497d854c 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -27,8 +27,6 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: wan@google.com (Zhanyong Wan) -// // Low-level types and utilities for porting Google Test to various // platforms. All macros ending with _ and symbols defined in an // internal namespace are subject to change without notice. Code @@ -40,6 +38,8 @@ // files are expected to #include this. Therefore, it cannot #include // any other Google Test header. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ @@ -73,11 +73,9 @@ // GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions // are enabled. // GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::string, which is different to std::string). -// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string -// is/isn't available (some systems define -// ::wstring, which is different to std::wstring). +// is/isn't available +// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::wstring +// is/isn't available // GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular // expressions are/aren't available. // GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that @@ -109,6 +107,12 @@ // GTEST_CREATE_SHARED_LIBRARY // - Define to 1 when compiling Google Test itself // as a shared library. +// GTEST_DEFAULT_DEATH_TEST_STYLE +// - The default value of --gtest_death_test_style. +// The legacy default has been "fast" in the open +// source version since 2008. The recommended value +// is "threadsafe", and can be set in +// custom/gtest-port.h. // Platform-indicating macros // -------------------------- @@ -122,12 +126,14 @@ // GTEST_OS_AIX - IBM AIX // GTEST_OS_CYGWIN - Cygwin // GTEST_OS_FREEBSD - FreeBSD +// GTEST_OS_FUCHSIA - Fuchsia // GTEST_OS_HPUX - HP-UX // GTEST_OS_LINUX - Linux // GTEST_OS_LINUX_ANDROID - Google Android // GTEST_OS_MAC - Mac OS X // GTEST_OS_IOS - iOS // GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_NETBSD - NetBSD // GTEST_OS_OPENBSD - OpenBSD // GTEST_OS_QNX - QNX // GTEST_OS_SOLARIS - Sun Solaris @@ -169,15 +175,15 @@ // GTEST_HAS_COMBINE - the Combine() function (for value-parameterized // tests) // GTEST_HAS_DEATH_TEST - death tests -// GTEST_HAS_PARAM_TEST - value-parameterized tests // GTEST_HAS_TYPED_TEST - typed tests // GTEST_HAS_TYPED_TEST_P - type-parameterized tests // GTEST_IS_THREADSAFE - Google Test is thread-safe. +// GOOGLETEST_CM0007 DO NOT DELETE // GTEST_USES_POSIX_RE - enhanced POSIX regex is used. 
Do not confuse with // GTEST_HAS_POSIX_RE (see above) which users can // define themselves. // GTEST_USES_SIMPLE_RE - our own simple regex is used; -// the above two are mutually exclusive. +// the above RE\b(s) are mutually exclusive. // GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). // Misc public macros @@ -206,6 +212,7 @@ // // C++11 feature wrappers: // +// testing::internal::forward - portability wrapper for std::forward. // testing::internal::move - portability wrapper for std::move. // // Synchronization: @@ -222,10 +229,10 @@ // // Regular expressions: // RE - a simple regular expression class using the POSIX -// Extended Regular Expression syntax on UNIX-like -// platforms, or a reduced regular exception syntax on -// other platforms, including Windows. -// +// Extended Regular Expression syntax on UNIX-like platforms +// GOOGLETEST_CM0008 DO NOT DELETE +// or a reduced regular exception syntax on other +// platforms, including Windows. // Logging: // GTEST_LOG_() - logs messages at the specified severity level. // LogToStderr() - directs all log messages to stderr. @@ -271,10 +278,12 @@ # include #endif +// Brings in the definition of HAS_GLOBAL_STRING. This must be done +// BEFORE we test HAS_GLOBAL_STRING. +#include // NOLINT #include // NOLINT #include // NOLINT #include // NOLINT -#include // NOLINT #include #include // NOLINT @@ -306,7 +315,7 @@ // GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 4385) // /* code that triggers warnings C4800 and C4385 */ // GTEST_DISABLE_MSC_WARNINGS_POP_() -#if _MSC_VER >= 1500 +#if _MSC_VER >= 1400 # define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \ __pragma(warning(push)) \ __pragma(warning(disable: warnings)) @@ -318,12 +327,28 @@ # define GTEST_DISABLE_MSC_WARNINGS_POP_() #endif +// Clang on Windows does not understand MSVC's pragma warning. +// We need clang-specific way to disable function deprecation warning. +#ifdef __clang__ +# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"") +#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ + _Pragma("clang diagnostic pop") +#else +# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \ + GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \ + GTEST_DISABLE_MSC_WARNINGS_POP_() +#endif + #ifndef GTEST_LANG_CXX11 // gcc and clang define __GXX_EXPERIMENTAL_CXX0X__ when // -std={c,gnu}++{0x,11} is passed. The C++11 standard specifies a // value for __cplusplus, and recent versions of clang, gcc, and // probably other compilers set that too in C++11 mode. -# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L +# if __GXX_EXPERIMENTAL_CXX0X__ || __cplusplus >= 201103L || _MSC_VER >= 1900 // Compiling in at least C++11 mode. # define GTEST_LANG_CXX11 1 # else @@ -355,12 +380,16 @@ #if GTEST_STDLIB_CXX11 # define GTEST_HAS_STD_BEGIN_AND_END_ 1 # define GTEST_HAS_STD_FORWARD_LIST_ 1 -# define GTEST_HAS_STD_FUNCTION_ 1 +# if !defined(_MSC_VER) || (_MSC_FULL_VER >= 190023824) +// works only with VS2015U2 and better +# define GTEST_HAS_STD_FUNCTION_ 1 +# endif # define GTEST_HAS_STD_INITIALIZER_LIST_ 1 # define GTEST_HAS_STD_MOVE_ 1 -# define GTEST_HAS_STD_SHARED_PTR_ 1 -# define GTEST_HAS_STD_TYPE_TRAITS_ 1 # define GTEST_HAS_STD_UNIQUE_PTR_ 1 +# define GTEST_HAS_STD_SHARED_PTR_ 1 +# define GTEST_HAS_UNORDERED_MAP_ 1 +# define GTEST_HAS_UNORDERED_SET_ 1 #endif // C++11 specifies that provides std::tuple. 
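One detail in the GTEST_LANG_CXX11 hunk above is worth calling out: MSVC keeps __cplusplus at 199711L unless the later-introduced /Zc:__cplusplus switch is passed, so the probe additionally accepts _MSC_VER >= 1900 (Visual Studio 2015) as evidence of C++11 mode. A standalone version of the same probe; the SKETCH_ macro name is illustrative:

#include <cstdio>

// C++11 detection in the style of the patched GTEST_LANG_CXX11 check.
#if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
    (defined(_MSC_VER) && _MSC_VER >= 1900)
# define SKETCH_LANG_CXX11 1
#else
# define SKETCH_LANG_CXX11 0
#endif

int main() { std::printf("C++11 mode: %d\n", SKETCH_LANG_CXX11); }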
@@ -368,7 +397,8 @@ #if GTEST_LANG_CXX11 # define GTEST_HAS_STD_TUPLE_ 1 # if defined(__clang__) -// Inspired by http://clang.llvm.org/docs/LanguageExtensions.html#__has_include +// Inspired by +// https://clang.llvm.org/docs/LanguageExtensions.html#include-file-checking-macros # if defined(__has_include) && !__has_include() # undef GTEST_HAS_STD_TUPLE_ # endif @@ -380,7 +410,7 @@ # elif defined(__GLIBCXX__) // Inspired by boost/config/stdlib/libstdcpp3.hpp, // http://gcc.gnu.org/gcc-4.2/changes.html and -// http://gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x +// https://web.archive.org/web/20140227044429/gcc.gnu.org/onlinedocs/libstdc++/manual/bk01pt01ch01.html#manual.intro.status.standard.200x # if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2) # undef GTEST_HAS_STD_TUPLE_ # endif @@ -396,10 +426,16 @@ # include # endif // In order to avoid having to include , use forward declaration -// assuming CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. +#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR) +// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two +// separate (equivalent) structs, instead of using typedef +typedef struct _CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#else +// Assume CRITICAL_SECTION is a typedef of _RTL_CRITICAL_SECTION. // This assumption is verified by // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION. -struct _RTL_CRITICAL_SECTION; +typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; +#endif #else // This assumes that non-Windows OSes provide unistd.h. For OSes where this // is not the case, we need to include headers that provide the functions @@ -453,8 +489,11 @@ struct _RTL_CRITICAL_SECTION; #ifndef GTEST_HAS_EXCEPTIONS // The user didn't tell us whether exceptions are enabled, so we need // to figure it out. -# if defined(_MSC_VER) || defined(__BORLANDC__) -// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS +# if defined(_MSC_VER) && defined(_CPPUNWIND) +// MSVC defines _CPPUNWIND to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__BORLANDC__) +// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS // macro to enable exceptions, so we'll do the same. // Assumes that exceptions are enabled by default. # ifndef _HAS_EXCEPTIONS @@ -498,21 +537,17 @@ struct _RTL_CRITICAL_SECTION; # define GTEST_HAS_STD_STRING 1 #elif !GTEST_HAS_STD_STRING // The user told us that ::std::string isn't available. -# error "Google Test cannot be used where ::std::string isn't available." +# error "::std::string isn't available." #endif // !defined(GTEST_HAS_STD_STRING) #ifndef GTEST_HAS_GLOBAL_STRING -// The user didn't tell us whether ::string is available, so we need -// to figure it out. - # define GTEST_HAS_GLOBAL_STRING 0 - #endif // GTEST_HAS_GLOBAL_STRING #ifndef GTEST_HAS_STD_WSTRING // The user didn't tell us whether ::std::wstring is available, so we need // to figure it out. -// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring +// FIXME: uses autoconf to detect whether ::std::wstring // is available. // Cygwin 1.7 and below doesn't support ::std::wstring. @@ -600,8 +635,9 @@ struct _RTL_CRITICAL_SECTION; // // To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 // to your compiler flags. 
-# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX \ - || GTEST_OS_QNX || GTEST_OS_FREEBSD || GTEST_OS_NACL) +#define GTEST_HAS_PTHREAD \ + (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \ + GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) #endif // GTEST_HAS_PTHREAD #if GTEST_HAS_PTHREAD @@ -616,7 +652,7 @@ struct _RTL_CRITICAL_SECTION; // Determines if hash_map/hash_set are available. // Only used for testing against those containers. #if !defined(GTEST_HAS_HASH_MAP_) -# if _MSC_VER +# if defined(_MSC_VER) && (_MSC_VER < 1900) # define GTEST_HAS_HASH_MAP_ 1 // Indicates that hash_map is available. # define GTEST_HAS_HASH_SET_ 1 // Indicates that hash_set is available. # endif // _MSC_VER @@ -629,6 +665,14 @@ struct _RTL_CRITICAL_SECTION; # if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) // STLport, provided with the Android NDK, has neither or . # define GTEST_HAS_TR1_TUPLE 0 +# elif defined(_MSC_VER) && (_MSC_VER >= 1910) +// Prevent `warning C4996: 'std::tr1': warning STL4002: +// The non-Standard std::tr1 namespace and TR1-only machinery +// are deprecated and will be REMOVED.` +# define GTEST_HAS_TR1_TUPLE 0 +# elif GTEST_LANG_CXX11 && defined(_LIBCPP_VERSION) +// libc++ doesn't support TR1. +# define GTEST_HAS_TR1_TUPLE 0 # else // The user didn't tell us not to do it, so we assume it's OK. # define GTEST_HAS_TR1_TUPLE 1 @@ -638,6 +682,10 @@ struct _RTL_CRITICAL_SECTION; // Determines whether Google Test's own tr1 tuple implementation // should be used. #ifndef GTEST_USE_OWN_TR1_TUPLE +// We use our own tuple implementation on Symbian. +# if GTEST_OS_SYMBIAN +# define GTEST_USE_OWN_TR1_TUPLE 1 +# else // The user didn't tell us, so we need to figure it out. // We use our own TR1 tuple if we aren't sure the user has an @@ -651,7 +699,8 @@ struct _RTL_CRITICAL_SECTION; // support TR1 tuple. libc++ only provides std::tuple, in C++11 mode, // and it can be used with some compilers that define __GNUC__. # if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000) \ - && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) || _MSC_VER >= 1600 + && !GTEST_OS_QNX && !defined(_LIBCPP_VERSION)) \ + || (_MSC_VER >= 1600 && _MSC_VER < 1900) # define GTEST_ENV_HAS_TR1_TUPLE_ 1 # endif @@ -667,12 +716,11 @@ struct _RTL_CRITICAL_SECTION; # else # define GTEST_USE_OWN_TR1_TUPLE 1 # endif - +# endif // GTEST_OS_SYMBIAN #endif // GTEST_USE_OWN_TR1_TUPLE -// To avoid conditional compilation everywhere, we make it -// gtest-port.h's responsibility to #include the header implementing -// tuple. +// To avoid conditional compilation we make it gtest-port.h's responsibility +// to #include the header implementing tuple. #if GTEST_HAS_STD_TUPLE_ # include // IWYU pragma: export # define GTEST_TUPLE_NAMESPACE_ ::std @@ -687,22 +735,6 @@ struct _RTL_CRITICAL_SECTION; # if GTEST_USE_OWN_TR1_TUPLE # include "gtest/internal/gtest-tuple.h" // IWYU pragma: export // NOLINT -# elif GTEST_ENV_HAS_STD_TUPLE_ -# include -// C++11 puts its tuple into the ::std namespace rather than -// ::std::tr1. gtest expects tuple to live in ::std::tr1, so put it there. -// This causes undefined behavior, but supported compilers react in -// the way we intend. 
-namespace std { -namespace tr1 { -using ::std::get; -using ::std::make_tuple; -using ::std::tuple; -using ::std::tuple_element; -using ::std::tuple_size; -} -} - # elif GTEST_OS_SYMBIAN // On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to @@ -727,20 +759,22 @@ using ::std::tuple_size; // Until version 4.3.2, gcc has a bug that causes , // which is #included by , to not compile when RTTI is // disabled. _TR1_FUNCTIONAL is the header guard for -// . Hence the following #define is a hack to prevent +// . Hence the following #define is used to prevent // from being included. # define _TR1_FUNCTIONAL 1 # include # undef _TR1_FUNCTIONAL // Allows the user to #include - // if he chooses to. + // if they choose to. # else # include // NOLINT # endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 -# else -// If the compiler is not GCC 4.0+, we assume the user is using a -// spec-conforming TR1 implementation. +// VS 2010 now has tr1 support. +# elif _MSC_VER >= 1600 # include // IWYU pragma: export // NOLINT + +# else // GTEST_USE_OWN_TR1_TUPLE +# include // IWYU pragma: export // NOLINT # endif // GTEST_USE_OWN_TR1_TUPLE #endif // GTEST_HAS_TR1_TUPLE @@ -754,8 +788,12 @@ using ::std::tuple_size; # if GTEST_OS_LINUX && !defined(__ia64__) # if GTEST_OS_LINUX_ANDROID -// On Android, clone() is only available on ARM starting with Gingerbread. -# if defined(__arm__) && __ANDROID_API__ >= 9 +// On Android, clone() became available at different API levels for each 32-bit +// architecture. +# if defined(__LP64__) || \ + (defined(__arm__) && __ANDROID_API__ >= 9) || \ + (defined(__mips__) && __ANDROID_API__ >= 12) || \ + (defined(__i386__) && __ANDROID_API__ >= 17) # define GTEST_HAS_CLONE 1 # else # define GTEST_HAS_CLONE 0 @@ -786,19 +824,15 @@ using ::std::tuple_size; // Google Test does not support death tests for VC 7.1 and earlier as // abort() in a VC 7.1 application compiled as GUI in debug config // pops up a dialog window that cannot be suppressed programmatically. -#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ - (GTEST_OS_MAC && !GTEST_OS_IOS) || \ - (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ +#if (GTEST_OS_LINUX || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_MAC && !GTEST_OS_IOS) || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX || \ - GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD) + GTEST_OS_OPENBSD || GTEST_OS_QNX || GTEST_OS_FREEBSD || \ + GTEST_OS_NETBSD || GTEST_OS_FUCHSIA) # define GTEST_HAS_DEATH_TEST 1 #endif -// We don't support MSVC 7.1 with exceptions disabled now. Therefore -// all the compilers we care about are adequate for supporting -// value-parameterized tests. -#define GTEST_HAS_PARAM_TEST 1 - // Determines whether to support type-driven tests. // Typed tests need and variadic macros, which GCC, VC++ 8.0, @@ -813,7 +847,7 @@ using ::std::tuple_size; // value-parameterized tests are enabled. The implementation doesn't // work on Sun Studio since it doesn't understand templated conversion // operators. 
-#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) +#if (GTEST_HAS_TR1_TUPLE || GTEST_HAS_STD_TUPLE_) && !defined(__SUNPRO_CC) # define GTEST_HAS_COMBINE 1 #endif @@ -864,15 +898,39 @@ using ::std::tuple_size; # define GTEST_ATTRIBUTE_UNUSED_ #endif +#if GTEST_LANG_CXX11 +# define GTEST_CXX11_EQUALS_DELETE_ = delete +#else // GTEST_LANG_CXX11 +# define GTEST_CXX11_EQUALS_DELETE_ +#endif // GTEST_LANG_CXX11 + +// Use this annotation before a function that takes a printf format string. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC) +# if defined(__MINGW_PRINTF_FORMAT) +// MinGW has two different printf implementations. Ensure the format macro +// matches the selected implementation. See +// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/. +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \ + first_to_check))) +# else +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \ + __attribute__((__format__(__printf__, string_index, first_to_check))) +# endif +#else +# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) +#endif + + // A macro to disallow operator= // This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_ASSIGN_(type)\ - void operator=(type const &) +#define GTEST_DISALLOW_ASSIGN_(type) \ + void operator=(type const &) GTEST_CXX11_EQUALS_DELETE_ // A macro to disallow copy constructor and operator= // This should be used in the private: declarations for a class. -#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ - type(type const &);\ +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \ + type(type const &) GTEST_CXX11_EQUALS_DELETE_; \ GTEST_DISALLOW_ASSIGN_(type) // Tell the compiler to warn about unused return values for functions declared @@ -920,6 +978,11 @@ using ::std::tuple_size; #endif // GTEST_HAS_SEH +// GTEST_API_ qualifies all symbols that must be exported. The definitions below +// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in +// gtest/internal/custom/gtest-port.h +#ifndef GTEST_API_ + #ifdef _MSC_VER # if GTEST_LINKED_AS_SHARED_LIBRARY # define GTEST_API_ __declspec(dllimport) @@ -928,11 +991,17 @@ using ::std::tuple_size; # endif #elif __GNUC__ >= 4 || defined(__clang__) # define GTEST_API_ __attribute__((visibility ("default"))) -#endif // _MSC_VER +#endif // _MSC_VER + +#endif // GTEST_API_ #ifndef GTEST_API_ # define GTEST_API_ -#endif +#endif // GTEST_API_ + +#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE +# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast" +#endif // GTEST_DEFAULT_DEATH_TEST_STYLE #ifdef __GNUC__ // Ask the compiler to never inline a given function. @@ -942,10 +1011,12 @@ using ::std::tuple_size; #endif // _LIBCPP_VERSION is defined by the libc++ library from the LLVM project. -#if defined(__GLIBCXX__) || defined(_LIBCPP_VERSION) -# define GTEST_HAS_CXXABI_H_ 1 -#else -# define GTEST_HAS_CXXABI_H_ 0 +#if !defined(GTEST_HAS_CXXABI_H_) +# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) +# define GTEST_HAS_CXXABI_H_ 1 +# else +# define GTEST_HAS_CXXABI_H_ 0 +# endif #endif // A function level attribute to disable checking for use of uninitialized @@ -985,19 +1056,6 @@ using ::std::tuple_size; # define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ -// A function level attribute to disable UndefinedBehaviorSanitizer's (defined) -// unsigned integer overflow instrumentation. 
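// (Editorial sketch, not patch content.) With the rewritten macros above,
// a hypothetical class declared as
//
//   class Registry {
//    private:
//     GTEST_DISALLOW_COPY_AND_ASSIGN_(Registry);
//   };
//
// now expands under GTEST_LANG_CXX11 to
//
//   Registry(Registry const &) = delete;
//   void operator=(Registry const &) = delete;
//
// while pre-C++11 builds keep the old private, never-defined declarations,
// since GTEST_CXX11_EQUALS_DELETE_ then expands to nothing.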
-#if defined(__clang__) -# if defined(__has_attribute) && __has_attribute(no_sanitize) -# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \ - __attribute__((no_sanitize("unsigned-integer-overflow"))) -# else -# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ -# endif // defined(__has_attribute) && __has_attribute(no_sanitize) -#else -# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ -#endif // __clang__ - namespace testing { class Message; @@ -1101,6 +1159,16 @@ struct StaticAssertTypeEqHelper { enum { value = true }; }; +// Same as std::is_same<>. +template +struct IsSame { + enum { value = false }; +}; +template +struct IsSame { + enum { value = true }; +}; + // Evaluates to the number of elements in 'array'. #define GTEST_ARRAY_SIZE_(array) (sizeof(array) / sizeof(array[0])) @@ -1164,6 +1232,10 @@ class scoped_ptr { // Defines RE. +#if GTEST_USES_PCRE +// if used, PCRE is injected by custom/gtest-port.h +#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE + // A simple C++ wrapper for . It uses the POSIX Extended // Regular Expression syntax. class GTEST_API_ RE { @@ -1175,11 +1247,11 @@ class GTEST_API_ RE { // Constructs an RE from a string. RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT -#if GTEST_HAS_GLOBAL_STRING +# if GTEST_HAS_GLOBAL_STRING RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT -#endif // GTEST_HAS_GLOBAL_STRING +# endif // GTEST_HAS_GLOBAL_STRING RE(const char* regex) { Init(regex); } // NOLINT ~RE(); @@ -1192,7 +1264,7 @@ class GTEST_API_ RE { // PartialMatch(str, re) returns true iff regular expression re // matches a substring of str (including str itself). // - // TODO(wan@google.com): make FullMatch() and PartialMatch() work + // FIXME: make FullMatch() and PartialMatch() work // when str contains NUL characters. static bool FullMatch(const ::std::string& str, const RE& re) { return FullMatch(str.c_str(), re); @@ -1201,7 +1273,7 @@ class GTEST_API_ RE { return PartialMatch(str.c_str(), re); } -#if GTEST_HAS_GLOBAL_STRING +# if GTEST_HAS_GLOBAL_STRING static bool FullMatch(const ::string& str, const RE& re) { return FullMatch(str.c_str(), re); @@ -1210,7 +1282,7 @@ class GTEST_API_ RE { return PartialMatch(str.c_str(), re); } -#endif // GTEST_HAS_GLOBAL_STRING +# endif // GTEST_HAS_GLOBAL_STRING static bool FullMatch(const char* str, const RE& re); static bool PartialMatch(const char* str, const RE& re); @@ -1219,25 +1291,27 @@ class GTEST_API_ RE { void Init(const char* regex); // We use a const char* instead of an std::string, as Google Test used to be - // used where std::string is not available. TODO(wan@google.com): change to + // used where std::string is not available. FIXME: change to // std::string. const char* pattern_; bool is_valid_; -#if GTEST_USES_POSIX_RE +# if GTEST_USES_POSIX_RE regex_t full_regex_; // For FullMatch(). regex_t partial_regex_; // For PartialMatch(). -#else // GTEST_USES_SIMPLE_RE +# else // GTEST_USES_SIMPLE_RE const char* full_pattern_; // For FullMatch(); -#endif +# endif GTEST_DISALLOW_ASSIGN_(RE); }; +#endif // GTEST_USES_PCRE + // Formats a source file path and a line number as they would appear // in an error message from the compiler used to compile this code. GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); @@ -1323,13 +1397,59 @@ inline void FlushInfoLog() { fflush(NULL); } GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ << gtest_error +// Adds reference to a type if it is not a reference type, +// otherwise leaves it unchanged. 
This is the same as +// tr1::add_reference, which is not widely available yet. +template +struct AddReference { typedef T& type; }; // NOLINT +template +struct AddReference { typedef T& type; }; // NOLINT + +// A handy wrapper around AddReference that works when the argument T +// depends on template parameters. +#define GTEST_ADD_REFERENCE_(T) \ + typename ::testing::internal::AddReference::type + +// Transforms "T" into "const T&" according to standard reference collapsing +// rules (this is only needed as a backport for C++98 compilers that do not +// support reference collapsing). Specifically, it transforms: +// +// char ==> const char& +// const char ==> const char& +// char& ==> char& +// const char& ==> const char& +// +// Note that the non-const reference will not have "const" added. This is +// standard, and necessary so that "T" can always bind to "const T&". +template +struct ConstRef { typedef const T& type; }; +template +struct ConstRef { typedef T& type; }; + +// The argument T must depend on some template parameters. +#define GTEST_REFERENCE_TO_CONST_(T) \ + typename ::testing::internal::ConstRef::type + #if GTEST_HAS_STD_MOVE_ +using std::forward; using std::move; + +template +struct RvalueRef { + typedef T&& type; +}; #else // GTEST_HAS_STD_MOVE_ template const T& move(const T& t) { return t; } +template +GTEST_ADD_REFERENCE_(T) forward(GTEST_ADD_REFERENCE_(T) t) { return t; } + +template +struct RvalueRef { + typedef const T& type; +}; #endif // GTEST_HAS_STD_MOVE_ // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. @@ -1430,10 +1550,6 @@ GTEST_API_ void CaptureStderr(); GTEST_API_ std::string GetCapturedStderr(); #endif // GTEST_HAS_STREAM_REDIRECTION - -// Returns a path to temporary directory. -GTEST_API_ std::string TempDir(); - // Returns the size (in bytes) of a file. GTEST_API_ size_t GetFileSize(FILE* file); @@ -1441,14 +1557,18 @@ GTEST_API_ size_t GetFileSize(FILE* file); GTEST_API_ std::string ReadEntireFile(FILE* file); // All command line arguments. -GTEST_API_ const ::std::vector& GetArgvs(); +GTEST_API_ std::vector GetArgvs(); #if GTEST_HAS_DEATH_TEST -const ::std::vector& GetInjectableArgvs(); -void SetInjectableArgvs(const ::std::vector* - new_argvs); - +std::vector GetInjectableArgvs(); +// Deprecated: pass the args vector by value instead. +void SetInjectableArgvs(const std::vector* new_argvs); +void SetInjectableArgvs(const std::vector& new_argvs); +#if GTEST_HAS_GLOBAL_STRING +void SetInjectableArgvs(const std::vector< ::string>& new_argvs); +#endif // GTEST_HAS_GLOBAL_STRING +void ClearInjectableArgvs(); #endif // GTEST_HAS_DEATH_TEST @@ -1698,7 +1818,7 @@ class GTEST_API_ Mutex { // Initializes owner_thread_id_ and critical_section_ in static mutexes. void ThreadSafeLazyInit(); - // Per http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx, + // Per https://blogs.msdn.microsoft.com/oldnewthing/20040223-00/?p=40503, // we assume that 0 is an invalid value for thread IDs. unsigned int owner_thread_id_; @@ -1706,7 +1826,7 @@ class GTEST_API_ Mutex { // by the linker. MutexType type_; long critical_section_init_phase_; // NOLINT - _RTL_CRITICAL_SECTION* critical_section_; + GTEST_CRITICAL_SECTION* critical_section_; GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); }; @@ -1982,8 +2102,13 @@ class MutexBase { extern ::testing::internal::MutexBase mutex // Defines and statically (i.e. at link time) initializes a static mutex. 
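// (Editorial sketch, not patch content.) The collapsing table documented
// above can be spot-checked at compile time with the IsSame trait declared
// earlier in this header:
//
//   typedef ::testing::internal::ConstRef<char>::type T1;   // const char&
//   typedef ::testing::internal::ConstRef<char&>::type T2;  // char&
//   // IsSame<T1, const char&>::value and IsSame<T2, char&>::value
//   // are both 1.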
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ - ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, false, pthread_t() } +// The initialization list here does not explicitly initialize each field, +// instead relying on default initialization for the unspecified fields. In +// particular, the owner_ field (a pthread_t) is not explicitly initialized. +// This allows initialization to work whether pthread_t is a scalar or struct. +// The flag -Wmissing-field-initializers must not be specified for this to work. +#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = {PTHREAD_MUTEX_INITIALIZER, false, 0} // The Mutex class can only be used for mutexes created at runtime. It // shares its API with MutexBase otherwise. @@ -2040,7 +2165,7 @@ extern "C" inline void DeleteThreadLocalValue(void* value_holder) { // Implements thread-local storage on pthreads-based systems. template -class ThreadLocal { +class GTEST_API_ ThreadLocal { public: ThreadLocal() : key_(CreateKey()), default_factory_(new DefaultValueHolderFactory()) {} @@ -2172,7 +2297,7 @@ class GTestMutexLock { typedef GTestMutexLock MutexLock; template -class ThreadLocal { +class GTEST_API_ ThreadLocal { public: ThreadLocal() : value_() {} explicit ThreadLocal(const T& value) : value_(value) {} @@ -2191,12 +2316,13 @@ class ThreadLocal { GTEST_API_ size_t GetThreadCount(); // Passing non-POD classes through ellipsis (...) crashes the ARM -// compiler and generates a warning in Sun Studio. The Nokia Symbian +// compiler and generates a warning in Sun Studio before 12u4. The Nokia Symbian // and the IBM XL C/C++ compiler try to instantiate a copy constructor // for objects passed through ellipsis (...), failing for uncopyable // objects. We define this to ensure that only POD is passed through // ellipsis on these systems. -#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || \ + (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x5130) // We lose support for NULL detection where the compiler doesn't like // passing non-POD classes through ellipsis (...). # define GTEST_ELLIPSIS_NEEDS_POD_ 1 @@ -2222,6 +2348,13 @@ template const bool bool_constant::value; typedef bool_constant false_type; typedef bool_constant true_type; +template +struct is_same : public false_type {}; + +template +struct is_same : public true_type {}; + + template struct is_pointer : public false_type {}; @@ -2233,6 +2366,7 @@ struct IteratorTraits { typedef typename Iterator::value_type value_type; }; + template struct IteratorTraits { typedef T value_type; @@ -2364,7 +2498,7 @@ inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } // Functions deprecated by MSVC 8.0. -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996 /* deprecated function */) +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() inline const char* StrNCpy(char* dest, const char* src, size_t n) { return strncpy(dest, src, n); @@ -2398,7 +2532,7 @@ inline int Close(int fd) { return close(fd); } inline const char* StrError(int errnum) { return strerror(errnum); } #endif inline const char* GetEnv(const char* name) { -#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE | GTEST_OS_WINDOWS_RT +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT // We are on Windows CE, which has no environment variables. static_cast(name); // To prevent 'unused argument' warning. 
return NULL; @@ -2412,7 +2546,7 @@ inline const char* GetEnv(const char* name) { #endif } -GTEST_DISABLE_MSC_WARNINGS_POP_() +GTEST_DISABLE_MSC_DEPRECATED_POP_() #if GTEST_OS_WINDOWS_MOBILE // Windows CE has no C library. The abort() function is used in @@ -2528,15 +2662,15 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. # define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) # define GTEST_DECLARE_int32_(name) \ GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) -#define GTEST_DECLARE_string_(name) \ +# define GTEST_DECLARE_string_(name) \ GTEST_API_ extern ::std::string GTEST_FLAG(name) // Macros for defining flags. -#define GTEST_DEFINE_bool_(name, default_val, doc) \ +# define GTEST_DEFINE_bool_(name, default_val, doc) \ GTEST_API_ bool GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_int32_(name, default_val, doc) \ +# define GTEST_DEFINE_int32_(name, default_val, doc) \ GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) -#define GTEST_DEFINE_string_(name, default_val, doc) \ +# define GTEST_DEFINE_string_(name, default_val, doc) \ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val) #endif // !defined(GTEST_DECLARE_bool_) @@ -2550,7 +2684,7 @@ typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. // Parses 'str' for a 32-bit signed integer. If successful, writes the result // to *value and returns true; otherwise leaves *value unchanged and returns // false. -// TODO(chandlerc): Find a better way to refactor flag and environment parsing +// FIXME: Find a better way to refactor flag and environment parsing // out of both gtest-port.cc and gtest.cc to avoid exporting this utility // function. bool ParseInt32(const Message& src_text, const char* str, Int32* value); @@ -2559,7 +2693,8 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value); // corresponding to the given Google Test flag. bool BoolFromGTestEnv(const char* flag, bool default_val); GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); -std::string StringFromGTestEnv(const char* flag, const char* default_val); +std::string OutputFlagAlsoCheckEnvVar(); +const char* StringFromGTestEnv(const char* flag, const char* default_val); } // namespace internal } // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h index 97f1a7fdd2c..4c9b6262c3c 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h @@ -27,17 +27,17 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) // // This header file declares the String class and functions used internally by // Google Test. They are subject to change without notice. They should not used // by code external to Google Test. // -// This header file is #included by . +// This header file is #included by gtest-internal.h. // It should not be #included by other files. 
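// (Editorial sketch, not patch content.) Expansion of the flag macros
// above, using the real "output" flag as the example:
//
//   GTEST_DECLARE_string_(output);
//     // => GTEST_API_ extern ::std::string GTEST_FLAG(output);
//   GTEST_DEFINE_string_(output, "", "doc string");
//     // => GTEST_API_ ::std::string GTEST_FLAG(output) = ("");
//
// GTEST_FLAG() pastes the flag name onto gtest's flag-variable prefix, so
// declaration and definition always name the same variable.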
+// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h index e9b405340a8..78a3a6a01fa 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h @@ -30,11 +30,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Implements a subset of TR1 tuple needed by Google Test and Google Mock. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ @@ -42,7 +43,7 @@ // The compiler used in Symbian has a bug that prevents us from declaring the // tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be +// bypasses the bug by declaring the members that should otherwise be // private as public. // Sun Studio versions < 12 also have the above bug. #if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump index 429ddfeecaa..bb626e049f0 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-tuple.h.pump @@ -29,11 +29,12 @@ $$ This meta comment fixes auto-indentation in Emacs. }} // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Implements a subset of TR1 tuple needed by Google Test and Google Mock. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ @@ -41,7 +42,7 @@ $$ This meta comment fixes auto-indentation in Emacs. }} // The compiler used in Symbian has a bug that prevents us from declaring the // tuple template as a friend (it complains that tuple is redefined). This -// hack bypasses the bug by declaring the members that should otherwise be +// bypasses the bug by declaring the members that should otherwise be // private as public. // Sun Studio versions < 12 also have the above bug. 
#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h index e46f7cfcb48..28e41124536 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h @@ -30,8 +30,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Type utilities needed for implementing typed and type-parameterized // tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -41,6 +40,8 @@ // Please contact googletestframework@googlegroups.com if you need // more. +// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -57,6 +58,22 @@ namespace testing { namespace internal { +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + // GetTypeName() returns a human-readable name of type T. // NB: This function is also used in Google Mock, so don't move it inside of // the typed-test-only section below. @@ -75,7 +92,7 @@ std::string GetTypeName() { char* const readable_name = __cxa_demangle(name, 0, 0, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); - return name_str; + return CanonicalizeForStdLibVersioning(name_str); # else return name; # endif // GTEST_HAS_CXXABI_H_ || __HP_aCC diff --git a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump index 251fdf025b2..0001a5d39df 100644 --- a/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump +++ b/libs/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h.pump @@ -28,8 +28,7 @@ $var n = 50 $$ Maximum length of type lists we want to support. // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + // Type utilities needed for implementing typed and type-parameterized // tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! @@ -39,6 +38,8 @@ $var n = 50 $$ Maximum length of type lists we want to support. // Please contact googletestframework@googlegroups.com if you need // more. 
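// (Editorial sketch, not patch content.) Worked example of the
// canonicalization added above:
//
//   CanonicalizeForStdLibVersioning("std::__1::basic_string<char>")
//       == "std::basic_string<char>"   // libc++ inline namespace erased
//   CanonicalizeForStdLibVersioning("MyType")
//       == "MyType"                    // names outside std pass through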
+// GOOGLETEST_CM0001 DO NOT DELETE + #ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ #define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ @@ -55,6 +56,22 @@ $var n = 50 $$ Maximum length of type lists we want to support. namespace testing { namespace internal { +// Canonicalizes a given name with respect to the Standard C++ Library. +// This handles removing the inline namespace within `std` that is +// used by various standard libraries (e.g., `std::__1`). Names outside +// of namespace std are returned unmodified. +inline std::string CanonicalizeForStdLibVersioning(std::string s) { + static const char prefix[] = "std::__"; + if (s.compare(0, strlen(prefix), prefix) == 0) { + std::string::size_type end = s.find("::", strlen(prefix)); + if (end != s.npos) { + // Erase everything between the initial `std` and the second `::`. + s.erase(strlen("std"), end - strlen("std")); + } + } + return s; +} + // GetTypeName() returns a human-readable name of type T. // NB: This function is also used in Google Mock, so don't move it inside of // the typed-test-only section below. @@ -73,7 +90,7 @@ std::string GetTypeName() { char* const readable_name = __cxa_demangle(name, 0, 0, &status); const std::string name_str(status == 0 ? readable_name : name); free(readable_name); - return name_str; + return CanonicalizeForStdLibVersioning(name_str); # else return name; # endif // GTEST_HAS_CXXABI_H_ || __HP_aCC diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc index 0a9cee52233..b217a18006b 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-all.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-all.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: mheule@google.com (Markus Heule) -// -// Google C++ Testing Framework (Google Test) +// Google C++ Testing and Mocking Framework (Google Test) // // Sometimes it's desirable to build Google Test by compiling a single file. // This file serves this purpose. diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc index a01a3698308..09083551612 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-death-test.cc @@ -26,8 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) + // // This file implements death tests. @@ -62,26 +61,30 @@ # include # endif // GTEST_OS_QNX +# if GTEST_OS_FUCHSIA +# include +# include +# include +# include +# include +# endif // GTEST_OS_FUCHSIA + #endif // GTEST_HAS_DEATH_TEST #include "gtest/gtest-message.h" #include "gtest/internal/gtest-string.h" - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick exists to -// prevent the accidental inclusion of gtest-internal-inl.h in the -// user's code. 
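// (Editorial note, not patch content.) The comment above and the
// #define/#undef pair just below are deleted together: gtest-internal-inl.h
// previously #error'ed when included without GTEST_IMPLEMENTATION_, and a
// later hunk of this patch removes that check, so the inclusion dance
//
//   #define GTEST_IMPLEMENTATION_ 1
//   #include "src/gtest-internal-inl.h"
//   #undef GTEST_IMPLEMENTATION_
//
// is no longer needed.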
-#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ namespace testing { // Constants. // The default death test style. -static const char kDefaultDeathTestStyle[] = "fast"; +// +// This is defined in internal/gtest-port.h as "fast", but can be overridden by +// a definition in internal/custom/gtest-port.h. The recommended value, which is +// used internally at Google, is "threadsafe". +static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE; GTEST_DEFINE_string_( death_test_style, @@ -121,7 +124,7 @@ namespace internal { // Valid only for fast death tests. Indicates the code is running in the // child process of a fast style death test. -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA static bool g_in_fast_death_test_child = false; # endif @@ -131,10 +134,10 @@ static bool g_in_fast_death_test_child = false; // tests. IMPORTANT: This is an internal utility. Using it may break the // implementation of death tests. User code MUST NOT use it. bool InDeathTestChild() { -# if GTEST_OS_WINDOWS +# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA - // On Windows, death tests are thread-safe regardless of the value of the - // death_test_style flag. + // On Windows and Fuchsia, death tests are thread-safe regardless of the value + // of the death_test_style flag. return !GTEST_FLAG(internal_run_death_test).empty(); # else @@ -154,7 +157,7 @@ ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { // ExitedWithCode function-call operator. bool ExitedWithCode::operator()(int exit_status) const { -# if GTEST_OS_WINDOWS +# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return exit_status == exit_code_; @@ -162,10 +165,10 @@ bool ExitedWithCode::operator()(int exit_status) const { return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; -# endif // GTEST_OS_WINDOWS +# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA } -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // KilledBySignal constructor. KilledBySignal::KilledBySignal(int signum) : signum_(signum) { } @@ -182,7 +185,7 @@ bool KilledBySignal::operator()(int exit_status) const { # endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_) return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; } -# endif // !GTEST_OS_WINDOWS +# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA namespace internal { @@ -193,7 +196,7 @@ namespace internal { static std::string ExitSummary(int exit_code) { Message m; -# if GTEST_OS_WINDOWS +# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA m << "Exited with exit status " << exit_code; @@ -209,7 +212,7 @@ static std::string ExitSummary(int exit_code) { m << " (core dumped)"; } # endif -# endif // GTEST_OS_WINDOWS +# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA return m.GetString(); } @@ -220,7 +223,7 @@ bool ExitedUnsuccessfully(int exit_status) { return !ExitedWithCode(0)(exit_status); } -# if !GTEST_OS_WINDOWS +# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Generates a textual failure message when a death test finds more than // one thread running, or cannot determine the number of threads, prior // to executing the given statement. It is the responsibility of the @@ -229,13 +232,19 @@ static std::string DeathTestThreadWarning(size_t thread_count) { Message msg; msg << "Death tests use fork(), which is unsafe particularly" << " in a threaded context. 
For this test, " << GTEST_NAME_ << " "; - if (thread_count == 0) + if (thread_count == 0) { msg << "couldn't detect the number of threads."; - else + } else { msg << "detected " << thread_count << " threads."; + } + msg << " See " + "https://github.com/google/googletest/blob/master/googletest/docs/" + "advanced.md#death-tests-and-threads" + << " for more explanation and suggested solutions, especially if" + << " this is the last message you see before your test times out."; return msg.GetString(); } -# endif // !GTEST_OS_WINDOWS +# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA // Flag characters for reporting a death test that did not die. static const char kDeathTestLived = 'L'; @@ -243,6 +252,13 @@ static const char kDeathTestReturned = 'R'; static const char kDeathTestThrew = 'T'; static const char kDeathTestInternalError = 'I'; +#if GTEST_OS_FUCHSIA + +// File descriptor used for the pipe in the child process. +static const int kFuchsiaReadPipeFd = 3; + +#endif + // An enumeration describing all of the possible ways that a death test can // conclude. DIED means that the process died while executing the test // code; LIVED means that process lived beyond the end of the test code; @@ -250,7 +266,7 @@ static const char kDeathTestInternalError = 'I'; // statement, which is not allowed; THREW means that the test statement // returned control by throwing an exception. IN_PROGRESS means the test // has not yet concluded. -// TODO(vladl@google.com): Unify names and possibly values for +// FIXME: Unify names and possibly values for // AbortReason, DeathTestOutcome, and flag characters above. enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; @@ -259,7 +275,7 @@ enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW }; // message is propagated back to the parent process. Otherwise, the // message is simply printed to stderr. In either case, the program // then exits with status 1. -void DeathTestAbort(const std::string& message) { +static void DeathTestAbort(const std::string& message) { // On a POSIX system, this function may be called from a threadsafe-style // death test child process, which operates on a very small stack. Use // the heap for any additional non-minuscule memory requirements. @@ -563,7 +579,12 @@ bool DeathTestImpl::Passed(bool status_ok) { break; case DIED: if (status_ok) { +# if GTEST_USES_PCRE + // PCRE regexes support embedded NULs. + const bool matched = RE::PartialMatch(error_message, *regex()); +# else const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); +# endif // GTEST_USES_PCRE if (matched) { success = true; } else { @@ -779,7 +800,200 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() { set_spawned(true); return OVERSEE_TEST; } -# else // We are not on Windows. + +# elif GTEST_OS_FUCHSIA + +class FuchsiaDeathTest : public DeathTestImpl { + public: + FuchsiaDeathTest(const char* a_statement, + const RE* a_regex, + const char* file, + int line) + : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + virtual ~FuchsiaDeathTest() { + zx_status_t status = zx_handle_close(child_process_); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + status = zx_handle_close(port_); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + } + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. 
+ const int line_; + + zx_handle_t child_process_ = ZX_HANDLE_INVALID; + zx_handle_t port_ = ZX_HANDLE_INVALID; +}; + +// Utility class for accumulating command-line arguments. +class Arguments { + public: + Arguments() { + args_.push_back(NULL); + } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); + ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { + return &args_[0]; + } + + int size() { + return args_.size() - 1; + } + + private: + std::vector args_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int FuchsiaDeathTest::Wait() { + if (!spawned()) + return 0; + + // Register to wait for the child process to terminate. + zx_status_t status_zx; + status_zx = zx_object_wait_async(child_process_, + port_, + 0 /* key */, + ZX_PROCESS_TERMINATED, + ZX_WAIT_ASYNC_ONCE); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Wait for it to terminate, or an exception to be received. + zx_port_packet_t packet; + status_zx = zx_port_wait(port_, ZX_TIME_INFINITE, &packet); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + if (ZX_PKT_IS_EXCEPTION(packet.type)) { + // Process encountered an exception. Kill it directly rather than letting + // other handlers process the event. + status_zx = zx_task_kill(child_process_); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + // Now wait for |child_process_| to terminate. + zx_signals_t signals = 0; + status_zx = zx_object_wait_one( + child_process_, ZX_PROCESS_TERMINATED, ZX_TIME_INFINITE, &signals); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + GTEST_DEATH_TEST_CHECK_(signals & ZX_PROCESS_TERMINATED); + } else { + // Process terminated. + GTEST_DEATH_TEST_CHECK_(ZX_PKT_IS_SIGNAL_ONE(packet.type)); + GTEST_DEATH_TEST_CHECK_(packet.signal.observed & ZX_PROCESS_TERMINATED); + } + + ReadAndInterpretStatusByte(); + + zx_info_process_t buffer; + status_zx = zx_object_get_info( + child_process_, + ZX_INFO_PROCESS, + &buffer, + sizeof(buffer), + nullptr, + nullptr); + GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK); + + GTEST_DEATH_TEST_CHECK_(buffer.exited); + set_status(buffer.return_code); + return status(); +} + +// The AssumeRole process for a Fuchsia death test. It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole FuchsiaDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(kFuchsiaReadPipeFd); + return EXECUTE_TEST; + } + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // Build the child process command line. 
+ const std::string filter_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kFilterFlag + "=" + + info->test_case_name() + "." + info->name(); + const std::string internal_flag = + std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "=" + + file_ + "|" + + StreamableToString(line_) + "|" + + StreamableToString(death_test_index); + Arguments args; + args.AddArguments(GetInjectableArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + // Build the pipe for communication with the child. + zx_status_t status; + zx_handle_t child_pipe_handle; + uint32_t type; + status = fdio_pipe_half(&child_pipe_handle, &type); + GTEST_DEATH_TEST_CHECK_(status >= 0); + set_read_fd(status); + + // Set the pipe handle for the child. + fdio_spawn_action_t add_handle_action = {}; + add_handle_action.action = FDIO_SPAWN_ACTION_ADD_HANDLE; + add_handle_action.h.id = PA_HND(type, kFuchsiaReadPipeFd); + add_handle_action.h.handle = child_pipe_handle; + + // Spawn the child process. + status = fdio_spawn_etc(ZX_HANDLE_INVALID, FDIO_SPAWN_CLONE_ALL, + args.Argv()[0], args.Argv(), nullptr, 1, + &add_handle_action, &child_process_, nullptr); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + // Create an exception port and attach it to the |child_process_|, to allow + // us to suppress the system default exception handler from firing. + status = zx_port_create(0, &port_); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + status = zx_task_bind_exception_port( + child_process_, port_, 0 /* key */, 0 /*options */); + GTEST_DEATH_TEST_CHECK_(status == ZX_OK); + + set_spawned(true); + return OVERSEE_TEST; +} + +#else // We are neither on Windows, nor on Fuchsia. // ForkingDeathTest provides implementations for most of the abstract // methods of the DeathTest interface. Only the AssumeRole method is @@ -883,11 +1097,10 @@ class ExecDeathTest : public ForkingDeathTest { ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } virtual TestRole AssumeRole(); private: - static ::std::vector - GetArgvsForDeathTestChildProcess() { - ::std::vector args = GetInjectableArgvs(); + static ::std::vector GetArgvsForDeathTestChildProcess() { + ::std::vector args = GetInjectableArgvs(); # if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) - ::std::vector extra_args = + ::std::vector extra_args = GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_(); args.insert(args.end(), extra_args.begin(), extra_args.end()); # endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_) @@ -986,6 +1199,7 @@ static int ExecDeathTestChildMain(void* child_arg) { } # endif // !GTEST_OS_QNX +# if GTEST_HAS_CLONE // Two utility routines that together determine the direction the stack // grows. // This could be accomplished more elegantly by a single recursive @@ -995,20 +1209,22 @@ static int ExecDeathTestChildMain(void* child_arg) { // GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining // StackLowerThanAddress into StackGrowsDown, which then doesn't give // correct answer. -void StackLowerThanAddress(const void* ptr, bool* result) GTEST_NO_INLINE_; -void StackLowerThanAddress(const void* ptr, bool* result) { +static void StackLowerThanAddress(const void* ptr, + bool* result) GTEST_NO_INLINE_; +static void StackLowerThanAddress(const void* ptr, bool* result) { int dummy; *result = (&dummy < ptr); } // Make sure AddressSanitizer does not tamper with the stack here. 
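// (Editorial sketch; binary, test, and file names here are hypothetical.)
// Given the flag-building above, a Fuchsia death-test child for
// MyTest.DiesOnBadInput at foo_test.cc:42, death test index 1, is spawned
// roughly as:
//
//   ./foo_test --gtest_filter=MyTest.DiesOnBadInput \
//              --gtest_internal_run_death_test=foo_test.cc|42|1
//
// (three '|'-separated fields on Fuchsia; the POSIX exec path below
// appends a fourth).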
GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ -bool StackGrowsDown() { +static bool StackGrowsDown() { int dummy; bool result; StackLowerThanAddress(&dummy, &result); return result; } +# endif // GTEST_HAS_CLONE // Spawns a child process with the same executable as the current process in // a thread-safe manner and instructs it to run the death test. The @@ -1200,6 +1416,13 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, *test = new WindowsDeathTest(statement, regex, file, line); } +# elif GTEST_OS_FUCHSIA + + if (GTEST_FLAG(death_test_style) == "threadsafe" || + GTEST_FLAG(death_test_style) == "fast") { + *test = new FuchsiaDeathTest(statement, regex, file, line); + } + # else if (GTEST_FLAG(death_test_style) == "threadsafe") { @@ -1224,7 +1447,7 @@ bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, // Recreates the pipe and event handles from the provided parameters, // signals the event, and returns a file descriptor wrapped around the pipe // handle. This function is called in the child process only. -int GetStatusFileDescriptor(unsigned int parent_process_id, +static int GetStatusFileDescriptor(unsigned int parent_process_id, size_t write_handle_as_size_t, size_t event_handle_as_size_t) { AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, @@ -1235,7 +1458,7 @@ int GetStatusFileDescriptor(unsigned int parent_process_id, StreamableToString(parent_process_id)); } - // TODO(vladl@google.com): Replace the following check with a + // FIXME: Replace the following check with a // compile-time assertion when available. GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); @@ -1243,7 +1466,7 @@ int GetStatusFileDescriptor(unsigned int parent_process_id, reinterpret_cast(write_handle_as_size_t); HANDLE dup_write_handle; - // The newly initialized handle is accessible only in in the parent + // The newly initialized handle is accessible only in the parent // process. To obtain one accessible within the child, we need to use // DuplicateHandle. if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, @@ -1320,6 +1543,16 @@ InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t, event_handle_as_size_t); + +# elif GTEST_OS_FUCHSIA + + if (fields.size() != 3 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index)) { + DeathTestAbort("Bad --gtest_internal_run_death_test flag: " + + GTEST_FLAG(internal_run_death_test)); + } + # else if (fields.size() != 4 diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc index 0292dc11957..a7e65c082a7 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-filepath.cc @@ -26,14 +26,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Authors: keith.ray@gmail.com (Keith Ray) -#include "gtest/gtest-message.h" #include "gtest/internal/gtest-filepath.h" -#include "gtest/internal/gtest-port.h" #include +#include "gtest/internal/gtest-port.h" +#include "gtest/gtest-message.h" #if GTEST_OS_WINDOWS_MOBILE # include @@ -48,6 +46,8 @@ # include // Some Linux distributions define PATH_MAX here. 
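// (Editorial sketch, not patch content.) The factory dispatch above is
// where the style names become user-visible. A run of a hypothetical test
// binary can pick a style on the command line,
//
//   ./foo_test --gtest_death_test_style=threadsafe
//
// or programmatically before RUN_ALL_TESTS():
//
//   ::testing::GTEST_FLAG(death_test_style) = "threadsafe";
//
// and, via the new GTEST_DEFAULT_DEATH_TEST_STYLE hook, an embedder can
// change the compiled-in default from "fast" in
// internal/custom/gtest-port.h.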
#endif // GTEST_OS_WINDOWS_MOBILE +#include "gtest/internal/gtest-string.h" + #if GTEST_OS_WINDOWS # define GTEST_PATH_MAX_ _MAX_PATH #elif defined(PATH_MAX) @@ -58,8 +58,6 @@ # define GTEST_PATH_MAX_ _POSIX_PATH_MAX #endif // GTEST_OS_WINDOWS -#include "gtest/internal/gtest-string.h" - namespace testing { namespace internal { @@ -130,7 +128,7 @@ FilePath FilePath::RemoveExtension(const char* extension) const { return *this; } -// Returns a pointer to the last occurence of a valid path separator in +// Returns a pointer to the last occurrence of a valid path separator in // the FilePath. On Windows, for example, both '/' and '\' are valid path // separators. Returns NULL if no path separator was found. const char* FilePath::FindLastPathSeparator() const { @@ -252,7 +250,7 @@ bool FilePath::DirectoryExists() const { // root directory per disk drive.) bool FilePath::IsRootDirectory() const { #if GTEST_OS_WINDOWS - // TODO(wan@google.com): on Windows a network share like + // FIXME: on Windows a network share like // \\server\share can be a root directory, although it cannot be the // current directory. Handle this properly. return pathname_.length() == 3 && IsAbsolutePath(); @@ -352,7 +350,7 @@ FilePath FilePath::RemoveTrailingPathSeparator() const { // Removes any redundant separators that might be in the pathname. // For example, "bar///foo" becomes "bar/foo". Does not eliminate other // redundancies that might be in a pathname involving "." or "..". -// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). +// FIXME: handle Windows network shares (e.g. \\server\share). void FilePath::Normalize() { if (pathname_.c_str() == NULL) { pathname_ = ""; diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h index ed8a682a964..479004149b4 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h +++ b/libs/libvpx/third_party/googletest/src/src/gtest-internal-inl.h @@ -27,24 +27,13 @@ // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// Utility functions and classes used by the Google C++ testing framework. -// -// Author: wan@google.com (Zhanyong Wan) -// +// Utility functions and classes used by the Google C++ testing framework.// // This file contains purely Google Test's internal implementation. Please // DO NOT #INCLUDE IT IN A USER PROGRAM. #ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ #define GTEST_SRC_GTEST_INTERNAL_INL_H_ -// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is -// part of Google Test's implementation; otherwise it's undefined. -#if !GTEST_IMPLEMENTATION_ -// If this file is included from the user's code, just say no. -# error "gtest-internal-inl.h is part of Google Test's internal implementation." -# error "It must not be included except by Google Test itself." -#endif // GTEST_IMPLEMENTATION_ - #ifndef _WIN32_WCE # include #endif // !_WIN32_WCE @@ -67,9 +56,12 @@ # include // NOLINT #endif // GTEST_OS_WINDOWS -#include "gtest/gtest.h" // NOLINT +#include "gtest/gtest.h" #include "gtest/gtest-spi.h" +GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \ +/* class A needs to have dll-interface to be used by clients of class B */) + namespace testing { // Declares the flags. 
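// (Editorial sketch, not patch content.) Behavior of the FilePath helpers
// touched above, as documented by their comments:
//
//   FilePath("bar///foo")               // stored normalized as "bar/foo";
//                                       // "." and ".." are preserved
//   FilePath("c:\\").IsRootDirectory()  // true on Windows (length 3 and
//                                       // absolute); "/" elsewhere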
@@ -94,6 +86,7 @@ const char kFilterFlag[] = "filter"; const char kListTestsFlag[] = "list_tests"; const char kOutputFlag[] = "output"; const char kPrintTimeFlag[] = "print_time"; +const char kPrintUTF8Flag[] = "print_utf8"; const char kRandomSeedFlag[] = "random_seed"; const char kRepeatFlag[] = "repeat"; const char kShuffleFlag[] = "shuffle"; @@ -174,6 +167,7 @@ class GTestFlagSaver { list_tests_ = GTEST_FLAG(list_tests); output_ = GTEST_FLAG(output); print_time_ = GTEST_FLAG(print_time); + print_utf8_ = GTEST_FLAG(print_utf8); random_seed_ = GTEST_FLAG(random_seed); repeat_ = GTEST_FLAG(repeat); shuffle_ = GTEST_FLAG(shuffle); @@ -195,6 +189,7 @@ class GTestFlagSaver { GTEST_FLAG(list_tests) = list_tests_; GTEST_FLAG(output) = output_; GTEST_FLAG(print_time) = print_time_; + GTEST_FLAG(print_utf8) = print_utf8_; GTEST_FLAG(random_seed) = random_seed_; GTEST_FLAG(repeat) = repeat_; GTEST_FLAG(shuffle) = shuffle_; @@ -216,6 +211,7 @@ class GTestFlagSaver { bool list_tests_; std::string output_; bool print_time_; + bool print_utf8_; internal::Int32 random_seed_; internal::Int32 repeat_; bool shuffle_; @@ -426,7 +422,7 @@ class OsStackTraceGetterInterface { // in the trace. // skip_count - the number of top frames to be skipped; doesn't count // against max_depth. - virtual string CurrentStackTrace(int max_depth, int skip_count) = 0; + virtual std::string CurrentStackTrace(int max_depth, int skip_count) = 0; // UponLeavingGTest() should be called immediately before Google Test calls // user code. It saves some information about the current stack that @@ -446,10 +442,20 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface { public: OsStackTraceGetter() {} - virtual string CurrentStackTrace(int max_depth, int skip_count); + virtual std::string CurrentStackTrace(int max_depth, int skip_count); virtual void UponLeavingGTest(); private: +#if GTEST_HAS_ABSL + Mutex mutex_; // Protects all internal state. + + // We save the stack frame below the frame that calls user code. + // We do this because the address of the frame immediately below + // the user code changes between the call to UponLeavingGTest() + // and any calls to the stack trace code from within the user code. + void* caller_frame_ = nullptr; +#endif // GTEST_HAS_ABSL + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); }; @@ -664,13 +670,11 @@ class GTEST_API_ UnitTestImpl { tear_down_tc)->AddTestInfo(test_info); } -#if GTEST_HAS_PARAM_TEST // Returns ParameterizedTestCaseRegistry object used to keep track of // value-parameterized tests and instantiate and register them. internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { return parameterized_test_registry_; } -#endif // GTEST_HAS_PARAM_TEST // Sets the TestCase object for the test that's currently running. void set_current_test_case(TestCase* a_current_test_case) { @@ -845,14 +849,12 @@ class GTEST_API_ UnitTestImpl { // shuffled order. std::vector test_case_indices_; -#if GTEST_HAS_PARAM_TEST // ParameterizedTestRegistry object used to register value-parameterized // tests. internal::ParameterizedTestCaseRegistry parameterized_test_registry_; // Indicates whether RegisterParameterizedTests() has been called already. bool parameterized_tests_registered_; -#endif // GTEST_HAS_PARAM_TEST // Index of the last death test case registered. Initially -1. 
int last_death_test_case_; @@ -992,7 +994,7 @@ bool ParseNaturalNumber(const ::std::string& str, Integer* number) { const bool parse_success = *end == '\0' && errno == 0; - // TODO(vladl@google.com): Convert this to compile time assertion when it is + // FIXME: Convert this to compile time assertion when it is // available. GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); @@ -1032,7 +1034,7 @@ class TestResultAccessor { #if GTEST_CAN_STREAM_RESULTS_ // Streams test results to the given port on the given host machine. -class GTEST_API_ StreamingListener : public EmptyTestEventListener { +class StreamingListener : public EmptyTestEventListener { public: // Abstract base class for writing strings to a socket. class AbstractSocketWriter { @@ -1040,21 +1042,19 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { virtual ~AbstractSocketWriter() {} // Sends a string to the socket. - virtual void Send(const string& message) = 0; + virtual void Send(const std::string& message) = 0; // Closes the socket. virtual void CloseConnection() {} // Sends a string and a newline to the socket. - void SendLn(const string& message) { - Send(message + "\n"); - } + void SendLn(const std::string& message) { Send(message + "\n"); } }; // Concrete class for actually writing strings to a socket. class SocketWriter : public AbstractSocketWriter { public: - SocketWriter(const string& host, const string& port) + SocketWriter(const std::string& host, const std::string& port) : sockfd_(-1), host_name_(host), port_num_(port) { MakeConnection(); } @@ -1065,7 +1065,7 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { } // Sends a string to the socket. - virtual void Send(const string& message) { + virtual void Send(const std::string& message) { GTEST_CHECK_(sockfd_ != -1) << "Send() can be called only when there is a connection."; @@ -1091,17 +1091,19 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { } int sockfd_; // socket file descriptor - const string host_name_; - const string port_num_; + const std::string host_name_; + const std::string port_num_; GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter); }; // class SocketWriter // Escapes '=', '&', '%', and '\n' characters in str as "%xx". - static string UrlEncode(const char* str); + static std::string UrlEncode(const char* str); - StreamingListener(const string& host, const string& port) - : socket_writer_(new SocketWriter(host, port)) { Start(); } + StreamingListener(const std::string& host, const std::string& port) + : socket_writer_(new SocketWriter(host, port)) { + Start(); + } explicit StreamingListener(AbstractSocketWriter* socket_writer) : socket_writer_(socket_writer) { Start(); } @@ -1162,13 +1164,13 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { private: // Sends the given message and a newline to the socket. - void SendLn(const string& message) { socket_writer_->SendLn(message); } + void SendLn(const std::string& message) { socket_writer_->SendLn(message); } // Called at the start of streaming to notify the receiver what // protocol we are using. void Start() { SendLn("gtest_streaming_protocol_version=1.0"); } - string FormatBool(bool value) { return value ? "1" : "0"; } + std::string FormatBool(bool value) { return value ? 
"1" : "0"; } const scoped_ptr socket_writer_; @@ -1180,4 +1182,6 @@ class GTEST_API_ StreamingListener : public EmptyTestEventListener { } // namespace internal } // namespace testing +GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251 + #endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-port.cc b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc index e5bf3dd2be4..fecb5d11c21 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-port.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-port.cc @@ -26,8 +26,7 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + #include "gtest/internal/gtest-port.h" @@ -63,19 +62,16 @@ # include #endif // GTEST_OS_AIX +#if GTEST_OS_FUCHSIA +# include +# include +#endif // GTEST_OS_FUCHSIA + #include "gtest/gtest-spi.h" #include "gtest/gtest-message.h" #include "gtest/internal/gtest-internal.h" #include "gtest/internal/gtest-string.h" - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick exists to -// prevent the accidental inclusion of gtest-internal-inl.h in the -// user's code. -#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ namespace testing { namespace internal { @@ -93,7 +89,7 @@ const int kStdErrFileno = STDERR_FILENO; namespace { template -T ReadProcFileField(const string& filename, int field) { +T ReadProcFileField(const std::string& filename, int field) { std::string dummy; std::ifstream file(filename.c_str()); while (field-- > 0) { @@ -107,7 +103,7 @@ T ReadProcFileField(const string& filename, int field) { // Returns the number of active threads, or 0 when there is an error. size_t GetThreadCount() { - const string filename = + const std::string filename = (Message() << "/proc/" << getpid() << "/stat").GetString(); return ReadProcFileField(filename, 19); } @@ -164,6 +160,25 @@ size_t GetThreadCount() { } } +#elif GTEST_OS_FUCHSIA + +size_t GetThreadCount() { + int dummy_buffer; + size_t avail; + zx_status_t status = zx_object_get_info( + zx_process_self(), + ZX_INFO_PROCESS_THREADS, + &dummy_buffer, + 0, + nullptr, + &avail); + if (status == ZX_OK) { + return avail; + } else { + return 0; + } +} + #else size_t GetThreadCount() { @@ -246,9 +261,9 @@ Mutex::Mutex() Mutex::~Mutex() { // Static mutexes are leaked intentionally. It is not thread-safe to try // to clean them up. - // TODO(yukawa): Switch to Slim Reader/Writer (SRW) Locks, which requires + // FIXME: Switch to Slim Reader/Writer (SRW) Locks, which requires // nothing to clean it up but is available only on Vista and later. - // http://msdn.microsoft.com/en-us/library/windows/desktop/aa904937.aspx + // https://docs.microsoft.com/en-us/windows/desktop/Sync/slim-reader-writer--srw--locks if (type_ == kDynamic) { ::DeleteCriticalSection(critical_section_); delete critical_section_; @@ -279,6 +294,43 @@ void Mutex::AssertHeld() { << "The current thread is not holding the mutex @" << this; } +namespace { + +// Use the RAII idiom to flag mem allocs that are intentionally never +// deallocated. 
The motivation is to silence the false positive mem leaks +// that are reported by the debug version of MS's CRT which can only detect +// if an alloc is missing a matching deallocation. +// Example: +// MemoryIsNotDeallocated memory_is_not_deallocated; +// critical_section_ = new CRITICAL_SECTION; +// +class MemoryIsNotDeallocated +{ + public: + MemoryIsNotDeallocated() : old_crtdbg_flag_(0) { +#ifdef _MSC_VER + old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG); + // Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT + // doesn't report mem leak if there's no matching deallocation. + _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF); +#endif // _MSC_VER + } + + ~MemoryIsNotDeallocated() { +#ifdef _MSC_VER + // Restore the original _CRTDBG_ALLOC_MEM_DF flag + _CrtSetDbgFlag(old_crtdbg_flag_); +#endif // _MSC_VER + } + + private: + int old_crtdbg_flag_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated); +}; + +} // namespace + // Initializes owner_thread_id_ and critical_section_ in static mutexes. void Mutex::ThreadSafeLazyInit() { // Dynamic mutexes are initialized in the constructor. @@ -289,7 +341,11 @@ void Mutex::ThreadSafeLazyInit() { // If critical_section_init_phase_ was 0 before the exchange, we // are the first to test it and need to perform the initialization. owner_thread_id_ = 0; - critical_section_ = new CRITICAL_SECTION; + { + // Use RAII to flag that following mem alloc is never deallocated. + MemoryIsNotDeallocated memory_is_not_deallocated; + critical_section_ = new CRITICAL_SECTION; + } ::InitializeCriticalSection(critical_section_); // Updates the critical_section_init_phase_ to 2 to signal // initialization complete. @@ -328,7 +384,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase { Notification* thread_can_start) { ThreadMainParam* param = new ThreadMainParam(runnable, thread_can_start); DWORD thread_id; - // TODO(yukawa): Consider to use _beginthreadex instead. + // FIXME: Consider to use _beginthreadex instead. HANDLE thread_handle = ::CreateThread( NULL, // Default security. 0, // Default stack size. @@ -496,7 +552,7 @@ class ThreadLocalRegistryImpl { FALSE, thread_id); GTEST_CHECK_(thread != NULL); - // We need to to pass a valid thread ID pointer into CreateThread for it + // We need to pass a valid thread ID pointer into CreateThread for it // to work correctly under Win98. DWORD watcher_thread_id; HANDLE watcher_thread = ::CreateThread( @@ -531,7 +587,8 @@ class ThreadLocalRegistryImpl { // Returns map of thread local instances. static ThreadIdToThreadLocals* GetThreadLocalsMapLocked() { mutex_.AssertHeld(); - static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals; + MemoryIsNotDeallocated memory_is_not_deallocated; + static ThreadIdToThreadLocals* map = new ThreadIdToThreadLocals(); return map; } @@ -671,7 +728,7 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { } // Helper function used by ValidateRegex() to format error messages. -std::string FormatRegexSyntaxError(const char* regex, int index) { +static std::string FormatRegexSyntaxError(const char* regex, int index) { return (Message() << "Syntax error at index " << index << " in simple regular expression \"" << regex << "\": ").GetString(); } @@ -680,7 +737,7 @@ std::string FormatRegexSyntaxError(const char* regex, int index) { // otherwise returns true. 
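For readers of ValidateRegex() below: Google Test's death tests fall back to this deliberately small regex dialect on platforms without POSIX regex support. A hedged usage sketch (test and helper names invented), assuming the documented dialect of single-character atoms ('.', '\d', '\w', '\s', ...), anchors '^' and '$', and the repeats '*', '+', '?', with no alternation or bracket expressions:

#include <cstdio>
#include <cstdlib>
#include "gtest/gtest.h"

static void CrashWithMessage() {
  std::fprintf(stderr, "out of range: 42\n");
  std::abort();
}

TEST(SimpleRegexDemoDeathTest, MessageMatchesSimpleRegex) {
  // "\\d+" is a valid simple regex; a bracket expression such as "[0-9]+"
  // would be rejected by ValidateRegex() on the fallback implementation.
  EXPECT_DEATH(CrashWithMessage(), "out of range: \\d+");
}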
bool ValidateRegex(const char* regex) { if (regex == NULL) { - // TODO(wan@google.com): fix the source file location in the + // FIXME: fix the source file location in the // assertion failures to match where the regex is used in user // code. ADD_FAILURE() << "NULL is not a valid simple regular expression."; @@ -923,9 +980,10 @@ GTestLog::~GTestLog() { posix::Abort(); } } + // Disable Microsoft deprecation warnings for POSIX functions called from // this class (creat, dup, dup2, and close) -GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) +GTEST_DISABLE_MSC_DEPRECATED_PUSH_() #if GTEST_HAS_STREAM_REDIRECTION @@ -1009,13 +1067,14 @@ class CapturedStream { GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); }; -GTEST_DISABLE_MSC_WARNINGS_POP_() +GTEST_DISABLE_MSC_DEPRECATED_POP_() static CapturedStream* g_captured_stderr = NULL; static CapturedStream* g_captured_stdout = NULL; // Starts capturing an output stream (stdout/stderr). -void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { +static void CaptureStream(int fd, const char* stream_name, + CapturedStream** stream) { if (*stream != NULL) { GTEST_LOG_(FATAL) << "Only one " << stream_name << " capturer can exist at a time."; @@ -1024,7 +1083,7 @@ void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { } // Stops capturing the output stream and returns the captured string. -std::string GetCapturedStream(CapturedStream** captured_stream) { +static std::string GetCapturedStream(CapturedStream** captured_stream) { const std::string content = (*captured_stream)->GetCapturedString(); delete *captured_stream; @@ -1055,23 +1114,9 @@ std::string GetCapturedStderr() { #endif // GTEST_HAS_STREAM_REDIRECTION -std::string TempDir() { -#if GTEST_OS_WINDOWS_MOBILE - return "\\temp\\"; -#elif GTEST_OS_WINDOWS - const char* temp_dir = posix::GetEnv("TEMP"); - if (temp_dir == NULL || temp_dir[0] == '\0') - return "\\temp\\"; - else if (temp_dir[strlen(temp_dir) - 1] == '\\') - return temp_dir; - else - return std::string(temp_dir) + "\\"; -#elif GTEST_OS_LINUX_ANDROID - return "/sdcard/"; -#else - return "/tmp/"; -#endif // GTEST_OS_WINDOWS_MOBILE -} + + + size_t GetFileSize(FILE* file) { fseek(file, 0, SEEK_END); @@ -1101,22 +1146,36 @@ std::string ReadEntireFile(FILE* file) { } #if GTEST_HAS_DEATH_TEST +static const std::vector* g_injected_test_argvs = NULL; // Owned. -static const ::std::vector* g_injected_test_argvs = - NULL; // Owned. 
- -void SetInjectableArgvs(const ::std::vector* argvs) { - if (g_injected_test_argvs != argvs) - delete g_injected_test_argvs; - g_injected_test_argvs = argvs; -} - -const ::std::vector& GetInjectableArgvs() { +std::vector<std::string> GetInjectableArgvs() { if (g_injected_test_argvs != NULL) { return *g_injected_test_argvs; } return GetArgvs(); } + +void SetInjectableArgvs(const std::vector<std::string>* new_argvs) { + if (g_injected_test_argvs != new_argvs) delete g_injected_test_argvs; + g_injected_test_argvs = new_argvs; +} + +void SetInjectableArgvs(const std::vector<std::string>& new_argvs) { + SetInjectableArgvs( + new std::vector<std::string>(new_argvs.begin(), new_argvs.end())); +} + +#if GTEST_HAS_GLOBAL_STRING +void SetInjectableArgvs(const std::vector< ::string>& new_argvs) { + SetInjectableArgvs( + new std::vector<std::string>(new_argvs.begin(), new_argvs.end())); +} +#endif // GTEST_HAS_GLOBAL_STRING + +void ClearInjectableArgvs() { + delete g_injected_test_argvs; + g_injected_test_argvs = NULL; +} #endif // GTEST_HAS_DEATH_TEST #if GTEST_OS_WINDOWS_MOBILE @@ -1191,11 +1250,12 @@ bool ParseInt32(const Message& src_text, const char* str, Int32* value) { bool BoolFromGTestEnv(const char* flag, bool default_value) { #if defined(GTEST_GET_BOOL_FROM_ENV_) return GTEST_GET_BOOL_FROM_ENV_(flag, default_value); -#endif // defined(GTEST_GET_BOOL_FROM_ENV_) +#else const std::string env_var = FlagToEnvVar(flag); const char* const string_value = posix::GetEnv(env_var.c_str()); return string_value == NULL ? default_value : strcmp(string_value, "0") != 0; +#endif // defined(GTEST_GET_BOOL_FROM_ENV_) } // Reads and returns a 32-bit integer stored in the environment @@ -1204,7 +1264,7 @@ bool BoolFromGTestEnv(const char* flag, bool default_value) { Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { #if defined(GTEST_GET_INT32_FROM_ENV_) return GTEST_GET_INT32_FROM_ENV_(flag, default_value); -#endif // defined(GTEST_GET_INT32_FROM_ENV_) +#else const std::string env_var = FlagToEnvVar(flag); const char* const string_value = posix::GetEnv(env_var.c_str()); if (string_value == NULL) { @@ -1222,37 +1282,36 @@ Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { } return result; +#endif // defined(GTEST_GET_INT32_FROM_ENV_) +} + +// As a special case for the 'output' flag, if GTEST_OUTPUT is not +// set, we look for XML_OUTPUT_FILE, which is set by the Bazel build +// system. The value of XML_OUTPUT_FILE is a filename without the +// "xml:" prefix of GTEST_OUTPUT. +// Note that this is meant to be called at the call site, so it does +// not check that the flag is 'output'. +// In essence this checks an env variable called XML_OUTPUT_FILE, +// and if it is set we prepend "xml:" to its value; if it is not set we return "". +std::string OutputFlagAlsoCheckEnvVar() { + std::string default_value_for_output_flag = ""; + const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE"); + if (NULL != xml_output_file_env) { + default_value_for_output_flag = std::string("xml:") + xml_output_file_env; + } + return default_value_for_output_flag; } // Reads and returns the string environment variable corresponding to // the given flag; if it's not set, returns default_value.
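For illustration, the Bazel fallback implemented by OutputFlagAlsoCheckEnvVar() above boils down to the following standalone sketch (function name invented; the real code goes through posix::GetEnv):

#include <cstdlib>
#include <string>

// If XML_OUTPUT_FILE is set (Bazel exports it as a bare filename), promote it
// to the "xml:<filename>" form that the --gtest_output flag expects.
std::string DefaultOutputFlagSketch() {
  const char* xml_output_file = std::getenv("XML_OUTPUT_FILE");
  return xml_output_file != NULL ? std::string("xml:") + xml_output_file
                                 : std::string();
}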
-std::string StringFromGTestEnv(const char* flag, const char* default_value) { +const char* StringFromGTestEnv(const char* flag, const char* default_value) { #if defined(GTEST_GET_STRING_FROM_ENV_) return GTEST_GET_STRING_FROM_ENV_(flag, default_value); -#endif // defined(GTEST_GET_STRING_FROM_ENV_) +#else const std::string env_var = FlagToEnvVar(flag); - const char* value = posix::GetEnv(env_var.c_str()); - if (value != NULL) { - return value; - } - - // As a special case for the 'output' flag, if GTEST_OUTPUT is not - // set, we look for XML_OUTPUT_FILE, which is set by the Bazel build - // system. The value of XML_OUTPUT_FILE is a filename without the - // "xml:" prefix of GTEST_OUTPUT. - // - // The net priority order after flag processing is thus: - // --gtest_output command line flag - // GTEST_OUTPUT environment variable - // XML_OUTPUT_FILE environment variable - // 'default_value' - if (strcmp(flag, "output") == 0) { - value = posix::GetEnv("XML_OUTPUT_FILE"); - if (value != NULL) { - return std::string("xml:") + value; - } - } - return default_value; + const char* const value = posix::GetEnv(env_var.c_str()); + return value == NULL ? default_value : value; +#endif // defined(GTEST_GET_STRING_FROM_ENV_) } } // namespace internal diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc index a2df412f8a2..b5022549f90 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-printers.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) -// Google Test - The Google C++ Testing Framework + +// Google Test - The Google C++ Testing and Mocking Framework // // This file implements a universal value printer that can print a // value of any type T: @@ -43,12 +42,13 @@ // defines Foo. #include "gtest/gtest-printers.h" -#include #include +#include #include #include // NOLINT #include #include "gtest/internal/gtest-port.h" +#include "src/gtest-internal-inl.h" namespace testing { @@ -89,7 +89,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, // If the object size is bigger than kThreshold, we'll have to omit // some details by printing only the first and the last kChunkSize // bytes. - // TODO(wan): let the user control the threshold using a flag. + // FIXME: let the user control the threshold using a flag. if (count < kThreshold) { PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); } else { @@ -123,7 +123,7 @@ namespace internal { // Depending on the value of a char (or wchar_t), we print it in one // of three formats: // - as is if it's a printable ASCII (e.g. 'a', '2', ' '), -// - as a hexidecimal escape sequence (e.g. '\x7F'), or +// - as a hexadecimal escape sequence (e.g. '\x7F'), or // - as a special escape sequence (e.g. '\r', '\n'). 
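The three formats enumerated above can be observed through the public testing::PrintToString() helper; a rough sketch of the expected output (the exact "(code, 0x..)" suffix varies by value):

#include <cstdio>
#include "gtest/gtest.h"

int main() {
  // Printable ASCII is shown as-is: roughly 'a' (97, 0x61).
  std::printf("%s\n", testing::PrintToString('a').c_str());
  // '\n' gets the special escape: roughly '\n' (10, 0xA).
  std::printf("%s\n", testing::PrintToString('\n').c_str());
  // 0x7F is unprintable, so it gets the hex escape: roughly '\x7F' (127).
  std::printf("%s\n", testing::PrintToString('\x7F').c_str());
  return 0;
}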
enum CharFormat { kAsIs, @@ -180,7 +180,10 @@ static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) { *os << static_cast(c); return kAsIs; } else { - *os << "\\x" + String::FormatHexInt(static_cast(c)); + ostream::fmtflags flags = os->flags(); + *os << "\\x" << std::hex << std::uppercase + << static_cast(static_cast(c)); + os->flags(flags); return kHexEscape; } } @@ -227,7 +230,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) { return; *os << " (" << static_cast(c); - // For more convenience, we print c's code again in hexidecimal, + // For more convenience, we print c's code again in hexadecimal, // unless c was already printed in the form '\x##' or the code is in // [1, 9]. if (format == kHexEscape || (1 <= c && c <= 9)) { @@ -259,11 +262,12 @@ template GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ -static void PrintCharsAsStringTo( +static CharFormat PrintCharsAsStringTo( const CharType* begin, size_t len, ostream* os) { const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\""; *os << kQuoteBegin; bool is_previous_hex = false; + CharFormat print_format = kAsIs; for (size_t index = 0; index < len; ++index) { const CharType cur = begin[index]; if (is_previous_hex && IsXDigit(cur)) { @@ -273,8 +277,13 @@ static void PrintCharsAsStringTo( *os << "\" " << kQuoteBegin; } is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape; + // Remember if any characters required hex escaping. + if (is_previous_hex) { + print_format = kHexEscape; + } } *os << "\""; + return print_format; } // Prints a (const) char/wchar_t array of 'len' elements, starting at address @@ -339,20 +348,95 @@ void PrintTo(const wchar_t* s, ostream* os) { *os << "NULL"; } else { *os << ImplicitCast_(s) << " pointing to "; - PrintCharsAsStringTo(s, std::wcslen(s), os); + PrintCharsAsStringTo(s, wcslen(s), os); } } #endif // wchar_t is native +namespace { + +bool ContainsUnprintableControlCodes(const char* str, size_t length) { + const unsigned char *s = reinterpret_cast(str); + + for (size_t i = 0; i < length; i++) { + unsigned char ch = *s++; + if (std::iscntrl(ch)) { + switch (ch) { + case '\t': + case '\n': + case '\r': + break; + default: + return true; + } + } + } + return false; +} + +bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; } + +bool IsValidUTF8(const char* str, size_t length) { + const unsigned char *s = reinterpret_cast(str); + + for (size_t i = 0; i < length;) { + unsigned char lead = s[i++]; + + if (lead <= 0x7f) { + continue; // single-byte character (ASCII) 0..7F + } + if (lead < 0xc2) { + return false; // trail byte or non-shortest form + } else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) { + ++i; // 2-byte character + } else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length && + IsUTF8TrailByte(s[i]) && + IsUTF8TrailByte(s[i + 1]) && + // check for non-shortest form and surrogate + (lead != 0xe0 || s[i] >= 0xa0) && + (lead != 0xed || s[i] < 0xa0)) { + i += 2; // 3-byte character + } else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length && + IsUTF8TrailByte(s[i]) && + IsUTF8TrailByte(s[i + 1]) && + IsUTF8TrailByte(s[i + 2]) && + // check for non-shortest form + (lead != 0xf0 || s[i] >= 0x90) && + (lead != 0xf4 || s[i] < 0x90)) { + i += 3; // 4-byte character + } else { + return false; + } + } + return true; +} + +void ConditionalPrintAsText(const char* str, size_t length, ostream* os) { + if (!ContainsUnprintableControlCodes(str, length) && + IsValidUTF8(str, length)) { 
+ *os << "\n As Text: \"" << str << "\""; + } +} + +} // anonymous namespace + // Prints a ::string object. #if GTEST_HAS_GLOBAL_STRING void PrintStringTo(const ::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } } #endif // GTEST_HAS_GLOBAL_STRING void PrintStringTo(const ::std::string& s, ostream* os) { - PrintCharsAsStringTo(s.data(), s.size(), os); + if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) { + if (GTEST_FLAG(print_utf8)) { + ConditionalPrintAsText(s.data(), s.size(), os); + } + } } // Prints a ::wstring object. diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc index fb0e35425e1..c88860d9238 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-test-part.cc @@ -26,21 +26,12 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: mheule@google.com (Markus Heule) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest-test-part.h" - -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick exists to -// prevent the accidental inclusion of gtest-internal-inl.h in the -// user's code. -#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ namespace testing { diff --git a/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc index df1eef4754e..1dc2ad38bab 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest-typed-test.cc @@ -26,10 +26,10 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Author: wan@google.com (Zhanyong Wan) + #include "gtest/gtest-typed-test.h" + #include "gtest/gtest.h" namespace testing { diff --git a/libs/libvpx/third_party/googletest/src/src/gtest.cc b/libs/libvpx/third_party/googletest/src/src/gtest.cc index 5a8932c73e3..96b07c68abb 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest.cc @@ -26,10 +26,9 @@ // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // -// Author: wan@google.com (Zhanyong Wan) -// -// The Google C++ Testing Framework (Google Test) +// The Google C++ Testing and Mocking Framework (Google Test) #include "gtest/gtest.h" #include "gtest/internal/custom/gtest.h" @@ -55,7 +54,7 @@ #if GTEST_OS_LINUX -// TODO(kenton@google.com): Use autoconf to detect availability of +// FIXME: Use autoconf to detect availability of // gettimeofday(). 
# define GTEST_HAS_GETTIMEOFDAY_ 1 @@ -94,9 +93,9 @@ # if GTEST_OS_WINDOWS_MINGW // MinGW has gettimeofday() but not _ftime64(). -// TODO(kenton@google.com): Use autoconf to detect availability of +// FIXME: Use autoconf to detect availability of // gettimeofday(). -// TODO(kenton@google.com): There are other ways to get the time on +// FIXME: There are other ways to get the time on // Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW // supports these. Consider using them instead. # define GTEST_HAS_GETTIMEOFDAY_ 1 @@ -111,7 +110,7 @@ #else // Assume other platforms have gettimeofday(). -// TODO(kenton@google.com): Use autoconf to detect availability of +// FIXME: Use autoconf to detect availability of // gettimeofday(). # define GTEST_HAS_GETTIMEOFDAY_ 1 @@ -133,19 +132,25 @@ # include // NOLINT #endif -// Indicates that this translation unit is part of Google Test's -// implementation. It must come before gtest-internal-inl.h is -// included, or there will be a compiler error. This trick is to -// prevent a user from accidentally including gtest-internal-inl.h in -// his code. -#define GTEST_IMPLEMENTATION_ 1 #include "src/gtest-internal-inl.h" -#undef GTEST_IMPLEMENTATION_ #if GTEST_OS_WINDOWS # define vsnprintf _vsnprintf #endif // GTEST_OS_WINDOWS +#if GTEST_OS_MAC +#ifndef GTEST_OS_IOS +#include <crt_externs.h> +#endif +#endif + +#if GTEST_HAS_ABSL +#include "absl/debugging/failure_signal_handler.h" +#include "absl/debugging/stacktrace.h" +#include "absl/debugging/symbolize.h" +#include "absl/strings/str_cat.h" +#endif // GTEST_HAS_ABSL + namespace testing { using internal::CountIf; @@ -167,8 +172,10 @@ static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; // A test filter that matches everything. static const char kUniversalFilter[] = "*"; -// The default output file for XML output. -static const char kDefaultOutputFile[] = "test_detail.xml"; +// The default output format. +static const char kDefaultOutputFormat[] = "xml"; +// The default output file. +static const char kDefaultOutputFile[] = "test_detail"; // The environment variable name for the test shard index. static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; @@ -187,15 +194,31 @@ const char kStackTraceMarker[] = "\nStack trace:\n"; // specified on the command line. bool g_help_flag = false; +// Utility function to open a file for writing. +static FILE* OpenFileForWriting(const std::string& output_file) { + FILE* fileout = NULL; + FilePath output_file_path(output_file); + FilePath output_dir(output_file_path.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + fileout = posix::FOpen(output_file.c_str(), "w"); + } + if (fileout == NULL) { + GTEST_LOG_(FATAL) << "Unable to open file \"" << output_file << "\""; + } + return fileout; +} + } // namespace internal +// Bazel passes in the argument to '--test_filter' via the TESTBRIDGE_TEST_ONLY +// environment variable. static const char* GetDefaultFilter() { -#ifdef GTEST_TEST_FILTER_ENV_VAR_ - const char* const testbridge_test_only = getenv(GTEST_TEST_FILTER_ENV_VAR_); + const char* const testbridge_test_only = + internal::posix::GetEnv("TESTBRIDGE_TEST_ONLY"); if (testbridge_test_only != NULL) { return testbridge_test_only; } -#endif // GTEST_TEST_FILTER_ENV_VAR_ return kUniversalFilter; } @@ -232,15 +255,28 @@ GTEST_DEFINE_string_( "exclude). 
A test is run if it matches one of the positive " "patterns and does not match any of the negative patterns."); +GTEST_DEFINE_bool_( + install_failure_signal_handler, + internal::BoolFromGTestEnv("install_failure_signal_handler", false), + "If true and supported on the current platform, " GTEST_NAME_ " should " + "install a signal handler that dumps debugging information when fatal " + "signals are raised."); + GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them."); +// The net priority order after flag processing is thus: +// --gtest_output command line flag +// GTEST_OUTPUT environment variable +// XML_OUTPUT_FILE environment variable +// '' GTEST_DEFINE_string_( output, - internal::StringFromGTestEnv("output", ""), - "A format (currently must be \"xml\"), optionally followed " - "by a colon and an output file name or directory. A directory " - "is indicated by a trailing pathname separator. " + internal::StringFromGTestEnv("output", + internal::OutputFlagAlsoCheckEnvVar().c_str()), + "A format (defaults to \"xml\" but can be specified to be \"json\"), " + "optionally followed by a colon and an output file name or directory. " + "A directory is indicated by a trailing pathname separator. " "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " "If a directory is specified, output files will be created " "within that directory, with file-names based on the test " @@ -253,6 +289,12 @@ GTEST_DEFINE_bool_( "True iff " GTEST_NAME_ " should display elapsed time in text output."); +GTEST_DEFINE_bool_( + print_utf8, + internal::BoolFromGTestEnv("print_utf8", true), + "True iff " GTEST_NAME_ + " prints UTF8 characters as text."); + GTEST_DEFINE_int32_( random_seed, internal::Int32FromGTestEnv("random_seed", 0), @@ -294,7 +336,7 @@ GTEST_DEFINE_bool_( internal::BoolFromGTestEnv("throw_on_failure", false), "When this flag is specified, a failed assertion will throw an exception " "if exceptions are enabled or exit the program with a non-zero code " - "otherwise."); + "otherwise. For use with an external test framework."); #if GTEST_USE_OWN_FLAGFILE_FLAG_ GTEST_DEFINE_string_( @@ -308,10 +350,10 @@ namespace internal { // Generates a random number from [0, range), using a Linear // Congruential Generator (LCG). Crashes if 'range' is 0 or greater // than kMaxRange. -GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ UInt32 Random::Generate(UInt32 range) { // These constants are the same as are used in glibc's rand(3). - state_ = (1103515245U*state_ + 12345U) % kMaxRange; + // Use wider types than necessary to prevent unsigned overflow diagnostics. + state_ = static_cast(1103515245ULL*state_ + 12345U) % kMaxRange; GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0)."; @@ -385,12 +427,15 @@ void AssertHelper::operator=(const Message& message) const { GTEST_API_ GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex); // A copy of all command line arguments. Set by InitGoogleTest(). -::std::vector g_argvs; +static ::std::vector g_argvs; -const ::std::vector& GetArgvs() { +::std::vector GetArgvs() { #if defined(GTEST_CUSTOM_GET_ARGVS_) - return GTEST_CUSTOM_GET_ARGVS_(); -#else // defined(GTEST_CUSTOM_GET_ARGVS_) + // GTEST_CUSTOM_GET_ARGVS_() may return a container of std::string or + // ::string. This code converts it to the appropriate type. 
+ const auto& custom = GTEST_CUSTOM_GET_ARGVS_(); + return ::std::vector(custom.begin(), custom.end()); +#else // defined(GTEST_CUSTOM_GET_ARGVS_) return g_argvs; #endif // defined(GTEST_CUSTOM_GET_ARGVS_) } @@ -414,8 +459,6 @@ FilePath GetCurrentExecutableName() { // Returns the output format, or "" for normal printed output. std::string UnitTestOptions::GetOutputFormat() { const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) return std::string(""); - const char* const colon = strchr(gtest_output_flag, ':'); return (colon == NULL) ? std::string(gtest_output_flag) : @@ -426,19 +469,22 @@ std::string UnitTestOptions::GetOutputFormat() { // was explicitly specified. std::string UnitTestOptions::GetAbsolutePathToOutputFile() { const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); - if (gtest_output_flag == NULL) - return ""; + + std::string format = GetOutputFormat(); + if (format.empty()) + format = std::string(kDefaultOutputFormat); const char* const colon = strchr(gtest_output_flag, ':'); if (colon == NULL) - return internal::FilePath::ConcatPaths( + return internal::FilePath::MakeFileName( internal::FilePath( UnitTest::GetInstance()->original_working_dir()), - internal::FilePath(kDefaultOutputFile)).string(); + internal::FilePath(kDefaultOutputFile), 0, + format.c_str()).string(); internal::FilePath output_name(colon + 1); if (!output_name.IsAbsolutePath()) - // TODO(wan@google.com): on Windows \some\path is not an absolute + // FIXME: on Windows \some\path is not an absolute // path (as its meaning depends on the current drive), yet the // following logic for turning it into an absolute path is wrong. // Fix it. @@ -629,12 +675,12 @@ extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId(); // This predicate-formatter checks that 'results' contains a test part // failure of the given type and that the failure message contains the // given substring. -AssertionResult HasOneFailure(const char* /* results_expr */, - const char* /* type_expr */, - const char* /* substr_expr */, - const TestPartResultArray& results, - TestPartResult::Type type, - const string& substr) { +static AssertionResult HasOneFailure(const char* /* results_expr */, + const char* /* type_expr */, + const char* /* substr_expr */, + const TestPartResultArray& results, + TestPartResult::Type type, + const std::string& substr) { const std::string expected(type == TestPartResult::kFatalFailure ? "1 fatal failure" : "1 non-fatal failure"); @@ -668,13 +714,10 @@ AssertionResult HasOneFailure(const char* /* results_expr */, // The constructor of SingleFailureChecker remembers where to look up // test part results, what type of failure we expect, and what // substring the failure message should contain. -SingleFailureChecker:: SingleFailureChecker( - const TestPartResultArray* results, - TestPartResult::Type type, - const string& substr) - : results_(results), - type_(type), - substr_(substr) {} +SingleFailureChecker::SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const std::string& substr) + : results_(results), type_(type), substr_(substr) {} // The destructor of SingleFailureChecker verifies that the given // TestPartResultArray contains exactly one failure that has the given @@ -815,7 +858,7 @@ TimeInMillis GetTimeInMillis() { SYSTEMTIME now_systime; FILETIME now_filetime; ULARGE_INTEGER now_int64; - // TODO(kenton@google.com): Shouldn't this just use + // FIXME: Shouldn't this just use // GetSystemTimeAsFileTime()? 
GetSystemTime(&now_systime); if (SystemTimeToFileTime(&now_systime, &now_filetime)) { @@ -831,11 +874,11 @@ TimeInMillis GetTimeInMillis() { // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 // (deprecated function) there. - // TODO(kenton@google.com): Use GetTickCount()? Or use + // FIXME: Use GetTickCount()? Or use // SystemTimeToFileTime() - GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996) + GTEST_DISABLE_MSC_DEPRECATED_PUSH_() _ftime64(&now); - GTEST_DISABLE_MSC_WARNINGS_POP_() + GTEST_DISABLE_MSC_DEPRECATED_POP_() return static_cast(now.time) * 1000 + now.millitm; #elif GTEST_HAS_GETTIMEOFDAY_ @@ -1172,7 +1215,7 @@ class Hunk { // Print a unified diff header for one hunk. // The format is // "@@ -, +, @@" - // where the left/right parts are ommitted if unnecessary. + // where the left/right parts are omitted if unnecessary. void PrintHeader(std::ostream* ss) const { *ss << "@@ "; if (removes_) { @@ -1316,13 +1359,14 @@ AssertionResult EqFailure(const char* lhs_expression, const std::string& rhs_value, bool ignoring_case) { Message msg; - msg << " Expected: " << lhs_expression; + msg << "Expected equality of these values:"; + msg << "\n " << lhs_expression; if (lhs_value != lhs_expression) { - msg << "\n Which is: " << lhs_value; + msg << "\n Which is: " << lhs_value; } - msg << "\nTo be equal to: " << rhs_expression; + msg << "\n " << rhs_expression; if (rhs_value != rhs_expression) { - msg << "\n Which is: " << rhs_value; + msg << "\n Which is: " << rhs_value; } if (ignoring_case) { @@ -1369,7 +1413,7 @@ AssertionResult DoubleNearPredFormat(const char* expr1, const double diff = fabs(val1 - val2); if (diff <= abs_error) return AssertionSuccess(); - // TODO(wan): do not print the value of an expression if it's + // FIXME: do not print the value of an expression if it's // already a literal. return AssertionFailure() << "The difference between " << expr1 << " and " << expr2 @@ -1664,7 +1708,7 @@ namespace { AssertionResult HRESULTFailureHelper(const char* expr, const char* expected, long hr) { // NOLINT -# if GTEST_OS_WINDOWS_MOBILE +# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE // Windows CE doesn't support FormatMessage. const char error_text[] = ""; @@ -1721,7 +1765,7 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT // Utility functions for encoding Unicode text (wide strings) in // UTF-8. -// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8 +// A Unicode code-point can have up to 21 bits, and is encoded in UTF-8 // like this: // // Code-point length Encoding @@ -1785,7 +1829,7 @@ std::string CodePointToUtf8(UInt32 code_point) { return str; } -// The following two functions only make sense if the the system +// The following two functions only make sense if the system // uses UTF-16 for wide string encoding. All supported systems // with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16. @@ -2097,13 +2141,8 @@ static const char* const kReservedTestSuiteAttributes[] = { // The list of reserved attributes used in the element of XML output. 
static const char* const kReservedTestCaseAttributes[] = { - "classname", - "name", - "status", - "time", - "type_param", - "value_param" -}; + "classname", "name", "status", "time", + "type_param", "value_param", "file", "line"}; template std::vector ArrayAsVector(const char* const (&array)[kSize]) { @@ -2139,8 +2178,9 @@ static std::string FormatWordList(const std::vector& words) { return word_list.GetString(); } -bool ValidateTestPropertyName(const std::string& property_name, - const std::vector& reserved_names) { +static bool ValidateTestPropertyName( + const std::string& property_name, + const std::vector& reserved_names) { if (std::find(reserved_names.begin(), reserved_names.end(), property_name) != reserved_names.end()) { ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name @@ -2437,6 +2477,8 @@ Result HandleExceptionsInMethodIfSupported( #if GTEST_HAS_EXCEPTIONS try { return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const AssertionException&) { // NOLINT + // This failure was reported already. } catch (const internal::GoogleTestFailureException&) { // NOLINT // This exception type can only be thrown by a failed Google // Test assertion with the intention of letting another testing @@ -2558,7 +2600,6 @@ TestInfo* MakeAndRegisterTestInfo( return test_info; } -#if GTEST_HAS_PARAM_TEST void ReportInvalidTestCaseType(const char* test_case_name, CodeLocation code_location) { Message errors; @@ -2572,13 +2613,10 @@ void ReportInvalidTestCaseType(const char* test_case_name, << "probably rename one of the classes to put the tests into different\n" << "test cases."; - fprintf(stderr, "%s %s", - FormatFileLocation(code_location.file.c_str(), - code_location.line).c_str(), - errors.GetString().c_str()); + GTEST_LOG_(ERROR) << FormatFileLocation(code_location.file.c_str(), + code_location.line) + << " " << errors.GetString(); } -#endif // GTEST_HAS_PARAM_TEST - } // namespace internal namespace { @@ -2616,12 +2654,10 @@ namespace internal { // and INSTANTIATE_TEST_CASE_P into regular tests and registers those. // This will be done just once during the program runtime. void UnitTestImpl::RegisterParameterizedTests() { -#if GTEST_HAS_PARAM_TEST if (!parameterized_tests_registered_) { parameterized_test_registry_.RegisterTests(); parameterized_tests_registered_ = true; } -#endif } } // namespace internal @@ -2649,18 +2685,18 @@ void TestInfo::Run() { factory_, &internal::TestFactoryBase::CreateTest, "the test fixture's constructor"); - // Runs the test only if the test object was created and its - // constructor didn't generate a fatal failure. - if ((test != NULL) && !Test::HasFatalFailure()) { + // Runs the test if the constructor didn't generate a fatal failure. + // Note that the object will not be null + if (!Test::HasFatalFailure()) { // This doesn't throw as all user code that can throw are wrapped into // exception handling code. test->Run(); } - // Deletes the test object. - impl->os_stack_trace_getter()->UponLeavingGTest(); - internal::HandleExceptionsInMethodIfSupported( - test, &Test::DeleteSelf_, "the test fixture's destructor"); + // Deletes the test object. 
+ impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); result_.set_elapsed_time(internal::GetTimeInMillis() - start); @@ -2886,10 +2922,10 @@ enum GTestColor { }; #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW // Returns the character attribute for the given color. -WORD GetColorAttribute(GTestColor color) { +static WORD GetColorAttribute(GTestColor color) { switch (color) { case COLOR_RED: return FOREGROUND_RED; case COLOR_GREEN: return FOREGROUND_GREEN; @@ -2898,11 +2934,42 @@ WORD GetColorAttribute(GTestColor color) { } } +static int GetBitOffset(WORD color_mask) { + if (color_mask == 0) return 0; + + int bitOffset = 0; + while ((color_mask & 1) == 0) { + color_mask >>= 1; + ++bitOffset; + } + return bitOffset; +} + +static WORD GetNewColor(GTestColor color, WORD old_color_attrs) { + // Let's reuse the BG + static const WORD background_mask = BACKGROUND_BLUE | BACKGROUND_GREEN | + BACKGROUND_RED | BACKGROUND_INTENSITY; + static const WORD foreground_mask = FOREGROUND_BLUE | FOREGROUND_GREEN | + FOREGROUND_RED | FOREGROUND_INTENSITY; + const WORD existing_bg = old_color_attrs & background_mask; + + WORD new_color = + GetColorAttribute(color) | existing_bg | FOREGROUND_INTENSITY; + static const int bg_bitOffset = GetBitOffset(background_mask); + static const int fg_bitOffset = GetBitOffset(foreground_mask); + + if (((new_color & background_mask) >> bg_bitOffset) == + ((new_color & foreground_mask) >> fg_bitOffset)) { + new_color ^= FOREGROUND_INTENSITY; // invert intensity + } + return new_color; +} + #else // Returns the ANSI color code for the given color. COLOR_DEFAULT is // an invalid input. -const char* GetAnsiColorCode(GTestColor color) { +static const char* GetAnsiColorCode(GTestColor color) { switch (color) { case COLOR_RED: return "1"; case COLOR_GREEN: return "2"; @@ -2918,7 +2985,7 @@ bool ShouldUseColor(bool stdout_is_tty) { const char* const gtest_color = GTEST_FLAG(color).c_str(); if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { -#if GTEST_OS_WINDOWS +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW // On Windows the TERM variable is usually not set, but the // console there does support colors. return stdout_is_tty; @@ -2954,7 +3021,7 @@ bool ShouldUseColor(bool stdout_is_tty) { // cannot simply emit special characters and have the terminal change colors. // This routine must actually emit the characters rather than return a string // that would be colored when printed, as can be done on Linux. -void ColoredPrintf(GTestColor color, const char* fmt, ...) { +static void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_list args; va_start(args, fmt); @@ -2975,20 +3042,21 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) { } #if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \ - !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT + !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); // Gets the current text color. 
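A worked example for GetBitOffset()/GetNewColor() above, assuming the usual console attribute layout (foreground bits in the low nibble, background bits in the next):

#include <cassert>

static int GetBitOffsetSketch(unsigned mask) {
  // Count trailing zero bits, as GetBitOffset() does.
  int offset = 0;
  while (mask != 0 && (mask & 1u) == 0) { mask >>= 1; ++offset; }
  return offset;
}

int main() {
  assert(GetBitOffsetSketch(0x00F0) == 4);  // background mask nibble
  assert(GetBitOffsetSketch(0x000F) == 0);  // foreground mask nibble
  // GetNewColor() compares the two nibbles after shifting; when the bright
  // foreground would match the existing background, it flips
  // FOREGROUND_INTENSITY so the text remains readable.
  return 0;
}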
CONSOLE_SCREEN_BUFFER_INFO buffer_info; GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); const WORD old_color_attrs = buffer_info.wAttributes; + const WORD new_color = GetNewColor(color, old_color_attrs); // We need to flush the stream buffers into the console before each // SetConsoleTextAttribute call lest it affect the text that is already // printed but has not yet reached the console. fflush(stdout); - SetConsoleTextAttribute(stdout_handle, - GetColorAttribute(color) | FOREGROUND_INTENSITY); + SetConsoleTextAttribute(stdout_handle, new_color); + vprintf(fmt, args); fflush(stdout); @@ -3002,12 +3070,12 @@ void ColoredPrintf(GTestColor color, const char* fmt, ...) { va_end(args); } -// Text printed in Google Test's text output and --gunit_list_tests +// Text printed in Google Test's text output and --gtest_list_tests // output to label the type parameter and value parameter for a test. static const char kTypeParamLabel[] = "TypeParam"; static const char kValueParamLabel[] = "GetParam()"; -void PrintFullTestCommentIfPresent(const TestInfo& test_info) { +static void PrintFullTestCommentIfPresent(const TestInfo& test_info) { const char* const type_param = test_info.type_param(); const char* const value_param = test_info.value_param(); @@ -3278,7 +3346,7 @@ void TestEventRepeater::Append(TestEventListener *listener) { listeners_.push_back(listener); } -// TODO(vladl@google.com): Factor the search functionality into Vector::Find. +// FIXME: Factor the search functionality into Vector::Find. TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { for (size_t i = 0; i < listeners_.size(); ++i) { if (listeners_[i] == listener) { @@ -3352,6 +3420,11 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { explicit XmlUnitTestResultPrinter(const char* output_file); virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + void ListTestsMatchingFilter(const std::vector& test_cases); + + // Prints an XML summary of all unit tests. + static void PrintXmlTestsList(std::ostream* stream, + const std::vector& test_cases); private: // Is c a whitespace character that is normalized to a space character @@ -3413,6 +3486,11 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // to delimit this attribute from prior attributes. static std::string TestPropertiesAsXmlAttributes(const TestResult& result); + // Streams an XML representation of the test properties of a TestResult + // object. + static void OutputXmlTestProperties(std::ostream* stream, + const TestResult& result); + // The output file. const std::string output_file_; @@ -3422,46 +3500,30 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener { // Creates a new XmlUnitTestResultPrinter. XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) : output_file_(output_file) { - if (output_file_.c_str() == NULL || output_file_.empty()) { - fprintf(stderr, "XML output file may not be null\n"); - fflush(stderr); - exit(EXIT_FAILURE); + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "XML output file may not be null"; } } // Called after the unit test ends. void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, int /*iteration*/) { - FILE* xmlout = NULL; - FilePath output_file(output_file_); - FilePath output_dir(output_file.RemoveFileName()); - - if (output_dir.CreateDirectoriesRecursively()) { - xmlout = posix::FOpen(output_file_.c_str(), "w"); - } - if (xmlout == NULL) { - // TODO(wan): report the reason of the failure. 
- // - // We don't do it for now as: - // - // 1. There is no urgent need for it. - // 2. It's a bit involved to make the errno variable thread-safe on - // all three operating systems (Linux, Windows, and Mac OS). - // 3. To interpret the meaning of errno in a thread-safe way, - // we need the strerror_r() function, which is not available on - // Windows. - fprintf(stderr, - "Unable to open file \"%s\"\n", - output_file_.c_str()); - fflush(stderr); - exit(EXIT_FAILURE); - } + FILE* xmlout = OpenFileForWriting(output_file_); std::stringstream stream; PrintXmlUnitTest(&stream, unit_test); fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); fclose(xmlout); } +void XmlUnitTestResultPrinter::ListTestsMatchingFilter( + const std::vector& test_cases) { + FILE* xmlout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintXmlTestsList(&stream, test_cases); + fprintf(xmlout, "%s", StringStreamToString(&stream).c_str()); + fclose(xmlout); +} + // Returns an XML-escaped copy of the input string str. If is_attribute // is true, the text is meant to appear as an attribute value, and // normalizable whitespace is preserved by replacing it with character @@ -3472,7 +3534,7 @@ void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, // module will consist of ordinary English text. // If this module is ever modified to produce version 1.1 XML output, // most invalid characters can be retained using character references. -// TODO(wan): It might be nice to have a minimally invasive, human-readable +// FIXME: It might be nice to have a minimally invasive, human-readable // escaping scheme for invalid characters, rather than dropping them. std::string XmlUnitTestResultPrinter::EscapeXml( const std::string& str, bool is_attribute) { @@ -3533,6 +3595,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters( // The following routines generate an XML representation of a UnitTest // object. +// GOOGLETEST_CM0009 DO NOT DELETE // // This is how Google Test concepts map to the DTD: // @@ -3622,13 +3685,17 @@ void XmlUnitTestResultPrinter::OutputXmlAttribute( } // Prints an XML representation of a TestInfo object. -// TODO(wan): There is also value in printing properties with the plain printer. +// FIXME: There is also value in printing properties with the plain printer. void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, const char* test_case_name, const TestInfo& test_info) { const TestResult& result = *test_info.result(); const std::string kTestcase = "testcase"; + if (test_info.is_in_another_shard()) { + return; + } + *stream << " \n"; + return; + } OutputXmlAttribute(stream, kTestcase, "status", test_info.should_run() ? 
"run" : "notrun"); OutputXmlAttribute(stream, kTestcase, "time", FormatTimeInMillisAsSeconds(result.elapsed_time())); OutputXmlAttribute(stream, kTestcase, "classname", test_case_name); - *stream << TestPropertiesAsXmlAttributes(result); int failures = 0; for (int i = 0; i < result.total_part_count(); ++i) { @@ -3654,22 +3727,28 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream, if (++failures == 1) { *stream << ">\n"; } - const string location = internal::FormatCompilerIndependentFileLocation( - part.file_name(), part.line_number()); - const string summary = location + "\n" + part.summary(); + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string summary = location + "\n" + part.summary(); *stream << " "; - const string detail = location + "\n" + part.message(); + const std::string detail = location + "\n" + part.message(); OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str()); *stream << "\n"; } } - if (failures == 0) + if (failures == 0 && result.test_property_count() == 0) { *stream << " />\n"; - else + } else { + if (failures == 0) { + *stream << ">\n"; + } + OutputXmlTestProperties(stream, result); *stream << " \n"; + } } // Prints an XML representation of a TestCase object @@ -3680,17 +3759,18 @@ void XmlUnitTestResultPrinter::PrintXmlTestCase(std::ostream* stream, OutputXmlAttribute(stream, kTestsuite, "name", test_case.name()); OutputXmlAttribute(stream, kTestsuite, "tests", StreamableToString(test_case.reportable_test_count())); - OutputXmlAttribute(stream, kTestsuite, "failures", - StreamableToString(test_case.failed_test_count())); - OutputXmlAttribute( - stream, kTestsuite, "disabled", - StreamableToString(test_case.reportable_disabled_test_count())); - OutputXmlAttribute(stream, kTestsuite, "errors", "0"); - OutputXmlAttribute(stream, kTestsuite, "time", - FormatTimeInMillisAsSeconds(test_case.elapsed_time())); - *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()) - << ">\n"; - + if (!GTEST_FLAG(list_tests)) { + OutputXmlAttribute(stream, kTestsuite, "failures", + StreamableToString(test_case.failed_test_count())); + OutputXmlAttribute( + stream, kTestsuite, "disabled", + StreamableToString(test_case.reportable_disabled_test_count())); + OutputXmlAttribute(stream, kTestsuite, "errors", "0"); + OutputXmlAttribute(stream, kTestsuite, "time", + FormatTimeInMillisAsSeconds(test_case.elapsed_time())); + *stream << TestPropertiesAsXmlAttributes(test_case.ad_hoc_test_result()); + } + *stream << ">\n"; for (int i = 0; i < test_case.total_test_count(); ++i) { if (test_case.GetTestInfo(i)->is_reportable()) OutputXmlTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); @@ -3724,7 +3804,6 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, OutputXmlAttribute(stream, kTestsuites, "random_seed", StreamableToString(unit_test.random_seed())); } - *stream << TestPropertiesAsXmlAttributes(unit_test.ad_hoc_test_result()); OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); @@ -3737,6 +3816,28 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream, *stream << "\n"; } +void XmlUnitTestResultPrinter::PrintXmlTestsList( + std::ostream* stream, const std::vector& test_cases) { + const std::string kTestsuites = "testsuites"; + + *stream << "\n"; + *stream << "<" << kTestsuites; + + int total_tests = 0; + for (size_t i = 0; i < test_cases.size(); ++i) { + total_tests += 
test_cases[i]->total_test_count(); + } + OutputXmlAttribute(stream, kTestsuites, "tests", + StreamableToString(total_tests)); + OutputXmlAttribute(stream, kTestsuites, "name", "AllTests"); + *stream << ">\n"; + + for (size_t i = 0; i < test_cases.size(); ++i) { + PrintXmlTestCase(stream, *test_cases[i]); + } + *stream << "</" << kTestsuites << ">\n"; +} + // Produces a string representing the test properties in a result as space // delimited XML attributes based on the property key="value" pairs. std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( @@ -3750,8 +3851,390 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( return attributes.GetString(); } +void XmlUnitTestResultPrinter::OutputXmlTestProperties( + std::ostream* stream, const TestResult& result) { + const std::string kProperties = "properties"; + const std::string kProperty = "property"; + + if (result.test_property_count() <= 0) { + return; + } + + *stream << "<" << kProperties << ">\n"; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + *stream << "<" << kProperty; + *stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\""; + *stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\""; + *stream << "/>\n"; + } + *stream << "</" << kProperties << ">\n"; +} + // End XmlUnitTestResultPrinter +// This class generates a JSON output file. +class JsonUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit JsonUnitTestResultPrinter(const char* output_file); + + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + + // Prints a JSON summary of all unit tests. + static void PrintJsonTestList(::std::ostream* stream, + const std::vector<TestCase*>& test_cases); + + private: + // Returns a JSON-escaped copy of the input string str. + static std::string EscapeJson(const std::string& str); + + // Verifies that the given attribute belongs to the given element and + // streams the attribute as JSON. + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma = true); + static void OutputJsonKey(std::ostream* stream, + const std::string& element_name, + const std::string& name, + int value, + const std::string& indent, + bool comma = true); + + // Streams a JSON representation of a TestInfo object. + static void OutputJsonTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info); + + // Prints a JSON representation of a TestCase object. + static void PrintJsonTestCase(::std::ostream* stream, + const TestCase& test_case); + + // Prints a JSON summary of unit_test to output stream out. + static void PrintJsonUnitTest(::std::ostream* stream, + const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as + // a JSON dictionary. + static std::string TestPropertiesAsJson(const TestResult& result, + const std::string& indent); + + // The output file. + const std::string output_file_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter); +}; + +// Creates a new JsonUnitTestResultPrinter. 
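As a usage note (hedged; the file name is invented): running a test binary with --gtest_output=json:report.json, or with GTEST_OUTPUT=json:report.json in the environment, routes results through this printer and produces a document shaped roughly like:

// {
//   "tests": 3,
//   "failures": 1,
//   "disabled": 0,
//   "errors": 0,
//   "timestamp": "2018-01-01T00:00:00Z",
//   "time": "0.035s",
//   "name": "AllTests",
//   "testsuites": [ ... one object per test case ... ]
// }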
+JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.empty()) { + GTEST_LOG_(FATAL) << "JSON output file may not be null"; + } +} + +void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* jsonout = OpenFileForWriting(output_file_); + std::stringstream stream; + PrintJsonUnitTest(&stream, unit_test); + fprintf(jsonout, "%s", StringStreamToString(&stream).c_str()); + fclose(jsonout); +} + +// Returns a JSON-escaped copy of the input string str. +std::string JsonUnitTestResultPrinter::EscapeJson(const std::string& str) { + Message m; + + for (size_t i = 0; i < str.size(); ++i) { + const char ch = str[i]; + switch (ch) { + case '\\': + case '"': + case '/': + m << '\\' << ch; + break; + case '\b': + m << "\\b"; + break; + case '\t': + m << "\\t"; + break; + case '\n': + m << "\\n"; + break; + case '\f': + m << "\\f"; + break; + case '\r': + m << "\\r"; + break; + default: + if (ch < ' ') { + m << "\\u00" << String::FormatByte(static_cast<unsigned char>(ch)); + } else { + m << ch; + } + break; + } + } + + return m.GetString(); +} + +// The following routines generate a JSON representation of a UnitTest +// object. + +// Formats the given time in milliseconds as seconds. +static std::string FormatTimeInMillisAsDuration(TimeInMillis ms) { + ::std::stringstream ss; + ss << (static_cast<double>(ms) * 1e-3) << "s"; + return ss.str(); +} + +// Converts the given epoch time in milliseconds to a date string in the +// RFC3339 format, without the timezone information. +static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) { + struct tm time_struct; + if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct)) + return ""; + // YYYY-MM-DDThh:mm:ss + return StreamableToString(time_struct.tm_year + 1900) + "-" + + String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" + + String::FormatIntWidth2(time_struct.tm_mday) + "T" + + String::FormatIntWidth2(time_struct.tm_hour) + ":" + + String::FormatIntWidth2(time_struct.tm_min) + ":" + + String::FormatIntWidth2(time_struct.tm_sec) + "Z"; +} + +static inline std::string Indent(int width) { + return std::string(width, ' '); +} + +void JsonUnitTestResultPrinter::OutputJsonKey( + std::ostream* stream, + const std::string& element_name, + const std::string& name, + const std::string& value, + const std::string& indent, + bool comma) { + const std::vector<std::string>& allowed_names = + GetReservedAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\""; + if (comma) + *stream << ",\n"; +} + +void JsonUnitTestResultPrinter::OutputJsonKey( + std::ostream* stream, + const std::string& element_name, + const std::string& name, + int value, + const std::string& indent, + bool comma) { + const std::vector<std::string>& allowed_names = + GetReservedAttributesForElement(element_name); + + GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) != + allowed_names.end()) + << "Key \"" << name << "\" is not allowed for value \"" << element_name + << "\"."; + + *stream << indent << "\"" << name << "\": " << StreamableToString(value); + if (comma) + *stream << ",\n"; +} + +// Prints a JSON representation of a TestInfo object. 
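Worked examples for the helpers above (the RFC3339 one assumes a UTC timezone, since PortableLocaltime() uses local time):

// EscapeJson("say \"hi\"\n")          -> say \"hi\"\n  (quote and newline escaped)
// EscapeJson("\x01")                  -> \u0001        (controls below ' ' use \u00XX)
// FormatTimeInMillisAsDuration(1234)  -> "1.234s"
// FormatEpochTimeInMillisAsRFC3339(0) -> "1970-01-01T00:00:00Z" (in UTC)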
+void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info) { + const TestResult& result = *test_info.result(); + const std::string kTestcase = "testcase"; + const std::string kIndent = Indent(10); + + *stream << Indent(8) << "{\n"; + OutputJsonKey(stream, kTestcase, "name", test_info.name(), kIndent); + + if (test_info.value_param() != NULL) { + OutputJsonKey(stream, kTestcase, "value_param", + test_info.value_param(), kIndent); + } + if (test_info.type_param() != NULL) { + OutputJsonKey(stream, kTestcase, "type_param", test_info.type_param(), + kIndent); + } + if (GTEST_FLAG(list_tests)) { + OutputJsonKey(stream, kTestcase, "file", test_info.file(), kIndent); + OutputJsonKey(stream, kTestcase, "line", test_info.line(), kIndent, false); + *stream << "\n" << Indent(8) << "}"; + return; + } + + OutputJsonKey(stream, kTestcase, "status", + test_info.should_run() ? "RUN" : "NOTRUN", kIndent); + OutputJsonKey(stream, kTestcase, "time", + FormatTimeInMillisAsDuration(result.elapsed_time()), kIndent); + OutputJsonKey(stream, kTestcase, "classname", test_case_name, kIndent, false); + *stream << TestPropertiesAsJson(result, kIndent); + + int failures = 0; + for (int i = 0; i < result.total_part_count(); ++i) { + const TestPartResult& part = result.GetTestPartResult(i); + if (part.failed()) { + *stream << ",\n"; + if (++failures == 1) { + *stream << kIndent << "\"" << "failures" << "\": [\n"; + } + const std::string location = + internal::FormatCompilerIndependentFileLocation(part.file_name(), + part.line_number()); + const std::string message = EscapeJson(location + "\n" + part.message()); + *stream << kIndent << " {\n" + << kIndent << " \"failure\": \"" << message << "\",\n" + << kIndent << " \"type\": \"\"\n" + << kIndent << " }"; + } + } + + if (failures > 0) + *stream << "\n" << kIndent << "]"; + *stream << "\n" << Indent(8) << "}"; +} + +// Prints a JSON representation of a TestCase object. +void JsonUnitTestResultPrinter::PrintJsonTestCase(std::ostream* stream, + const TestCase& test_case) { + const std::string kTestsuite = "testsuite"; + const std::string kIndent = Indent(6); + + *stream << Indent(4) << "{\n"; + OutputJsonKey(stream, kTestsuite, "name", test_case.name(), kIndent); + OutputJsonKey(stream, kTestsuite, "tests", test_case.reportable_test_count(), + kIndent); + if (!GTEST_FLAG(list_tests)) { + OutputJsonKey(stream, kTestsuite, "failures", test_case.failed_test_count(), + kIndent); + OutputJsonKey(stream, kTestsuite, "disabled", + test_case.reportable_disabled_test_count(), kIndent); + OutputJsonKey(stream, kTestsuite, "errors", 0, kIndent); + OutputJsonKey(stream, kTestsuite, "time", + FormatTimeInMillisAsDuration(test_case.elapsed_time()), + kIndent, false); + *stream << TestPropertiesAsJson(test_case.ad_hoc_test_result(), kIndent) + << ",\n"; + } + + *stream << kIndent << "\"" << kTestsuite << "\": [\n"; + + bool comma = false; + for (int i = 0; i < test_case.total_test_count(); ++i) { + if (test_case.GetTestInfo(i)->is_reportable()) { + if (comma) { + *stream << ",\n"; + } else { + comma = true; + } + OutputJsonTestInfo(stream, test_case.name(), *test_case.GetTestInfo(i)); + } + } + *stream << "\n" << kIndent << "]\n" << Indent(4) << "}"; +} + +// Prints a JSON summary of unit_test to output stream out. 
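For orientation, a single failing test rendered by OutputJsonTestInfo() comes out shaped roughly like this (all values invented):

//         {
//           "name": "ShowsNewLayout",
//           "status": "RUN",
//           "time": "0.002s",
//           "classname": "EqFailureDemo",
//           "failures": [
//             {
//               "failure": "gtest_demo.cc:12\nExpected equality ...",
//               "type": ""
//             }
//           ]
//         }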
+void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
+                                                  const UnitTest& unit_test) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+
+  OutputJsonKey(stream, kTestsuites, "tests", unit_test.reportable_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "failures", unit_test.failed_test_count(),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "disabled",
+                unit_test.reportable_disabled_test_count(), kIndent);
+  OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
+  if (GTEST_FLAG(shuffle)) {
+    OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
+                  kIndent);
+  }
+  OutputJsonKey(stream, kTestsuites, "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(unit_test.start_timestamp()),
+                kIndent);
+  OutputJsonKey(stream, kTestsuites, "time",
+                FormatTimeInMillisAsDuration(unit_test.elapsed_time()), kIndent,
+                false);
+
+  *stream << TestPropertiesAsJson(unit_test.ad_hoc_test_result(), kIndent)
+          << ",\n";
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  bool comma = false;
+  for (int i = 0; i < unit_test.total_test_case_count(); ++i) {
+    if (unit_test.GetTestCase(i)->reportable_test_count() > 0) {
+      if (comma) {
+        *stream << ",\n";
+      } else {
+        comma = true;
+      }
+      PrintJsonTestCase(stream, *unit_test.GetTestCase(i));
+    }
+  }
+
+  *stream << "\n" << kIndent << "]\n" << "}\n";
+}
+
+void JsonUnitTestResultPrinter::PrintJsonTestList(
+    std::ostream* stream, const std::vector<TestCase*>& test_cases) {
+  const std::string kTestsuites = "testsuites";
+  const std::string kIndent = Indent(2);
+  *stream << "{\n";
+  int total_tests = 0;
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    total_tests += test_cases[i]->total_test_count();
+  }
+  OutputJsonKey(stream, kTestsuites, "tests", total_tests, kIndent);
+
+  OutputJsonKey(stream, kTestsuites, "name", "AllTests", kIndent);
+  *stream << kIndent << "\"" << kTestsuites << "\": [\n";
+
+  for (size_t i = 0; i < test_cases.size(); ++i) {
+    if (i != 0) {
+      *stream << ",\n";
+    }
+    PrintJsonTestCase(stream, *test_cases[i]);
+  }
+
+  *stream << "\n"
+          << kIndent << "]\n"
+          << "}\n";
+}
+// Produces a string representing the test properties in a result as
+// a JSON dictionary.
+std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
+    const TestResult& result, const std::string& indent) {
+  Message attributes;
+  for (int i = 0; i < result.test_property_count(); ++i) {
+    const TestProperty& property = result.GetTestProperty(i);
+    attributes << ",\n" << indent << "\"" << property.key() << "\": "
+               << "\"" << EscapeJson(property.value()) << "\"";
+  }
+  return attributes.GetString();
+}
+
+// End JsonUnitTestResultPrinter
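
With the printer complete, a JSON report is requested through the same --gtest_output flag that previously accepted only xml (the flag wiring and help text are updated further below). A minimal sketch of a test binary that opts in programmatically; GTEST_FLAG(output) is the documented flag accessor, and report.json is an arbitrary example name:

    #include "gtest/gtest.h"

    TEST(JsonReportDemo, Passes) { EXPECT_EQ(2 + 2, 4); }

    int main(int argc, char** argv) {
      // Same effect as passing --gtest_output=json:report.json; set before
      // InitGoogleTest() so PostFlagParsingInit() installs the JSON printer.
      testing::GTEST_FLAG(output) = "json:report.json";
      testing::InitGoogleTest(&argc, argv);
      return RUN_ALL_TESTS();  // writes report.json into the working directory
    }

+
 #if GTEST_CAN_STREAM_RESULTS_
 
 // Checks if str contains '=', '&', '%' or '\n' characters. If yes,
@@ -3759,8 +4242,8 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
 // example, replaces "=" with "%3D". This algorithm is O(strlen(str))
 // in both time and space -- important as the input str may contain an
 // arbitrarily long test failure message and stack trace.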
-string StreamingListener::UrlEncode(const char* str) { - string result; +std::string StreamingListener::UrlEncode(const char* str) { + std::string result; result.reserve(strlen(str) + 1); for (char ch = *str; ch != '\0'; ch = *++str) { switch (ch) { @@ -3822,47 +4305,82 @@ void StreamingListener::SocketWriter::MakeConnection() { // End of class Streaming Listener #endif // GTEST_CAN_STREAM_RESULTS__ -// Class ScopedTrace +// class OsStackTraceGetter -// Pushes the given source file location and message onto a per-thread -// trace stack maintained by Google Test. -ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - TraceInfo trace; - trace.file = file; - trace.line = line; - trace.message = message.GetString(); +const char* const OsStackTraceGetterInterface::kElidedFramesMarker = + "... " GTEST_NAME_ " internal frames ..."; - UnitTest::GetInstance()->PushGTestTrace(trace); -} +std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count) + GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + std::string result; -// Pops the info pushed by the c'tor. -ScopedTrace::~ScopedTrace() - GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { - UnitTest::GetInstance()->PopGTestTrace(); -} + if (max_depth <= 0) { + return result; + } + max_depth = std::min(max_depth, kMaxStackTraceDepth); -// class OsStackTraceGetter + std::vector raw_stack(max_depth); + // Skips the frames requested by the caller, plus this function. + const int raw_stack_size = + absl::GetStackTrace(&raw_stack[0], max_depth, skip_count + 1); -const char* const OsStackTraceGetterInterface::kElidedFramesMarker = - "... " GTEST_NAME_ " internal frames ..."; + void* caller_frame = nullptr; + { + MutexLock lock(&mutex_); + caller_frame = caller_frame_; + } + + for (int i = 0; i < raw_stack_size; ++i) { + if (raw_stack[i] == caller_frame && + !GTEST_FLAG(show_internal_stack_frames)) { + // Add a marker to the trace and stop adding frames. + absl::StrAppend(&result, kElidedFramesMarker, "\n"); + break; + } -string OsStackTraceGetter::CurrentStackTrace(int /*max_depth*/, - int /*skip_count*/) { + char tmp[1024]; + const char* symbol = "(unknown)"; + if (absl::Symbolize(raw_stack[i], tmp, sizeof(tmp))) { + symbol = tmp; + } + + char line[1024]; + snprintf(line, sizeof(line), " %p: %s\n", raw_stack[i], symbol); + result += line; + } + + return result; + +#else // !GTEST_HAS_ABSL + static_cast(max_depth); + static_cast(skip_count); return ""; +#endif // GTEST_HAS_ABSL } -void OsStackTraceGetter::UponLeavingGTest() {} +void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) { +#if GTEST_HAS_ABSL + void* caller_frame = nullptr; + if (absl::GetStackTrace(&caller_frame, 1, 3) <= 0) { + caller_frame = nullptr; + } + + MutexLock lock(&mutex_); + caller_frame_ = caller_frame; +#endif // GTEST_HAS_ABSL +} // A helper class that creates the premature-exit file in its // constructor and deletes the file in its destructor. class ScopedPrematureExitFile { public: explicit ScopedPrematureExitFile(const char* premature_exit_filepath) - : premature_exit_filepath_(premature_exit_filepath) { + : premature_exit_filepath_(premature_exit_filepath ? + premature_exit_filepath : "") { // If a path to the premature-exit file is specified... - if (premature_exit_filepath != NULL && *premature_exit_filepath != '\0') { + if (!premature_exit_filepath_.empty()) { // create the file with a single "0" character in it. 
I/O // errors are ignored as there's nothing better we can do and we // don't want to fail the test because of this. @@ -3873,13 +4391,18 @@ class ScopedPrematureExitFile { } ~ScopedPrematureExitFile() { - if (premature_exit_filepath_ != NULL && *premature_exit_filepath_ != '\0') { - remove(premature_exit_filepath_); + if (!premature_exit_filepath_.empty()) { + int retval = remove(premature_exit_filepath_.c_str()); + if (retval) { + GTEST_LOG_(ERROR) << "Failed to remove premature exit filepath \"" + << premature_exit_filepath_ << "\" with error " + << retval; + } } } private: - const char* const premature_exit_filepath_; + const std::string premature_exit_filepath_; GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile); }; @@ -4149,6 +4672,11 @@ void UnitTest::AddTestPartResult( // when a failure happens and both the --gtest_break_on_failure and // the --gtest_catch_exceptions flags are specified. DebugBreak(); +#elif (!defined(__native_client__)) && \ + ((defined(__clang__) || defined(__GNUC__)) && \ + (defined(__x86_64__) || defined(__i386__))) + // with clang/gcc we can achieve the same effect on x86 by invoking int3 + asm("int3"); #else // Dereference NULL through a volatile pointer to prevent the compiler // from removing. We use this rather than abort() or __builtin_trap() for @@ -4216,7 +4744,7 @@ int UnitTest::Run() { // used for the duration of the program. impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions)); -#if GTEST_HAS_SEH +#if GTEST_OS_WINDOWS // Either the user wants Google Test to catch exceptions thrown by the // tests or this is executing in the context of death test child // process. In either case the user does not want to see pop-up dialogs @@ -4245,7 +4773,7 @@ int UnitTest::Run() { // VC++ doesn't define _set_abort_behavior() prior to the version 8.0. // Users of prior VC versions shall suffer the agony and pain of // clicking through the countless debug dialogs. - // TODO(vladl@google.com): find a way to suppress the abort dialog() in the + // FIXME: find a way to suppress the abort dialog() in the // debug mode when compiled with VC 7.1 or lower. if (!GTEST_FLAG(break_on_failure)) _set_abort_behavior( @@ -4253,7 +4781,7 @@ int UnitTest::Run() { _WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump. # endif } -#endif // GTEST_HAS_SEH +#endif // GTEST_OS_WINDOWS return internal::HandleExceptionsInMethodIfSupported( impl(), @@ -4286,7 +4814,6 @@ const TestInfo* UnitTest::current_test_info() const // Returns the random seed used at the start of the current test run. int UnitTest::random_seed() const { return impl_->random_seed(); } -#if GTEST_HAS_PARAM_TEST // Returns ParameterizedTestCaseRegistry object used to keep track of // value-parameterized tests and instantiate and register them. internal::ParameterizedTestCaseRegistry& @@ -4294,7 +4821,6 @@ internal::ParameterizedTestCaseRegistry& GTEST_LOCK_EXCLUDED_(mutex_) { return impl_->parameterized_test_registry(); } -#endif // GTEST_HAS_PARAM_TEST // Creates an empty UnitTest. 
 UnitTest::UnitTest() {
@@ -4333,10 +4859,8 @@ UnitTestImpl::UnitTestImpl(UnitTest* parent)
         &default_global_test_part_result_reporter_),
       per_thread_test_part_result_reporter_(
           &default_per_thread_test_part_result_reporter_),
-#if GTEST_HAS_PARAM_TEST
       parameterized_test_registry_(),
       parameterized_tests_registered_(false),
-#endif  // GTEST_HAS_PARAM_TEST
       last_death_test_case_(-1),
       current_test_case_(NULL),
       current_test_info_(NULL),
@@ -4403,10 +4927,12 @@ void UnitTestImpl::ConfigureXmlOutput() {
   if (output_format == "xml") {
     listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter(
         UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
+  } else if (output_format == "json") {
+    listeners()->SetDefaultXmlGenerator(new JsonUnitTestResultPrinter(
+        UnitTestOptions::GetAbsolutePathToOutputFile().c_str()));
   } else if (output_format != "") {
-    printf("WARNING: unrecognized output format \"%s\" ignored.\n",
-           output_format.c_str());
-    fflush(stdout);
+    GTEST_LOG_(WARNING) << "WARNING: unrecognized output format \""
+                        << output_format << "\" ignored.";
   }
 }
 
@@ -4421,9 +4947,8 @@ void UnitTestImpl::ConfigureStreamingOutput() {
       listeners()->Append(new StreamingListener(target.substr(0, pos),
                                                 target.substr(pos+1)));
     } else {
-      printf("WARNING: unrecognized streaming target \"%s\" ignored.\n",
-             target.c_str());
-      fflush(stdout);
+      GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
+                          << "\" ignored.";
     }
   }
 }
@@ -4462,6 +4987,13 @@ void UnitTestImpl::PostFlagParsingInit() {
     // Configures listeners for streaming test results to the specified server.
     ConfigureStreamingOutput();
 #endif  // GTEST_CAN_STREAM_RESULTS_
+
+#if GTEST_HAS_ABSL
+    if (GTEST_FLAG(install_failure_signal_handler)) {
+      absl::FailureSignalHandlerOptions options;
+      absl::InstallFailureSignalHandler(options);
+    }
+#endif  // GTEST_HAS_ABSL
   }
 }
 
@@ -4505,11 +5037,11 @@ TestCase* UnitTestImpl::GetTestCase(const char* test_case_name,
                                     Test::SetUpTestCaseFunc set_up_tc,
                                     Test::TearDownTestCaseFunc tear_down_tc) {
   // Can we find a TestCase with the given name?
-  const std::vector<TestCase*>::const_iterator test_case =
-      std::find_if(test_cases_.begin(), test_cases_.end(),
+  const std::vector<TestCase*>::const_reverse_iterator test_case =
+      std::find_if(test_cases_.rbegin(), test_cases_.rend(),
                    TestCaseNameIs(test_case_name));
 
-  if (test_case != test_cases_.end())
+  if (test_case != test_cases_.rend())
     return *test_case;
 
   // No. Let's create one.
@@ -4550,13 +5082,8 @@ static void TearDownEnvironment(Environment* env) { env->TearDown(); }
 // All other functions called from RunAllTests() may safely assume that
 // parameterized tests are ready to be counted and run.
 bool UnitTestImpl::RunAllTests() {
-  // Makes sure InitGoogleTest() was called.
-  if (!GTestIsInitialized()) {
-    printf("%s",
-           "\nThis test program did NOT call ::testing::InitGoogleTest "
-           "before calling RUN_ALL_TESTS(). Please fix it.\n");
-    return false;
-  }
+  // True iff Google Test is initialized before RUN_ALL_TESTS() is called.
+  const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
 
   // Do not run any test if the --help flag was specified.
   if (g_help_flag)
@@ -4684,6 +5211,20 @@ bool UnitTestImpl::RunAllTests() {
 
   repeater->OnTestProgramEnd(*parent_);
 
+  if (!gtest_is_initialized_before_run_all_tests) {
+    ColoredPrintf(
+        COLOR_RED,
+        "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
+        "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
+        "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
+        " will start to enforce the valid usage. "
" + "Please fix it ASAP, or IT WILL START TO FAIL.\n"); // NOLINT +#if GTEST_FOR_GOOGLE_ + ColoredPrintf(COLOR_RED, + "For more details, see http://wiki/Main/ValidGUnitMain.\n"); +#endif // GTEST_FOR_GOOGLE_ + } + return !failed; } @@ -4785,8 +5326,8 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { // each TestCase and TestInfo object. // If shard_tests == true, further filters tests based on sharding // variables in the environment - see -// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. -// Returns the number of tests that should run. +// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md +// . Returns the number of tests that should run. int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? Int32FromEnvOrDie(kTestTotalShards, -1) : -1; @@ -4825,10 +5366,11 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && matches_filter; - const bool is_selected = is_runnable && - (shard_tests == IGNORE_SHARDING_PROTOCOL || - ShouldRunTestOnShard(total_shards, shard_index, - num_runnable_tests)); + const bool is_in_another_shard = + shard_tests != IGNORE_SHARDING_PROTOCOL && + !ShouldRunTestOnShard(total_shards, shard_index, num_runnable_tests); + test_info->is_in_another_shard_ = is_in_another_shard; + const bool is_selected = is_runnable && !is_in_another_shard; num_runnable_tests += is_runnable; num_selected_tests += is_selected; @@ -4898,6 +5440,23 @@ void UnitTestImpl::ListTestsMatchingFilter() { } } fflush(stdout); + const std::string& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml" || output_format == "json") { + FILE* fileout = OpenFileForWriting( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()); + std::stringstream stream; + if (output_format == "xml") { + XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintXmlTestsList(&stream, test_cases_); + } else if (output_format == "json") { + JsonUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str()) + .PrintJsonTestList(&stream, test_cases_); + } + fprintf(fileout, "%s", StringStreamToString(&stream).c_str()); + fclose(fileout); + } } // Sets the OS stack trace getter. @@ -4928,11 +5487,15 @@ OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { return os_stack_trace_getter_; } -// Returns the TestResult for the test that's currently running, or -// the TestResult for the ad hoc test if no test is running. +// Returns the most specific TestResult currently running. TestResult* UnitTestImpl::current_test_result() { - return current_test_info_ ? - &(current_test_info_->result_) : &ad_hoc_test_result_; + if (current_test_info_ != NULL) { + return ¤t_test_info_->result_; + } + if (current_test_case_ != NULL) { + return ¤t_test_case_->ad_hoc_test_result_; + } + return &ad_hoc_test_result_; } // Shuffles all test cases, and the tests within each test case, @@ -5013,9 +5576,8 @@ bool SkipPrefix(const char* prefix, const char** pstr) { // part can be omitted. // // Returns the value of the flag, or NULL if the parsing failed. -const char* ParseFlagValue(const char* str, - const char* flag, - bool def_optional) { +static const char* ParseFlagValue(const char* str, const char* flag, + bool def_optional) { // str and flag must not be NULL. 
   if (str == NULL || flag == NULL) return NULL;
 
@@ -5051,7 +5613,7 @@ const char* ParseFlagValue(const char* str,
 //
 // On success, stores the value of the flag in *value, and returns
 // true. On failure, returns false without changing *value.
-bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, true);
 
@@ -5085,7 +5647,8 @@ bool ParseInt32Flag(const char* str, const char* flag, Int32* value) {
 //
 // On success, stores the value of the flag in *value, and returns
 // true. On failure, returns false without changing *value.
-bool ParseStringFlag(const char* str, const char* flag, std::string* value) {
+template <typename String>
+static bool ParseStringFlag(const char* str, const char* flag, String* value) {
   // Gets the value of the flag as a string.
   const char* const value_str = ParseFlagValue(str, flag, false);
 
@@ -5121,7 +5684,7 @@ static bool HasGoogleTestFlagPrefix(const char* str) {
 //   @Y changes the color to yellow.
 //   @D changes to the default terminal text color.
 //
-// TODO(wan@google.com): Write tests for this once we add stdout
+// FIXME: Write tests for this once we add stdout
 // capturing to Google Test.
 static void PrintColorEncoded(const char* str) {
   GTestColor color = COLOR_DEFAULT;  // The current color.
@@ -5187,24 +5750,25 @@ static const char kColorEncodedHelpMessage[] =
 "      Enable/disable colored output. The default is @Gauto@D.\n"
 "  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
 "      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G"
+"  @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G"
     GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate an XML report in the given directory or with the given file\n"
-"      name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
-#if GTEST_CAN_STREAM_RESULTS_
+"      Generate a JSON or XML report in the given directory or with the given\n"
+"      file name. @YFILE_PATH@D defaults to @Gtest_details.xml@D.\n"
+# if GTEST_CAN_STREAM_RESULTS_
 "  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
 "      Stream test results to the given server.\n"
-#endif  // GTEST_CAN_STREAM_RESULTS_
+# endif  // GTEST_CAN_STREAM_RESULTS_
 "\n"
 "Assertion Behavior:\n"
-#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
 "  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
 "      Set the default death test style.\n"
-#endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+# endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
 "  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
 "      Turn assertion failures into debugger break-points.\n"
 "  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions.\n"
+"      Turn assertion failures into C++ exceptions for use by an external\n"
+"      test framework.\n"
 "  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
 "      Do not report exceptions as test failures. Instead, allow them\n"
 "      to crash the program or throw a pop-up (on Windows).\n"
@@ -5221,7 +5785,7 @@ static const char kColorEncodedHelpMessage[] =
 "(not one in your own code or tests), please report it to\n"
 "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
 
-bool ParseGoogleTestFlag(const char* const arg) {
+static bool ParseGoogleTestFlag(const char* const arg) {
   return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
                        &GTEST_FLAG(also_run_disabled_tests)) ||
       ParseBoolFlag(arg, kBreakOnFailureFlag,
@@ -5239,6 +5803,7 @@ bool ParseGoogleTestFlag(const char* const arg) {
       ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
       ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
       ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+      ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
       ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
       ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
       ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
@@ -5251,14 +5816,11 @@ bool ParseGoogleTestFlag(const char* const arg) {
 }
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
-void LoadFlagsFromFile(const std::string& path) {
+static void LoadFlagsFromFile(const std::string& path) {
   FILE* flagfile = posix::FOpen(path.c_str(), "r");
   if (!flagfile) {
-    fprintf(stderr,
-            "Unable to open file \"%s\"\n",
-            GTEST_FLAG(flagfile).c_str());
-    fflush(stderr);
-    exit(EXIT_FAILURE);
+    GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+                      << "\"";
   }
   std::string contents(ReadEntireFile(flagfile));
   posix::FClose(flagfile);
@@ -5332,6 +5894,17 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
 // other parts of Google Test.
 void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
+
+  // Fix the value of *_NSGetArgc() on macOS, but iff
+  // *_NSGetArgv() == argv
+  // Only applicable to char** version of argv
+#if GTEST_OS_MAC
+#ifndef GTEST_OS_IOS
+  if (*_NSGetArgv() == argv) {
+    *_NSGetArgc() = *argc;
+  }
+#endif
+#endif
 }
 void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) {
   ParseGoogleTestFlagsOnlyImpl(argc, argv);
@@ -5353,6 +5926,10 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
     g_argvs.push_back(StreamableToString(argv[i]));
   }
 
+#if GTEST_HAS_ABSL
+  absl::InitializeSymbolizer(g_argvs[0].c_str());
+#endif  // GTEST_HAS_ABSL
+
   ParseGoogleTestFlagsOnly(argc, argv);
   GetUnitTestImpl()->PostFlagParsingInit();
 }
@@ -5386,4 +5963,45 @@ void InitGoogleTest(int* argc, wchar_t** argv) {
 #endif  // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
 }
 
+std::string TempDir() {
+#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+  return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
+#endif
+
+#if GTEST_OS_WINDOWS_MOBILE
+  return "\\temp\\";
+#elif GTEST_OS_WINDOWS
+  const char* temp_dir = internal::posix::GetEnv("TEMP");
+  if (temp_dir == NULL || temp_dir[0] == '\0')
+    return "\\temp\\";
+  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+    return temp_dir;
+  else
+    return std::string(temp_dir) + "\\";
+#elif GTEST_OS_LINUX_ANDROID
+  return "/sdcard/";
+#else
+  return "/tmp/";
+#endif  // GTEST_OS_WINDOWS_MOBILE
+}
+
+// Class ScopedTrace
+
+// Pushes the given source file location and message onto a per-thread
+// trace stack maintained by Google Test.
+void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
+  internal::TraceInfo trace;
+  trace.file = file;
+  trace.line = line;
+  trace.message.swap(message);
+
+  UnitTest::GetInstance()->PushGTestTrace(trace);
+}
+
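PushTrace() is the engine behind the SCOPED_TRACE macro, which annotates every assertion that fails while the trace is on the stack. A minimal usage sketch; the test body is illustrative:

    #include "gtest/gtest.h"

    static void ExpectPositive(int value) { EXPECT_GT(value, 0); }

    TEST(ScopedTraceDemo, AnnotatesFailures) {
      for (int i : {1, 0, 2}) {
        SCOPED_TRACE(testing::Message() << "i = " << i);  // PushTrace fires here
        ExpectPositive(i);  // the i == 0 failure is reported with "i = 0"
      }
    }

+// Pops the info pushed by the c'tor.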
+ScopedTrace::~ScopedTrace() + GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) { + UnitTest::GetInstance()->PopGTestTrace(); +} + } // namespace testing diff --git a/libs/libvpx/third_party/googletest/src/src/gtest_main.cc b/libs/libvpx/third_party/googletest/src/src/gtest_main.cc index f3028225523..2113f621e65 100644 --- a/libs/libvpx/third_party/googletest/src/src/gtest_main.cc +++ b/libs/libvpx/third_party/googletest/src/src/gtest_main.cc @@ -28,11 +28,10 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include - #include "gtest/gtest.h" GTEST_API_ int main(int argc, char **argv) { - printf("Running main() from gtest_main.cc\n"); + printf("Running main() from %s\n", __FILE__); testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/libs/libvpx/third_party/libwebm/Android.mk b/libs/libvpx/third_party/libwebm/Android.mk index 8149a083f4f..b46ba101d42 100644 --- a/libs/libvpx/third_party/libwebm/Android.mk +++ b/libs/libvpx/third_party/libwebm/Android.mk @@ -3,7 +3,7 @@ LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) LOCAL_MODULE:= libwebm LOCAL_CPPFLAGS:=-D__STDC_CONSTANT_MACROS -D__STDC_FORMAT_MACROS -LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -Wno-extern-c-compat +LOCAL_CPPFLAGS+=-D__STDC_LIMIT_MACROS -std=c++11 LOCAL_C_INCLUDES:= $(LOCAL_PATH) LOCAL_EXPORT_C_INCLUDES:= $(LOCAL_PATH) diff --git a/libs/libvpx/third_party/libwebm/README.libvpx b/libs/libvpx/third_party/libwebm/README.libvpx index ebb5ff2f4d7..714f5d0eb5a 100644 --- a/libs/libvpx/third_party/libwebm/README.libvpx +++ b/libs/libvpx/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 +Version: 81de00c43ea3c087b48a8c20337db7531b9f7612 License: BSD License File: LICENSE.txt @@ -7,4 +7,14 @@ Description: libwebm is used to handle WebM container I/O. Local Changes: -* +Only keep: + - Android.mk + - AUTHORS.TXT + - common/ + file_util.cc/h + hdr_util.cc/h + webmids.h + - LICENSE.TXT + - mkvmuxer/ + - mkvparser/ + - PATENTS.TXT diff --git a/libs/libvpx/third_party/libwebm/common/file_util.cc b/libs/libvpx/third_party/libwebm/common/file_util.cc index 6dab146dd98..6eb6428b987 100644 --- a/libs/libvpx/third_party/libwebm/common/file_util.cc +++ b/libs/libvpx/third_party/libwebm/common/file_util.cc @@ -17,14 +17,15 @@ #include #include #include +#include namespace libwebm { std::string GetTempFileName() { #if !defined _MSC_VER && !defined __MINGW32__ std::string temp_file_name_template_str = - std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") : - ".") + + std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") + : ".") + "/libwebm_temp.XXXXXX"; char* temp_file_name_template = new char[temp_file_name_template_str.length() + 1]; @@ -41,7 +42,12 @@ std::string GetTempFileName() { return temp_file_name; #else char tmp_file_name[_MAX_PATH]; +#if defined _MSC_VER || defined MINGW_HAS_SECURE_API errno_t err = tmpnam_s(tmp_file_name); +#else + char* fname_pointer = tmpnam(tmp_file_name); + int err = (fname_pointer == &tmp_file_name[0]) ? 
0 : -1; +#endif if (err == 0) { return std::string(tmp_file_name); } @@ -65,6 +71,15 @@ uint64_t GetFileSize(const std::string& file_name) { return file_size; } +bool GetFileContents(const std::string& file_name, std::string* contents) { + std::ifstream file(file_name.c_str()); + *contents = std::string(static_cast(GetFileSize(file_name)), 0); + if (file.good() && contents->size()) { + file.read(&(*contents)[0], contents->size()); + } + return !file.fail(); +} + TempFileDeleter::TempFileDeleter() { file_name_ = GetTempFileName(); } TempFileDeleter::~TempFileDeleter() { diff --git a/libs/libvpx/third_party/libwebm/common/file_util.h b/libs/libvpx/third_party/libwebm/common/file_util.h index 0e71eac11e4..a8737346418 100644 --- a/libs/libvpx/third_party/libwebm/common/file_util.h +++ b/libs/libvpx/third_party/libwebm/common/file_util.h @@ -22,6 +22,9 @@ std::string GetTempFileName(); // Returns size of file specified by |file_name|, or 0 upon failure. uint64_t GetFileSize(const std::string& file_name); +// Gets the contents file_name as a string. Returns false on error. +bool GetFileContents(const std::string& file_name, std::string* contents); + // Manages life of temporary file specified at time of construction. Deletes // file upon destruction. class TempFileDeleter { @@ -38,4 +41,4 @@ class TempFileDeleter { } // namespace libwebm -#endif // LIBWEBM_COMMON_FILE_UTIL_H_ \ No newline at end of file +#endif // LIBWEBM_COMMON_FILE_UTIL_H_ diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.cc b/libs/libvpx/third_party/libwebm/common/hdr_util.cc index e1618ce75a7..916f7170b67 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.cc +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.cc @@ -36,10 +36,10 @@ bool CopyMasteringMetadata(const mkvparser::MasteringMetadata& parser_mm, if (MasteringMetadataValuePresent(parser_mm.luminance_min)) muxer_mm->set_luminance_min(parser_mm.luminance_min); - PrimaryChromaticityPtr r_ptr(NULL); - PrimaryChromaticityPtr g_ptr(NULL); - PrimaryChromaticityPtr b_ptr(NULL); - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); + PrimaryChromaticityPtr g_ptr(nullptr); + PrimaryChromaticityPtr b_ptr(nullptr); + PrimaryChromaticityPtr wp_ptr(nullptr); if (parser_mm.r) { if (!CopyPrimaryChromaticity(*parser_mm.r, &r_ptr)) diff --git a/libs/libvpx/third_party/libwebm/common/hdr_util.h b/libs/libvpx/third_party/libwebm/common/hdr_util.h index 3ef5388fd03..78e2eeb7058 100644 --- a/libs/libvpx/third_party/libwebm/common/hdr_util.h +++ b/libs/libvpx/third_party/libwebm/common/hdr_util.h @@ -47,15 +47,7 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif -typedef std::auto_ptr PrimaryChromaticityPtr; -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic pop -#endif +typedef std::unique_ptr PrimaryChromaticityPtr; bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/libs/libvpx/third_party/libwebm/common/webmids.h b/libs/libvpx/third_party/libwebm/common/webmids.h index 89d722a71bc..fc0c2081409 100644 --- a/libs/libvpx/third_party/libwebm/common/webmids.h +++ b/libs/libvpx/third_party/libwebm/common/webmids.h @@ -93,6 +93,7 @@ enum MkvId { kMkvDisplayHeight = 0x54BA, kMkvDisplayUnit = 0x54B2, kMkvAspectRatioType = 0x54B3, + kMkvColourSpace = 0x2EB524, kMkvFrameRate = 
0x2383E3, // end video // colour diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 15b9a908d8a..51203121192 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvmuxer.h" +#include + #include #include #include @@ -24,11 +26,6 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvmuxer { const float PrimaryChromaticity::kChromaticityMin = 0.0f; @@ -72,7 +69,7 @@ bool StrCpy(const char* src, char** dst_ptr) { return true; } -typedef std::auto_ptr PrimaryChromaticityPtr; +typedef std::unique_ptr PrimaryChromaticityPtr; bool CopyChromaticity(const PrimaryChromaticity* src, PrimaryChromaticityPtr* dst) { if (!dst) @@ -776,6 +773,14 @@ bool Track::Write(IMkvWriter* writer) const { if (!type_ || !codec_id_) return false; + // AV1 tracks require a CodecPrivate. See + // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md + // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to + // point to a stable version once it is finalized, or our own WebM mappings + // page on webmproject.org should we decide to release them. + if (!strcmp(codec_id_, Tracks::kAv1CodecId) && !codec_private_) + return false; + // |size| may be bigger than what is written out in this function because // derived classes may write out more data in the Track element. const uint64_t payload_size = PayloadSize(); @@ -1030,19 +1035,16 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } - if (r_ && - !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, - libwebm::kMkvPrimaryRChromaticityY)) { + if (r_ && !r_->Write(writer, libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY)) { return false; } - if (g_ && - !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, - libwebm::kMkvPrimaryGChromaticityY)) { + if (g_ && !g_->Write(writer, libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY)) { return false; } - if (b_ && - !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, - libwebm::kMkvPrimaryBChromaticityY)) { + if (b_ && !b_->Write(writer, libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY)) { return false; } if (white_point_ && @@ -1057,22 +1059,22 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { bool MasteringMetadata::SetChromaticity( const PrimaryChromaticity* r, const PrimaryChromaticity* g, const PrimaryChromaticity* b, const PrimaryChromaticity* white_point) { - PrimaryChromaticityPtr r_ptr(NULL); + PrimaryChromaticityPtr r_ptr(nullptr); if (r) { if (!CopyChromaticity(r, &r_ptr)) return false; } - PrimaryChromaticityPtr g_ptr(NULL); + PrimaryChromaticityPtr g_ptr(nullptr); if (g) { if (!CopyChromaticity(g, &g_ptr)) return false; } - PrimaryChromaticityPtr b_ptr(NULL); + PrimaryChromaticityPtr b_ptr(nullptr); if (b) { if (!CopyChromaticity(b, &b_ptr)) return false; } - PrimaryChromaticityPtr wp_ptr(NULL); + PrimaryChromaticityPtr wp_ptr(nullptr); if (white_point) { if (!CopyChromaticity(white_point, &wp_ptr)) return false; @@ -1238,7 +1240,7 @@ bool Colour::Write(IMkvWriter* writer) const { } bool Colour::SetMasteringMetadata(const 
     MasteringMetadata& mastering_metadata) {
-  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
   if (!mm_ptr.get())
     return false;
 
@@ -1424,6 +1426,7 @@ VideoTrack::VideoTrack(unsigned int* seed)
       stereo_mode_(0),
       alpha_mode_(0),
       width_(0),
+      colour_space_(NULL),
       colour_(NULL),
       projection_(NULL) {}
 
@@ -1521,6 +1524,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
                           static_cast<uint64>(alpha_mode_)))
       return false;
   }
+  if (colour_space_) {
+    if (!WriteEbmlElement(writer, libwebm::kMkvColourSpace, colour_space_))
+      return false;
+  }
   if (frame_rate_ > 0.0) {
     if (!WriteEbmlElement(writer, libwebm::kMkvFrameRate,
                           static_cast<float>(frame_rate_))) {
@@ -1545,8 +1552,24 @@ bool VideoTrack::Write(IMkvWriter* writer) const {
   return true;
 }
 
+void VideoTrack::set_colour_space(const char* colour_space) {
+  if (colour_space) {
+    delete[] colour_space_;
+
+    const size_t length = strlen(colour_space) + 1;
+    colour_space_ = new (std::nothrow) char[length];  // NOLINT
+    if (colour_space_) {
+#ifdef _MSC_VER
+      strcpy_s(colour_space_, length, colour_space);
+#else
+      strcpy(colour_space_, colour_space);
+#endif
+    }
+  }
+}
+
 bool VideoTrack::SetColour(const Colour& colour) {
-  std::auto_ptr<Colour> colour_ptr(new Colour());
+  std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
     return false;
 
@@ -1574,7 +1597,7 @@ bool VideoTrack::SetColour(const Colour& colour) {
 }
 
 bool VideoTrack::SetProjection(const Projection& projection) {
-  std::auto_ptr<Projection> projection_ptr(new Projection());
+  std::unique_ptr<Projection> projection_ptr(new Projection());
   if (!projection_ptr.get())
     return false;
 
@@ -1628,6 +1651,8 @@ uint64_t VideoTrack::VideoPayloadSize() const {
   if (frame_rate_ > 0.0)
     size += EbmlElementSize(libwebm::kMkvFrameRate,
                             static_cast<float>(frame_rate_));
+  if (colour_space_)
+    size += EbmlElementSize(libwebm::kMkvColourSpace, colour_space_);
   if (colour_)
     size += colour_->ColourSize();
   if (projection_)
@@ -1705,9 +1730,9 @@ bool AudioTrack::Write(IMkvWriter* writer) const {
 
 const char Tracks::kOpusCodecId[] = "A_OPUS";
 const char Tracks::kVorbisCodecId[] = "A_VORBIS";
+const char Tracks::kAv1CodecId[] = "V_AV1";
 const char Tracks::kVp8CodecId[] = "V_VP8";
 const char Tracks::kVp9CodecId[] = "V_VP9";
-const char Tracks::kVp10CodecId[] = "V_VP10";
 const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS";
 const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS";
 const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA";
@@ -2666,7 +2691,7 @@ bool Cluster::QueueOrWriteFrame(const Frame* const frame) {
   // and write it if it is okay to do so (i.e.) no other track has an held back
   // frame with timestamp <= the timestamp of the frame in question.
   std::vector<std::list<Frame*>::iterator> frames_to_erase;
-  for (std::list<Frame *>::iterator
+  for (std::list<Frame*>::iterator
           current_track_iterator = stored_frames_[track_number].begin(),
           end = --stored_frames_[track_number].end();
       current_track_iterator != end; ++current_track_iterator) {
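
The new colour_space_ member exposes the Matroska ColourSpace element to writers. A hypothetical muxing sketch; the output path, the dimensions, and the FourCC-style value "I420" are made-up examples, and error handling is trimmed:

    #include <cstdint>

    #include "mkvmuxer/mkvmuxer.h"
    #include "mkvmuxer/mkvwriter.h"

    bool WriteHeaderWithColourSpace() {
      mkvmuxer::MkvWriter writer;
      if (!writer.Open("out.webm"))  // hypothetical output path
        return false;

      mkvmuxer::Segment segment;
      if (!segment.Init(&writer))
        return false;

      const uint64_t track_num = segment.AddVideoTrack(1280, 720, 0);
      mkvmuxer::VideoTrack* const video = static_cast<mkvmuxer::VideoTrack*>(
          segment.GetTrackByNumber(track_num));
      if (!video)
        return false;

      video->set_colour_space("I420");  // new mutator introduced above

      // ... AddFrame() calls would go here ...
      const bool ok = segment.Finalize();
      writer.Close();
      return ok;
    }

@@ -4168,8 +4193,8 @@ bool Segment::DocTypeIsWebm() const {
   // TODO(vigneshv): Tweak .clang-format.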
const char* kWebmCodecIds[kNumCodecIds] = { Tracks::kOpusCodecId, Tracks::kVorbisCodecId, - Tracks::kVp8CodecId, Tracks::kVp9CodecId, - Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId, + Tracks::kAv1CodecId, Tracks::kVp8CodecId, + Tracks::kVp9CodecId, Tracks::kWebVttCaptionsId, Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, Tracks::kWebVttSubtitlesId}; diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h index 46b0029dc47..f2db3771453 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -795,6 +795,8 @@ class VideoTrack : public Track { uint64_t alpha_mode() { return alpha_mode_; } void set_width(uint64_t width) { width_ = width; } uint64_t width() const { return width_; } + void set_colour_space(const char* colour_space); + const char* colour_space() const { return colour_space_; } Colour* colour() { return colour_; } @@ -824,6 +826,7 @@ class VideoTrack : public Track { uint64_t stereo_mode_; uint64_t alpha_mode_; uint64_t width_; + char* colour_space_; Colour* colour_; Projection* projection_; @@ -871,9 +874,9 @@ class Tracks { static const char kOpusCodecId[]; static const char kVorbisCodecId[]; + static const char kAv1CodecId[]; static const char kVp8CodecId[]; static const char kVp9CodecId[]; - static const char kVp10CodecId[]; static const char kWebVttCaptionsId[]; static const char kWebVttDescriptionsId[]; static const char kWebVttMetadataId[]; diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 355d4e22b3c..7636a9f4ef9 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -136,9 +136,8 @@ uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, return false; } - if (!frame->is_key() && - !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, - reference_block_timestamp)) { + if (!frame->is_key() && !WriteEbmlElement(writer, libwebm::kMkvReferenceBlock, + reference_block_timestamp)) { return false; } @@ -563,10 +562,10 @@ uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode) return 0; - return frame->CanBeSimpleBlock() ? - WriteSimpleBlock(writer, frame, relative_timecode) : - WriteBlock(writer, frame, relative_timecode, - cluster->timecode_scale()); + return frame->CanBeSimpleBlock() + ? WriteSimpleBlock(writer, frame, relative_timecode) + : WriteBlock(writer, frame, relative_timecode, + cluster->timecode_scale()); } uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 132388da599..3355428bd1e 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -31,6 +31,9 @@ const int64 kMaxBlockTimecode = 0x07FFFLL; // Writes out |value| in Big Endian order. Returns 0 on success. int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size); +// Writes out |f| in Big Endian order. Returns 0 on success. +int32 SerializeFloat(IMkvWriter* writer, float f); + // Returns the size in bytes of the element. 
int32 GetUIntSize(uint64 value); int32 GetIntSize(int64 value); diff --git a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index 84655d802a8..d668384d85f 100644 --- a/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libs/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -78,6 +78,8 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); +#elif defined(_WIN32) + return fseeko64(file_, static_cast(position), SEEK_SET); #else return fseeko(file_, static_cast(position), SEEK_SET); #endif diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index 37f230d0a95..ace65bd5957 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,12 +22,8 @@ #include "common/webmids.h" -// disable deprecation warnings for auto_ptr -#if defined(__GNUC__) && __GNUC__ >= 5 -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" -#endif - namespace mkvparser { +const long long kStringElementSizeLimit = 20 * 1000 * 1000; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; const float Projection::kValueNotPresent = FLT_MAX; @@ -40,8 +36,6 @@ inline bool isnan(double val) { return std::isnan(val); } inline bool isinf(double val) { return std::isinf(val); } #endif // MSC_COMPAT -IMkvReader::~IMkvReader() {} - template Type* SafeArrayAlloc(unsigned long long num_elements, unsigned long long element_size) { @@ -330,7 +324,7 @@ long UnserializeString(IMkvReader* pReader, long long pos, long long size, delete[] str; str = NULL; - if (size >= LONG_MAX || size < 0) + if (size >= LONG_MAX || size < 0 || size > kStringElementSizeLimit) return E_FILE_FORMAT_INVALID; // +1 for '\0' terminator @@ -4236,6 +4230,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, new (std::nothrow) ContentEncryption*[encryption_count]; if (!encryption_entries_) { delete[] compression_entries_; + compression_entries_ = NULL; return -1; } encryption_entries_end_ = encryption_entries_; @@ -4267,6 +4262,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, delete compression; return status; } + assert(compression_count > 0); *compression_entries_end_++ = compression; } else if (id == libwebm::kMkvContentEncryption) { ContentEncryption* const encryption = @@ -4279,6 +4275,7 @@ long ContentEncoding::ParseContentEncodingEntry(long long start, long long size, delete encryption; return status; } + assert(encryption_count > 0); *encryption_entries_end_++ = encryption; } @@ -4331,6 +4328,12 @@ long ContentEncoding::ParseCompressionEntry(long long start, long long size, return status; } + // There should be only one settings element per content compression. 
+  if (compression->settings != NULL) {
+    delete[] buf;
+    return E_FILE_FORMAT_INVALID;
+  }
+
   compression->settings = buf;
   compression->settings_len = buflen;
 }
@@ -5015,7 +5018,7 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
   if (!reader || *mm)
     return false;
 
-  std::auto_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
+  std::unique_ptr<MasteringMetadata> mm_ptr(new MasteringMetadata());
   if (!mm_ptr.get())
     return false;
 
@@ -5035,6 +5038,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
       double value = 0;
       const long long value_parse_status =
           UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
       mm_ptr->luminance_max = static_cast<float>(value);
       if (value_parse_status < 0 || mm_ptr->luminance_max < 0.0 ||
           mm_ptr->luminance_max > 9999.99) {
@@ -5044,6 +5051,10 @@ bool MasteringMetadata::Parse(IMkvReader* reader, long long mm_start,
       double value = 0;
       const long long value_parse_status =
           UnserializeFloat(reader, read_pos, child_size, value);
+      if (value < -FLT_MAX || value > FLT_MAX ||
+          (value > 0.0 && value < FLT_MIN)) {
+        return false;
+      }
       mm_ptr->luminance_min = static_cast<float>(value);
       if (value_parse_status < 0 || mm_ptr->luminance_min < 0.0 ||
           mm_ptr->luminance_min > 999.9999) {
@@ -5096,7 +5107,7 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start,
   if (!reader || *colour)
     return false;
 
-  std::auto_ptr<Colour> colour_ptr(new Colour());
+  std::unique_ptr<Colour> colour_ptr(new Colour());
   if (!colour_ptr.get())
     return false;
 
@@ -5194,7 +5205,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
   if (!reader || *projection)
     return false;
 
-  std::auto_ptr<Projection> projection_ptr(new Projection());
+  std::unique_ptr<Projection> projection_ptr(new Projection());
   if (!projection_ptr.get())
     return false;
 
@@ -5270,6 +5281,7 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size,
 VideoTrack::VideoTrack(Segment* pSegment, long long element_start,
                        long long element_size)
     : Track(pSegment, element_start, element_size),
+      m_colour_space(NULL),
       m_colour(NULL),
       m_projection(NULL) {}
 
@@ -5295,6 +5307,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
   long long stereo_mode = 0;
 
   double rate = 0.0;
+  char* colour_space = NULL;
 
   IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -5307,8 +5320,8 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
 
   const long long stop = pos + s.size;
 
-  Colour* colour = NULL;
-  Projection* projection = NULL;
+  std::unique_ptr<Colour> colour_ptr;
+  std::unique_ptr<Projection> projection_ptr;
 
   while (pos < stop) {
     long long id, size;
@@ -5357,11 +5370,23 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info,
       if (rate <= 0)
         return E_FILE_FORMAT_INVALID;
     } else if (id == libwebm::kMkvColour) {
-      if (!Colour::Parse(pReader, pos, size, &colour))
+      Colour* colour = NULL;
+      if (!Colour::Parse(pReader, pos, size, &colour)) {
         return E_FILE_FORMAT_INVALID;
+      } else {
+        colour_ptr.reset(colour);
+      }
     } else if (id == libwebm::kMkvProjection) {
-      if (!Projection::Parse(pReader, pos, size, &projection))
+      Projection* projection = NULL;
+      if (!Projection::Parse(pReader, pos, size, &projection)) {
         return E_FILE_FORMAT_INVALID;
+      } else {
+        projection_ptr.reset(projection);
+      }
+    } else if (id == libwebm::kMkvColourSpace) {
+      const long status = UnserializeString(pReader, pos, size, colour_space);
+      if (status < 0)
+        return status;
     }
 
     pos += size;  // consume payload
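
On the read side the element surfaces through the new VideoTrack::GetColourSpace() accessor declared in mkvparser.h further below. A sketch that assumes a mkvparser::Segment has already been loaded in the usual sample-code fashion:

    #include "mkvparser/mkvparser.h"

    const char* FirstVideoColourSpace(const mkvparser::Segment* segment) {
      const mkvparser::Tracks* const tracks = segment->GetTracks();
      if (!tracks)
        return NULL;
      for (unsigned long i = 0; i < tracks->GetTracksCount(); ++i) {
        const mkvparser::Track* const track = tracks->GetTrackByIndex(i);
        if (track && track->GetType() == mkvparser::Track::kVideo) {
          const mkvparser::VideoTrack* const video =
              static_cast<const mkvparser::VideoTrack*>(track);
          return video->GetColourSpace();  // may be NULL if absent
        }
      }
      return NULL;
    }

@@ -5392,8 +5417,9 @@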
pTrack->m_display_unit = display_unit; pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; - pTrack->m_colour = colour; - pTrack->m_projection = projection; + pTrack->m_colour = colour_ptr.release(); + pTrack->m_colour_space = colour_space; + pTrack->m_projection = projection_ptr.release(); pResult = pTrack; return 0; // success @@ -7903,6 +7929,10 @@ long Block::Parse(const Cluster* pCluster) { return E_FILE_FORMAT_INVALID; curr.len = static_cast(frame_size); + // Check if size + curr.len could overflow. + if (size > LLONG_MAX - curr.len) { + return E_FILE_FORMAT_INVALID; + } size += curr.len; // contribution of this frame --frame_count; @@ -7964,6 +7994,11 @@ long long Block::GetTimeCode(const Cluster* pCluster) const { const long long tc0 = pCluster->GetTimeCode(); assert(tc0 >= 0); + // Check if tc0 + m_timecode would overflow. + if (tc0 < 0 || LLONG_MAX - tc0 < m_timecode) { + return -1; + } + const long long tc = tc0 + m_timecode; return tc; // unscaled timecode units @@ -7981,6 +8016,10 @@ long long Block::GetTime(const Cluster* pCluster) const { const long long scale = pInfo->GetTimeCodeScale(); assert(scale >= 1); + // Check if tc * scale could overflow. + if (tc != 0 && scale > LLONG_MAX / tc) { + return -1; + } const long long ns = tc * scale; return ns; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h index 26c2b7e5ebf..848d01f03ec 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvparser.h @@ -22,7 +22,7 @@ class IMkvReader { virtual int Length(long long* total, long long* available) = 0; protected: - virtual ~IMkvReader(); + virtual ~IMkvReader() {} }; template @@ -527,6 +527,8 @@ class VideoTrack : public Track { Projection* GetProjection() const; + const char* GetColourSpace() const { return m_colour_space; } + private: long long m_width; long long m_height; @@ -534,7 +536,7 @@ class VideoTrack : public Track { long long m_display_height; long long m_display_unit; long long m_stereo_mode; - + char* m_colour_space; double m_rate; Colour* m_colour; diff --git a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index 23d68f50891..9d19c1be569 100644 --- a/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libs/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -118,6 +118,8 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error +#elif defined(_WIN32) + fseeko64(m_file, static_cast(offset), SEEK_SET); #else fseeko(m_file, static_cast(offset), SEEK_SET); #endif diff --git a/libs/libvpx/third_party/libyuv/LICENSE b/libs/libvpx/third_party/libyuv/LICENSE new file mode 100644 index 00000000000..c911747a6b5 --- /dev/null +++ b/libs/libvpx/third_party/libyuv/LICENSE @@ -0,0 +1,29 @@ +Copyright 2011 The LibYuv Project Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. 
+ + * Neither the name of Google nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/libs/libvpx/third_party/libyuv/README.libvpx b/libs/libvpx/third_party/libyuv/README.libvpx index 485f79c0ff7..9519dc4beed 100644 --- a/libs/libvpx/third_party/libyuv/README.libvpx +++ b/libs/libvpx/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv -Version: de944ed8c74909ea6fbd743a22efe1e55e851b83 +Version: a37e7bfece9e0676ae90a1700b0ec85b0f4f22a1 License: BSD License File: LICENSE @@ -8,15 +8,16 @@ Description: libyuv is an open source project that includes YUV conversion and scaling functionality. -The optimized scaler in libyuv is used in multiple resolution encoder example, -which down-samples the original input video (f.g. 1280x720) a number of times -in order to encode multiple resolution bit streams. +The optimized scaler in libyuv is used in the multiple resolution encoder +example which down-samples the original input video (f.g. 1280x720) a number of +times in order to encode multiple resolution bit streams. Local Modifications: -rm -rf .gitignore .gn AUTHORS Android.mk BUILD.gn CMakeLists.txt DEPS LICENSE \ - LICENSE_THIRD_PARTY OWNERS PATENTS PRESUBMIT.py README.chromium README.md \ - all.gyp build_overrides/ chromium/ codereview.settings docs/ \ - download_vs_toolchain.py gyp_libyuv gyp_libyuv.py include/libyuv.h \ - include/libyuv/compare_row.h libyuv.gyp libyuv.gypi libyuv_nacl.gyp \ - libyuv_test.gyp linux.mk public.mk setup_links.py sync_chromium.py \ - third_party/ tools/ unit_test/ util/ winarm.mk +Disable ARGBToRGB24Row_AVX512VBMI due to build failure on Mac. +rm libyuv/include/libyuv.h libyuv/include/libyuv/compare_row.h +mv libyuv/include tmp/ +mv libyuv/source tmp/ +mv libyuv/LICENSE tmp/ +rm -rf libyuv + +mv tmp/* third_party/libyuv/ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h b/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h index 54a2181430c..01d9dfc7736 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/basic_types.h @@ -8,82 +8,36 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_BASIC_TYPES_H_ #define INCLUDE_LIBYUV_BASIC_TYPES_H_ -#include // for NULL, size_t +#include // For size_t and NULL + +#if !defined(INT_TYPES_DEFINED) && !defined(GG_LONGLONG) +#define INT_TYPES_DEFINED #if defined(_MSC_VER) && (_MSC_VER < 1600) #include // for uintptr_t on x86 +typedef unsigned __int64 uint64_t; +typedef __int64 int64_t; +typedef unsigned int uint32_t; +typedef int int32_t; +typedef unsigned short uint16_t; +typedef short int16_t; +typedef unsigned char uint8_t; +typedef signed char int8_t; #else -#include // for uintptr_t -#endif - -#ifndef GG_LONGLONG -#ifndef INT_TYPES_DEFINED -#define INT_TYPES_DEFINED -#ifdef COMPILER_MSVC -typedef unsigned __int64 uint64; -typedef __int64 int64; -#ifndef INT64_C -#define INT64_C(x) x ## I64 -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## UI64 -#endif -#define INT64_F "I64" -#else // COMPILER_MSVC -#if defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) -typedef unsigned long uint64; // NOLINT -typedef long int64; // NOLINT -#ifndef INT64_C -#define INT64_C(x) x ## L -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## UL -#endif -#define INT64_F "l" -#else // defined(__LP64__) && !defined(__OpenBSD__) && !defined(__APPLE__) -typedef unsigned long long uint64; // NOLINT -typedef long long int64; // NOLINT -#ifndef INT64_C -#define INT64_C(x) x ## LL -#endif -#ifndef UINT64_C -#define UINT64_C(x) x ## ULL -#endif -#define INT64_F "ll" -#endif // __LP64__ -#endif // COMPILER_MSVC -typedef unsigned int uint32; -typedef int int32; -typedef unsigned short uint16; // NOLINT -typedef short int16; // NOLINT -typedef unsigned char uint8; -typedef signed char int8; +#include // for uintptr_t and C99 types +#endif // defined(_MSC_VER) && (_MSC_VER < 1600) +typedef uint64_t uint64; +typedef int64_t int64; +typedef uint32_t uint32; +typedef int32_t int32; +typedef uint16_t uint16; +typedef int16_t int16; +typedef uint8_t uint8; +typedef int8_t int8; #endif // INT_TYPES_DEFINED -#endif // GG_LONGLONG - -// Detect compiler is for x86 or x64. -#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) -#define CPU_X86 1 -#endif -// Detect compiler is for ARM. -#if defined(__arm__) || defined(_M_ARM) -#define CPU_ARM 1 -#endif - -#ifndef ALIGNP -#ifdef __cplusplus -#define ALIGNP(p, t) \ - (reinterpret_cast(((reinterpret_cast(p) + \ - ((t) - 1)) & ~((t) - 1)))) -#else -#define ALIGNP(p, t) \ - ((uint8*)((((uintptr_t)(p) + ((t) - 1)) & ~((t) - 1)))) /* NOLINT */ -#endif -#endif #if !defined(LIBYUV_API) #if defined(_WIN32) || defined(__CYGWIN__) @@ -95,24 +49,17 @@ typedef signed char int8; #define LIBYUV_API #endif // LIBYUV_BUILDING_SHARED_LIBRARY #elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__APPLE__) && \ - (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ - defined(LIBYUV_USING_SHARED_LIBRARY)) -#define LIBYUV_API __attribute__ ((visibility ("default"))) + (defined(LIBYUV_BUILDING_SHARED_LIBRARY) || \ + defined(LIBYUV_USING_SHARED_LIBRARY)) +#define LIBYUV_API __attribute__((visibility("default"))) #else #define LIBYUV_API #endif // __GNUC__ #endif // LIBYUV_API +// TODO(fbarchard): Remove bool macros. #define LIBYUV_BOOL int #define LIBYUV_FALSE 0 #define LIBYUV_TRUE 1 -// Visual C x86 or GCC little endian. 
-#if defined(__x86_64__) || defined(_M_X64) || \ - defined(__i386__) || defined(_M_IX86) || \ - defined(__arm__) || defined(_M_ARM) || \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define LIBYUV_LITTLE_ENDIAN -#endif - -#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ NOLINT +#endif // INCLUDE_LIBYUV_BASIC_TYPES_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/compare.h b/libs/libvpx/third_party/libyuv/include/libyuv/compare.h index 08b2bb2ecf4..3353ad71c68 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/compare.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/compare.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_COMPARE_H_ #define INCLUDE_LIBYUV_COMPARE_H_ #include "libyuv/basic_types.h" @@ -20,59 +20,92 @@ extern "C" { // Compute a hash for specified memory. Seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed); +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed); + +// Hamming Distance +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count); // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height); +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height); // Sum Square Error - used to compute Mean Square Error or PSNR. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, - const uint8* src_b, int count); +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count); LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); static const int kMaxPsnr = 128; LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count); +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count); LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height); +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height); LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int 
stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height); +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT +#endif // INCLUDE_LIBYUV_COMPARE_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert.h index fcfcf544e12..d12ef24f799 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_H_ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" @@ -16,8 +16,8 @@ #include "libyuv/rotate.h" // For enum RotationMode. // TODO(fbarchard): fix WebRTC source to include following libyuv headers: -#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 -#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 +#include "libyuv/convert_argb.h" // For WebRTC I420ToARGB. b/620 +#include "libyuv/convert_from.h" // For WebRTC ConvertFromI420. b/620 #include "libyuv/planar_functions.h" // For WebRTC I420Rect, CopyPlane. b/618 #ifdef __cplusplus @@ -27,195 +27,335 @@ extern "C" { // Convert I444 to I420. LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert I422 to I420. LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert I411 to I420. -LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Copy I420 to I420. 
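Before the diff continues with the I420 copy alias, a quick orientation on the retyped compare.h API above: a minimal PSNR sketch, assuming two same-sized, tightly packed 8-bit frames (the helper name and layout are illustrative, not part of the patch).

#include <stdint.h>
#include "libyuv/compare.h"

/* PSNR between two frames; identical buffers yield kMaxPsnr (128). */
double frame_psnr(const uint8_t* a, const uint8_t* b, int width, int height) {
  uint64_t count = (uint64_t)width * height;            /* bytes compared */
  uint64_t sse = ComputeSumSquareError(a, b, width * height);
  return SumSquareErrorToPsnr(sse, count);
}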
#define I420ToI420 I420Copy LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Copy I010 to I010 +#define I010ToI010 I010Copy +#define H010ToH010 I010Copy +LIBYUV_API +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); + +// Convert 10 bit YUV to 8 bit +#define H010ToH420 I010ToI420 +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert I400 (grey) to I420. LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); #define J400ToJ420 I400ToI420 // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert NV21 to I420. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert UYVY to I420. 
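The UYVY converter resumes below; first, a hedged sketch of the new 10-bit I010ToI420 path declared above. It assumes tightly packed planes with strides given per sample, which matches how the 16-bit paths step their row pointers; treat that as an assumption to verify.

#include <stdint.h>
#include "libyuv/convert.h"

/* Hypothetical down-conversion: 10-bit I010 (uint16_t samples) to 8-bit I420. */
int i010_frame_to_i420(const uint16_t* y10, const uint16_t* u10,
                       const uint16_t* v10, uint8_t* y8, uint8_t* u8,
                       uint8_t* v8, int w, int h) {
  int cw = (w + 1) / 2;  /* chroma width, rounded up for odd sizes */
  return I010ToI420(y10, w, u10, cw, v10, cw, y8, w, u8, cw, v8, cw, w, h);
}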
LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert Android420 to I420. LIBYUV_API -int Android420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int pixel_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // ARGB little endian (bgra in memory) to I420. LIBYUV_API -int ARGBToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // BGRA little endian (argb in memory) to I420. LIBYUV_API -int BGRAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // ABGR little endian (rgba in memory) to I420. LIBYUV_API -int ABGRToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGBA little endian (abgr in memory) to I420. LIBYUV_API -int RGBAToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB little endian (bgr in memory) to I420. 
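Before the RGB24 declaration below, a usage note for the ARGB-family converters above: a minimal allocate-and-convert sketch. The helper name and malloc strategy are illustrative assumptions.

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/convert.h"

/* Convert a tightly packed ARGB (BGRA-in-memory) frame to freshly allocated I420. */
int argb_frame_to_i420(const uint8_t* argb, int w, int h,
                       uint8_t** y, uint8_t** u, uint8_t** v) {
  int cw = (w + 1) / 2, ch = (h + 1) / 2;  /* chroma plane dimensions */
  *y = malloc((size_t)w * h);
  *u = malloc((size_t)cw * ch);
  *v = malloc((size_t)cw * ch);
  if (!*y || !*u || !*v) return -1;        /* caller frees whatever was allocated */
  return ARGBToI420(argb, w * 4, *y, w, *u, cw, *v, cw, w, h);
}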
LIBYUV_API -int RGB24ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB big endian (rgb in memory) to I420. LIBYUV_API -int RAWToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB16 (RGBP fourcc) little endian to I420. LIBYUV_API -int RGB565ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB15 (RGBO fourcc) little endian to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB12 (R444 fourcc) little endian to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture. // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToI420(const uint8* sample, size_t sample_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height); // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height); +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height); #endif // Convert camera sample to I420 with cropping, rotation and vertical flip. @@ -238,22 +378,29 @@ int MJPGSize(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
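Putting the parameter description above into practice before the declaration follows: a hedged sketch of ConvertToI420 rotating a packed YUY2 capture upright. The 90-degree rotation swaps the destination dimensions; FOURCC() comes from libyuv/video_common.h and kRotate90 from libyuv/rotate.h, and the function names here are illustrative.

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert.h"
#include "libyuv/video_common.h"

/* Hypothetical capture path: full frame (no crop offset), rotated 90 degrees. */
int yuy2_capture_to_i420(const uint8_t* sample, size_t sample_size,
                         uint8_t* y, uint8_t* u, uint8_t* v,
                         int src_w, int src_h) {
  int dst_w = src_h;  /* rotated output is src_h x src_w */
  return ConvertToI420(sample, sample_size,
                       y, dst_w, u, (dst_w + 1) / 2, v, (dst_w + 1) / 2,
                       0, 0, src_w, src_h, src_w, src_h,
                       kRotate90, FOURCC('Y', 'U', 'Y', '2'));
}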
LIBYUV_API -int ConvertToI420(const uint8* src_frame, size_t src_size, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToI420(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 format); + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h index 19672f32690..ab772b6c323 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_ARGB_H_ #include "libyuv/basic_types.h" @@ -30,258 +30,621 @@ extern "C" { // Copy ARGB to ARGB. LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Duplicate prototype for function in convert_from.h for remoting. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ARGB. +LIBYUV_API +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to ABGR. +LIBYUV_API +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. 
+LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert H010 to ABGR. +LIBYUV_API +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J444 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I444 to ABGR. LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); - -// Convert I411 to ARGB. -LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I420 with Alpha to preattenuated ARGB. LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate); +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate); // Convert I420 with Alpha to preattenuated ABGR. 
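The ABGR variant's declaration follows below; as a companion, a sketch of the ARGB form with premultiplication enabled, under the usual tight-packing assumption (names are illustrative).

#include <stdint.h>
#include "libyuv/convert_argb.h"

/* Hypothetical compositing input: I420 plus a full-resolution alpha plane. */
int yuva_to_premultiplied_argb(const uint8_t* y, const uint8_t* u,
                               const uint8_t* v, const uint8_t* a,
                               uint8_t* argb, int w, int h) {
  int cw = (w + 1) / 2;
  /* attenuate=1 premultiplies RGB by alpha during the conversion. */
  return I420AlphaToARGB(y, w, u, cw, v, cw, a, w, argb, w * 4, w, h, 1);
}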
LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate); +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate); // Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J400 (jpeg grey) to ARGB. LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Alias. #define YToARGB I400ToARGB // Convert NV12 to ARGB. LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert NV21 to ARGB. LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert NV12 to ABGR. +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV21 to ABGR. +LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert YUY2 to ARGB. LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert UYVY to ARGB. 
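Before the UYVY declaration below, a note on the additions in this hunk: direct NV12/NV21-to-RGB24 paths. A minimal sketch, assuming even dimensions so the interleaved UV stride equals the frame width.

#include <stdint.h>
#include "libyuv/convert_argb.h"

/* Hypothetical decoder output: NV12 (Y plane + interleaved UV) to 24-bit RGB. */
int nv12_frame_to_rgb24(const uint8_t* y, const uint8_t* uv,
                        uint8_t* rgb24, int w, int h) {
  return NV12ToRGB24(y, w, uv, w, rgb24, w * 3, w, h);
}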
LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J422 to ARGB. LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert J420 to ABGR. LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert H422 to ABGR. 
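The H422 declaration continues below. For context, the J/H prefixes in this hunk encode colorspace rather than layout: J420 is full-range BT.601 as produced by JPEG, H420 is BT.709 for HD content, and plain I420 is studio-range BT.601. A hedged dispatch sketch, with illustrative flag names:

#include <stdint.h>
#include "libyuv/convert_argb.h"

int planar_to_argb(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                   uint8_t* argb, int w, int h, int is_jpeg, int is_hd) {
  int cw = (w + 1) / 2;
  if (is_jpeg) return J420ToARGB(y, w, u, cw, v, cw, argb, w * 4, w, h);
  if (is_hd) return H420ToARGB(y, w, u, cw, v, cw, argb, w * 4, w, h);
  return I420ToARGB(y, w, u, cw, v, cw, argb, w * 4, w, h);
}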
LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert H010 to ARGB. +LIBYUV_API +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert I010 to AB30. +LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); // BGRA little endian (argb in memory) to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // ABGR little endian (rgba in memory) to ARGB. LIBYUV_API -int ABGRToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGBA little endian (abgr in memory) to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Deprecated function name. #define BG24ToARGB RGB24ToARGB // RGB little endian (bgr in memory) to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB big endian (rgb in memory) to ARGB. LIBYUV_API -int RAWToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB16 (RGBP fourcc) little endian to ARGB. 
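Before the RGB565 declaration below: the 10-bit sources above can also keep their precision end to end through the new AR30 (2:10:10:10) packers. A minimal sketch, with per-sample strides assumed for the 16-bit planes.

#include <stdint.h>
#include "libyuv/convert_argb.h"

/* Hypothetical deep-color path: I010 straight to AR30, 4 bytes per pixel. */
int i010_frame_to_ar30(const uint16_t* y, const uint16_t* u, const uint16_t* v,
                       uint8_t* ar30, int w, int h) {
  int cw = (w + 1) / 2;
  return I010ToAR30(y, w, u, cw, v, cw, ar30, w * 4, w, h);
}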
LIBYUV_API -int RGB565ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB15 (RGBO fourcc) little endian to ARGB. LIBYUV_API -int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // RGB12 (R444 fourcc) little endian to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Aliases +#define AB30ToARGB AR30ToABGR +#define AB30ToABGR AR30ToARGB +#define AB30ToAR30 AR30ToAB30 + +// Convert AR30 To ARGB. +LIBYUV_API +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR30 To ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + +// Convert AR30 To AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height); #ifdef HAVE_JPEG // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToARGB(const uint8* sample, size_t sample_size, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, - int dst_width, int dst_height); +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height); #endif +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert Android420 to ABGR. +LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); + // Convert camera sample to ARGB with cropping, rotation and vertical flip. -// "src_size" is needed to parse MJPG. +// "sample_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. // Normally this would be the same as dst_width, with recommended alignment // to 16 bytes for better efficiency. @@ -300,20 +663,25 @@ int MJPGToARGB(const uint8* sample, size_t sample_size, // Must be less than or equal to src_width/src_height // Cropping parameters are pre-rotation. // "rotation" can be 0, 90, 180 or 270. -// "format" is a fourcc. ie 'I420', 'YUY2' +// "fourcc" is a fourcc. ie 'I420', 'YUY2' // Returns 0 for successful; -1 for invalid parameter. Non-zero for failure. 
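Mirroring the ConvertToI420 sketch earlier, and ahead of the declaration below: a hedged ConvertToARGB call for an NV21 preview frame (the Android camera convention), rotated back upright. Names and the rotation choice are illustrative.

#include <stddef.h>
#include <stdint.h>
#include "libyuv/convert_argb.h"
#include "libyuv/video_common.h"

int nv21_preview_to_argb(const uint8_t* sample, size_t sample_size,
                         uint8_t* argb, int src_w, int src_h) {
  int dst_w = src_h;  /* 270-degree rotation swaps dimensions */
  return ConvertToARGB(sample, sample_size, argb, dst_w * 4,
                       0, 0, src_w, src_h, src_w, src_h,
                       kRotate270, FOURCC('N', 'V', '2', '1'));
}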
LIBYUV_API -int ConvertToARGB(const uint8* src_frame, size_t src_size, - uint8* dst_argb, int dst_stride_argb, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 format); + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h index 39e1578a0e3..5cd8a4bfc04 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_H_ #include "libyuv/basic_types.h" @@ -21,159 +21,322 @@ extern "C" { // See Also convert.h for conversions from formats to I420. -// I420Copy in convert to I420ToI420. +// Convert 8 bit YUV to 10 bit. +#define H420ToH010 I420ToI010 +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); +// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. 
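The I400 copy declaration continues below; first, a sketch of the promotion declared at the top of this hunk, I420ToI010 (8-bit up to 10-bit), under the same tight-packing and per-sample-stride assumptions as before.

#include <stdint.h>
#include "libyuv/convert_from.h"

int i420_frame_to_i010(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint16_t* y10, uint16_t* u10, uint16_t* v10,
                       int w, int h) {
  int cw = (w + 1) / 2;
  return I420ToI010(y, w, u, cw, v, cw, y10, w, u10, cw, v10, cw, w, h);
}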
LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); -// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21. LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); 
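A common consumer of the repacking paths above is a hardware encoder that wants semi-planar input; a minimal sketch, where the interleaved UV plane takes twice the chroma width as its stride (helper name is illustrative).

#include <stdint.h>
#include "libyuv/convert_from.h"

/* Hypothetical encoder hand-off: planar I420 to semi-planar NV12. */
int i420_frame_to_nv12(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                       uint8_t* dst_y, uint8_t* dst_uv, int w, int h) {
  int cw = (w + 1) / 2;
  return I420ToNV12(y, w, u, cw, v, cw, dst_y, w, dst_uv, cw * 2, w, h);
}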
LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. 
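To make the dither-matrix convention concrete before the declaration below: a 4x4 ordered (Bayer-style) matrix scaled into the recommended 0..7 range, first byte upper left. The matrix values are a standard choice, not mandated by the API.

#include <stdint.h>
#include "libyuv/convert_from.h"

static const uint8_t kDither4x4[16] = {
    0, 4, 1, 5,  /* row 0, left to right */
    6, 2, 7, 3,
    1, 5, 0, 4,
    7, 3, 6, 2,
};

int i420_to_rgb565_dithered(const uint8_t* y, const uint8_t* u,
                            const uint8_t* v, uint8_t* rgb565, int w, int h) {
  int cw = (w + 1) / 2;
  return I420ToRGB565Dither(y, w, u, cw, v, cw, rgb565, w * 2,
                            kDither4x4, w, h);
}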
LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - const uint8* dither4x4, int width, int height); +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); + +LIBYUV_API +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); + +LIBYUV_API +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); +// Convert I420 to AR30. LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); +// Convert H420 to AR30. LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); // Convert I420 to specified format. // "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the // buffer has contiguous rows. Can be negative. A multiple of 16 is optimal. LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, - uint32 format); +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h index 1df53200dde..05c815a093e 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,170 +21,267 @@ extern "C" { // Copy ARGB to ARGB. 
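convert_from.h closes with the generic fourcc dispatcher; before the ARGB-source converters pick up below, a one-call sketch of ConvertFromI420, passing dst_sample_stride=0 for tightly packed output rows.

#include <stdint.h>
#include "libyuv/convert_from.h"
#include "libyuv/video_common.h"

int i420_to_yuy2_by_fourcc(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* out, int w, int h) {
  int cw = (w + 1) / 2;
  return ConvertFromI420(y, w, u, cw, v, cw, out, 0 /* packed */, w, h,
                         FOURCC('Y', 'U', 'Y', '2'));
}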
#define ARGBToARGB ARGBCopy LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert ARGB To BGRA. LIBYUV_API -int ARGBToBGRA(const uint8* src_argb, int src_stride_argb, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); +int ARGBToBGRA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); // Convert ARGB To ABGR. LIBYUV_API -int ARGBToABGR(const uint8* src_argb, int src_stride_argb, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int ARGBToABGR(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert ARGB To RGBA. LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); + +// Aliases +#define ARGBToAB30 ABGRToAR30 +#define ABGRToAB30 ARGBToAR30 + +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height); +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height); // Convert ARGB To RGB565. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). // Values in dither matrix from 0 to 7 recommended. // The order of the dither matrix is first byte is upper left. // TODO(fbarchard): Consider pointer to 2d array for dither4x4. -// const uint8(*dither)[4][4]; +// const uint8_t(*dither)[4][4]; LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height); +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height); // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height); +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height); // Convert ARGB To ARGB4444. 
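The ARGB4444 declaration follows below. The new AR30 packers on the ARGB side take the same shape as the 8-bit ones; a minimal sketch, noting that both formats are 4 bytes per pixel, so the strides match.

#include <stdint.h>
#include "libyuv/convert_from_argb.h"

/* Hypothetical widening step: 8-bit ARGB to 2:10:10:10 AR30. */
int argb_frame_to_ar30(const uint8_t* argb, uint8_t* ar30, int w, int h) {
  return ARGBToAR30(argb, w * 4, ar30, w * 4, w, h);
}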
LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height); +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height); // Convert ARGB To I444. LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I422. LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB To I420. (also in convert.h) LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J420. (JPeg full range I420). LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J422. LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -// Convert ARGB To I411. -LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert ARGB to J400. (JPeg full range). LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height); +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height); // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Convert ARGB to G. 
(Reverse of J400toARGB, which replicates G back to ARGB) LIBYUV_API -int ARGBToG(const uint8* src_argb, int src_stride_argb, - uint8* dst_g, int dst_stride_g, - int width, int height); +int ARGBToG(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_g, + int dst_stride_g, + int width, + int height); // Convert ARGB To NV12. LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To NV21. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height); +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); // Convert ARGB To YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height); +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); // Convert ARGB To UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height); +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h b/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h index dfb7445e2f1..0229cb5e736 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/cpu_id.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_CPU_ID_H_ #define INCLUDE_LIBYUV_CPU_ID_H_ #include "libyuv/basic_types.h" @@ -31,50 +31,89 @@ static const int kCpuHasX86 = 0x10; static const int kCpuHasSSE2 = 0x20; static const int kCpuHasSSSE3 = 0x40; static const int kCpuHasSSE41 = 0x80; -static const int kCpuHasSSE42 = 0x100; +static const int kCpuHasSSE42 = 0x100; // unused at this time. static const int kCpuHasAVX = 0x200; static const int kCpuHasAVX2 = 0x400; static const int kCpuHasERMS = 0x800; static const int kCpuHasFMA3 = 0x1000; -static const int kCpuHasAVX3 = 0x2000; -// 0x2000, 0x4000, 0x8000 reserved for future X86 flags. 
+static const int kCpuHasF16C = 0x2000; +static const int kCpuHasGFNI = 0x4000; +static const int kCpuHasAVX512BW = 0x8000; +static const int kCpuHasAVX512VL = 0x10000; +static const int kCpuHasAVX512VBMI = 0x20000; +static const int kCpuHasAVX512VBMI2 = 0x40000; +static const int kCpuHasAVX512VBITALG = 0x80000; +static const int kCpuHasAVX512VPOPCNTDQ = 0x100000; // These flags are only valid on MIPS processors. -static const int kCpuHasMIPS = 0x10000; -static const int kCpuHasDSPR2 = 0x20000; +static const int kCpuHasMIPS = 0x200000; +static const int kCpuHasMSA = 0x400000; -// Internal function used to auto-init. +// Optional init function. TestCpuFlag does an auto-init. +// Returns cpu_info flags. LIBYUV_API int InitCpuFlags(void); -// Internal function for parsing /proc/cpuinfo. -LIBYUV_API -int ArmCpuCaps(const char* cpuinfo_name); - // Detect CPU has SSE2 etc. // Test_flag parameter should be one of kCpuHas constants above. -// returns non-zero if instruction set is detected +// Returns non-zero if instruction set is detected static __inline int TestCpuFlag(int test_flag) { LIBYUV_API extern int cpu_info_; - return (!cpu_info_ ? InitCpuFlags() : cpu_info_) & test_flag; +#ifdef __ATOMIC_RELAXED + int cpu_info = __atomic_load_n(&cpu_info_, __ATOMIC_RELAXED); +#else + int cpu_info = cpu_info_; +#endif + return (!cpu_info ? InitCpuFlags() : cpu_info) & test_flag; } +// Internal function for parsing /proc/cpuinfo. +LIBYUV_API +int ArmCpuCaps(const char* cpuinfo_name); + // For testing, allow CPU flags to be disabled. // ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. // MaskCpuFlags(-1) to enable all cpu specific optimizations. // MaskCpuFlags(1) to disable all cpu specific optimizations. +// MaskCpuFlags(0) to reset state so next call will auto init. +// Returns cpu_info flags. LIBYUV_API -void MaskCpuFlags(int enable_flags); +int MaskCpuFlags(int enable_flags); + +// Sets the CPU flags to |cpu_flags|, bypassing the detection code. |cpu_flags| +// should be a valid combination of the kCpuHas constants above and include +// kCpuInitialized. Use this method when running in a sandboxed process where +// the detection code might fail (as it might access /proc/cpuinfo). In such +// cases the cpu_info can be obtained from a non sandboxed process by calling +// InitCpuFlags() and passed to the sandboxed process (via command line +// parameters, IPC...) which can then call this method to initialize the CPU +// flags. +// Notes: +// - when specifying 0 for |cpu_flags|, the auto initialization is enabled +// again. +// - enabling CPU features that are not supported by the CPU will result in +// undefined behavior. +// TODO(fbarchard): consider writing a helper function that translates from +// other library CPU info to libyuv CPU info and add a .md doc that explains +// CPU detection. +static __inline void SetCpuFlags(int cpu_flags) { + LIBYUV_API extern int cpu_info_; +#ifdef __ATOMIC_RELAXED + __atomic_store_n(&cpu_info_, cpu_flags, __ATOMIC_RELAXED); +#else + cpu_info_ = cpu_flags; +#endif +} // Low level cpuid for X86. Returns zeros on other CPUs. // eax is the info type that you want. // ecx is typically the cpu number, and should normally be zero. 
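Before the low-level CpuId entry point below, a sketch of the sandbox workflow the SetCpuFlags comment above describes; the broker/child split and the transport between them are illustrative assumptions.

#include "libyuv/cpu_id.h"

/* Broker (unsandboxed): detection may read /proc/cpuinfo. */
int detect_cpu_flags(void) {
  return InitCpuFlags();
}

/* Sandboxed child: inject the broker's result, then query as usual. */
void apply_cpu_flags(int flags_from_broker) {
  SetCpuFlags(flags_from_broker);  /* bypasses detection entirely */
  if (TestCpuFlag(kCpuHasAVX2)) {
    /* AVX2-accelerated paths are now eligible. */
  }
}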
 LIBYUV_API
-void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info);
+void CpuId(int info_eax, int info_ecx, int* cpu_info);

 #ifdef __cplusplus
 } // extern "C"
 } // namespace libyuv
 #endif

-#endif // INCLUDE_LIBYUV_CPU_ID_H_ NOLINT
+#endif // INCLUDE_LIBYUV_CPU_ID_H_
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h b/libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h
new file mode 100644
index 00000000000..bba0e8aedac
--- /dev/null
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/macros_msa.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright 2016 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MACROS_MSA_H_
+#define INCLUDE_LIBYUV_MACROS_MSA_H_
+
+#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
+#include <msa.h>
+#include <stdint.h>
+
+#if (__mips_isa_rev >= 6)
+#define LW(psrc)                                       \
+  ({                                                   \
+    const uint8_t* psrc_lw_m = (const uint8_t*)(psrc); \
+    uint32_t val_m;                                    \
+    asm volatile("lw %[val_m], %[psrc_lw_m] \n"        \
+                 : [val_m] "=r"(val_m)                 \
+                 : [psrc_lw_m] "m"(*psrc_lw_m));       \
+    val_m;                                             \
+  })
+
+#if (__mips == 64)
+#define LD(psrc)                                       \
+  ({                                                   \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+    uint64_t val_m = 0;                                \
+    asm volatile("ld %[val_m], %[psrc_ld_m] \n"        \
+                 : [val_m] "=r"(val_m)                 \
+                 : [psrc_ld_m] "m"(*psrc_ld_m));       \
+    val_m;                                             \
+  })
+#else  // !(__mips == 64)
+#define LD(psrc)                                       \
+  ({                                                   \
+    const uint8_t* psrc_ld_m = (const uint8_t*)(psrc); \
+    uint32_t val0_m, val1_m;                           \
+ uint64_t val_m = 0; \ + val0_m = LW(psrc_ld_m); \ + val1_m = LW(psrc_ld_m + 4); \ + val_m = (uint64_t)(val1_m); /* NOLINT */ \ + val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); /* NOLINT */ \ + val_m = (uint64_t)(val_m | (uint64_t)val0_m); /* NOLINT */ \ + val_m; \ + }) +#endif // (__mips == 64) + +#define SW(val, pdst) \ + ({ \ + uint8_t* pdst_sw_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val_m = (val); \ + asm volatile("usw %[val_m], %[pdst_sw_m] \n" \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_m] "r"(val_m)); \ + }) + +#define SD(val, pdst) \ + ({ \ + uint8_t* pdst_sd_m = (uint8_t*)(pdst); /* NOLINT */ \ + uint32_t val0_m, val1_m; \ + val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + }) +#endif // (__mips_isa_rev >= 6) + +// TODO(fbarchard): Consider removing __VAR_ARGS versions. +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) /* NOLINT */ +#define LD_UB(...) LD_B(const v16u8, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = (in) /* NOLINT */ +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) + +/* Description : Load two vectors with 16 'byte' sized elements + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Load 16 byte elements in 'out0' from (psrc) + Load 16 byte elements in 'out1' from (psrc + stride) +*/ +#define LD_B2(RTYPE, psrc, stride, out0, out1) \ + { \ + out0 = LD_B(RTYPE, (psrc)); \ + out1 = LD_B(RTYPE, (psrc) + stride); \ + } +#define LD_UB2(...) LD_B2(const v16u8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ + { \ + LD_B2(RTYPE, (psrc), stride, out0, out1); \ + LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + } +#define LD_UB4(...) LD_B4(const v16u8, __VA_ARGS__) + +/* Description : Store two vectors with stride each having 16 'byte' sized + elements + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 16 byte elements from 'in0' to (pdst) + Store 16 byte elements from 'in1' to (pdst + stride) +*/ +#define ST_B2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_B(RTYPE, in0, (pdst)); \ + ST_B(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) + +#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ + { \ + ST_B2(RTYPE, in0, in1, (pdst), stride); \ + ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + } +#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) + +/* Description : Store vectors of 8 halfword elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 8 halfword elements from 'in0' to (pdst) + Store 8 halfword elements from 'in1' to (pdst + stride) +*/ +#define ST_H2(RTYPE, in0, in1, pdst, stride) \ + { \ + ST_H(RTYPE, in0, (pdst)); \ + ST_H(RTYPE, in1, (pdst) + stride); \ + } +#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) + +// TODO(fbarchard): Consider using __msa_vshf_b and __msa_ilvr_b directly. 
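The pre-R6 32-bit fallbacks above build one 64-bit access out of two unaligned word accesses (ulw/usw). A portable C sketch of the same splitting, with memcpy standing in for the inline assembly and assuming the little-endian word layout the macros target; names mirror the macro temporaries.

#include <stdint.h>
#include <string.h>

// What the 32-bit LD() computes: low word from psrc, high word from psrc+4.
static uint64_t ld_portable(const uint8_t* psrc) {
  uint32_t val0_m, val1_m;
  memcpy(&val0_m, psrc, 4);      /* LW(psrc)     */
  memcpy(&val1_m, psrc + 4, 4);  /* LW(psrc + 4) */
  return ((uint64_t)val1_m << 32) | val0_m;
}

// What the 32-bit SD() does: split the value and issue two word stores.
static void sd_portable(uint64_t val, uint8_t* pdst) {
  uint32_t val0_m = (uint32_t)(val & 0xFFFFFFFF);
  uint32_t val1_m = (uint32_t)(val >> 32);
  memcpy(pdst, &val0_m, 4);      /* SW(val0_m, pdst)     */
  memcpy(pdst + 4, &val1_m, 4);  /* SW(val1_m, pdst + 4) */
}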
+/* Description : Shuffle byte vector elements as per mask vector + Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Byte elements from 'in0' & 'in1' are copied selectively to + 'out0' as per control vector 'mask0' +*/ +#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \ + out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \ + } +#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) + +/* Description : Interleave both left and right half of input vectors + Arguments : Inputs - in0, in1 + Outputs - out0, out1 + Return Type - as per RTYPE + Details : Right half of byte elements from 'in0' and 'in1' are + interleaved and written to 'out0' +*/ +#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \ + { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \ + } +#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__) + +#endif /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */ + +#endif // INCLUDE_LIBYUV_MACROS_MSA_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h index 8423121d11e..275f8d4c185 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/mjpeg_decoder.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ #define INCLUDE_LIBYUV_MJPEG_DECODER_H_ #include "libyuv/basic_types.h" @@ -26,25 +26,24 @@ namespace libyuv { extern "C" { #endif -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size); +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); #ifdef __cplusplus } // extern "C" #endif -static const uint32 kUnknownDataSize = 0xFFFFFFFF; +static const uint32_t kUnknownDataSize = 0xFFFFFFFF; enum JpegSubsamplingType { kJpegYuv420, kJpegYuv422, - kJpegYuv411, kJpegYuv444, kJpegYuv400, kJpegUnknown }; struct Buffer { - const uint8* data; + const uint8_t* data; int len; }; @@ -66,7 +65,7 @@ struct SetJmpErrorMgr; class LIBYUV_API MJpegDecoder { public: typedef void (*CallbackFunction)(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows); @@ -86,7 +85,7 @@ class LIBYUV_API MJpegDecoder { // If return value is LIBYUV_TRUE, then the values for all the following // getters are populated. // src_len is the size of the compressed mjpeg frame in bytes. - LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len); + LIBYUV_BOOL LoadFrame(const uint8_t* src, size_t src_len); // Returns width of the last loaded frame in pixels. int GetWidth(); @@ -139,18 +138,22 @@ class LIBYUV_API MJpegDecoder { // at least GetComponentSize(i). The pointers in planes are incremented // to point to after the end of the written data. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. - LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height); + LIBYUV_BOOL DecodeToBuffers(uint8_t** planes, int dst_width, int dst_height); // Decodes the entire image and passes the data via repeated calls to a // callback function. Each call will get the data for a whole number of // image scanlines. // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded. 
- LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height); + LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height); // The helper function which recognizes the jpeg sub-sampling type. static JpegSubsamplingType JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components); + int* subsample_x, + int* subsample_y, + int number_of_components); private: void AllocOutputBuffers(int num_outbufs); @@ -159,7 +162,7 @@ class LIBYUV_API MJpegDecoder { LIBYUV_BOOL StartDecode(); LIBYUV_BOOL FinishDecode(); - void SetScanlinePointers(uint8** data); + void SetScanlinePointers(uint8_t** data); LIBYUV_BOOL DecodeImcuRow(); int GetComponentScanlinePadding(int component); @@ -178,15 +181,15 @@ class LIBYUV_API MJpegDecoder { // Temporaries used to point to scanline outputs. int num_outbufs_; // Outermost size of all arrays below. - uint8*** scanlines_; + uint8_t*** scanlines_; int* scanlines_sizes_; // Temporary buffer used for decoding when we can't decode directly to the // output buffers. Large enough for just one iMCU row. - uint8** databuf_; + uint8_t** databuf_; int* databuf_strides_; }; } // namespace libyuv #endif // __cplusplus -#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT +#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h b/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h index 9662516c573..91137baba25 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/planar_functions.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ #include "libyuv/basic_types.h" @@ -22,102 +22,228 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Move cpu macros to row.h +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) +#define LIBYUV_DISABLE_X86 +#endif +// MemorySanitizer does not support assembly code yet. http://crbug.com/344505 +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#define LIBYUV_DISABLE_X86 +#endif +#endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_ARGBAFFINEROW_SSE2 +#endif + // Copy a plane of data. 
LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height); + +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height); LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height); +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 1024 for 10 bits + int width, + int height); // Set a plane of data to a 32 bit value. LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - uint32 value); +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value); // Split interleaved UV plane into separate U and V planes. LIBYUV_API -void SplitUVPlane(const uint8* src_uv, int src_stride_uv, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Merge separate U and V planes into one interleaved UV plane. LIBYUV_API -void MergeUVPlane(const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Split interleaved RGB plane into separate R, G and B planes. +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +// Merge separate R, G and B planes into one interleaved RGB plane. +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height); // Copy I400. Supports inverting. LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); #define J400ToJ400 I400ToI400 // Copy I422 to I422. #define I422ToI422 I422Copy LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Copy I444 to I444. 
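A hedged example of the split/merge helpers above: deinterleaving an NV12 chroma plane into I420-style U and V with SplitUVPlane(). The wrapper and its buffers are hypothetical; the point is that the function takes the chroma plane's own dimensions, which for 4:2:0 are half the luma width and height.

#include "libyuv/planar_functions.h"

void nv12_chroma_to_i420(const uint8_t* src_uv, int src_stride_uv,
                         uint8_t* dst_u, int dst_stride_u,
                         uint8_t* dst_v, int dst_stride_v,
                         int luma_width, int luma_height) {
  // U and V are subsampled 2x2, so pass the rounded-up half dimensions.
  SplitUVPlane(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v,
               dst_stride_v, (luma_width + 1) / 2, (luma_height + 1) / 2);
}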
#define I444ToI444 I444Copy LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert YUY2 to I422. LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Convert UYVY to I422. LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); - -LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); - -LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height); +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + +LIBYUV_API +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Convert I420 to I400. (calls CopyPlane ignoring u/v). LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Alias #define J420ToJ400 I420ToI400 @@ -125,13 +251,20 @@ int I420ToI400(const uint8* src_y, int src_stride_y, // I420 mirror. 
LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Alias #define I400ToI400Mirror I400Mirror @@ -139,87 +272,139 @@ int I420Mirror(const uint8* src_y, int src_stride_y, // I400 mirror. A single plane is mirrored horizontally. // Pass negative height to achieve 180 degree rotation. LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height); +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Alias #define ARGBToARGBMirror ARGBMirror // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert NV12 to RGB565. LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height); +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height); // I422ToARGB is in convert_argb.h // Convert I422 to BGRA. LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height); +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height); // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height); +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height); // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height); +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height); // Alias #define RGB24ToRAW RAWToRGB24 LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height); +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Draw a rectangle into I420. 
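The negative-height convention noted for I400Mirror() composes the horizontal mirror with a bottom-up read, which is how a single plane can be rotated 180 degrees without a separate rotate call. A sketch with hypothetical buffers:

#include "libyuv/planar_functions.h"

void rotate_plane_180(const uint8_t* src_y, int src_stride_y,
                      uint8_t* dst_y, int dst_stride_y,
                      int width, int height) {
  // Horizontal mirror + vertical flip (negative height) == 180 degrees.
  I400Mirror(src_y, src_stride_y, dst_y, dst_stride_y, width, -height);
}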
LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, int width, int height, - int value_y, int value_u, int value_v); +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v); // Draw a rectangle into ARGB. LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height, uint32 value); +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value); // Convert ARGB to gray scale ARGB. LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); // Make a rectangle of ARGB Sepia tone. LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int x, int y, int width, int height); +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height); // Apply a matrix rotation to each ARGB pixel. // matrix_argb is 4 signed ARGB values. -128 to 127 representing -2 to 2. @@ -228,10 +413,13 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, // The next 4 coefficients apply to B, G, R, A and produce R of the output. // The last 4 coefficients apply to B, G, R, A and produce A of the output. LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int width, int height); +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height); // Deprecated. Use ARGBColorMatrix instead. // Apply a matrix rotation to each ARGB pixel. @@ -240,32 +428,47 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, // The next 4 coefficients apply to B, G, R, A and produce G of the output. // The last 4 coefficients apply to B, G, R, A and produce R of the output. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, - int x, int y, int width, int height); +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height); // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int x, int y, int width, int height); +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. 
LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int x, int y, int width, int height); +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height); // Apply a luma/color table each ARGB pixel but preserve destination alpha. // Table contains 32768 values indexed by [Y][C] where 7 it 7 bit luma from // RGB (YJ style) and C is an 8 bit color component (R, G or B). LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const uint8* luma_rgb_table, - int width, int height); +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height); // Apply a 3 term polynomial to ARGB values. // poly points to a 4x4 matrix. The first row is constants. The 2nd row is @@ -276,46 +479,84 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, // A polynomial approximation can be dirived using software such as 'R'. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, const float* poly, - int width, int height); + int width, + int height); + +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height); + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width); // Quantize a rectangle of ARGB. Alpha unaffected. // scale is a 16 bit fractional fixed point scaler between 0 and 65535. // interval_size should be a value between 1 and 255. // interval_offset should be a value between 0 and 255. LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int x, int y, int width, int height); +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height); // Copy ARGB to ARGB. LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Copy Alpha channel of ARGB to alpha of ARGB. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Extract the alpha channel from ARGB. LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_a, int dst_stride_a, - int width, int height); +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height); // Copy Y channel to Alpha of ARGB. 
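For the new float conversions above, a small ByteToFloat() sketch normalizing one row of 8-bit samples into [0, 1]. HalfFloatPlane() follows the same multiply-by-scale rule for 16-bit planes; a scale of 1.0f/1023.0f for 10-bit input is my assumption of a typical choice, not something the header prescribes.

#include "libyuv/planar_functions.h"

void bytes_to_unit_floats(const uint8_t* src, float* dst, int width) {
  // Each byte is multiplied by scale before the store, per the comment.
  ByteToFloat(src, dst, 1.0f / 255.0f, width);
}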
LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); -typedef void (*ARGBBlendRow)(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width); +typedef void (*ARGBBlendRow)(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); // Get function to Alpha Blend ARGB pixels and store to destination. LIBYUV_API @@ -325,92 +566,143 @@ ARGBBlendRow GetARGBBlend(); // Source is pre-multiplied by alpha using ARGBAttenuate. // Alpha of destination is set to 255. LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Alpha Blend plane and store to destination. // Source is not pre-multiplied by alpha. LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height); +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Alpha Blend YUV images and store to destination. // Source is not pre-multiplied by alpha. // Alpha is full width x height and subsampled to half size to apply to UV. LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height); +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // Multiply ARGB image by ARGB image. Shifted down by 8. Saturates to 255. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Add ARGB image with ARGB image. Saturates to 255. 
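Since ARGBBlend() above expects premultiplied input, a straight-alpha foreground is normally passed through ARGBAttenuate() first; this sketch premultiplies both layers, which is the conservative reading of the comment. Buffer names are hypothetical, and fg_premul/bg_premul must be distinct allocations of the same size as their sources.

#include "libyuv/planar_functions.h"

void blend_over(const uint8_t* fg, int fg_stride, uint8_t* fg_premul,
                const uint8_t* bg, int bg_stride, uint8_t* bg_premul,
                uint8_t* dst, int dst_stride, int width, int height) {
  ARGBAttenuate(fg, fg_stride, fg_premul, fg_stride, width, height);
  ARGBAttenuate(bg, bg_stride, bg_premul, bg_stride, width, height);
  // Destination alpha comes out as 255, per the comment above.
  ARGBBlend(fg_premul, fg_stride, bg_premul, bg_stride, dst, dst_stride,
            width, height);
}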
LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Subtract ARGB image (argb1) from ARGB image (argb0). Saturates to 0. LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert I422 to YUY2. LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height); // Convert I422 to UYVY. LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_frame, int dst_stride_frame, - int width, int height); +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height); // Convert unattentuated ARGB to preattenuated ARGB. LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Internal function - do not call directly. // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height); +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height); // Blur ARGB image. // dst_cumsum table of width * (height + 1) * 16 bytes aligned to @@ -419,103 +711,137 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. // Blur is optimized for radius of 5 (11x11) or less. 
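A usage sketch for the blur declared just below. The scratch buffer follows the sizing rule in the comment (width * (height + 1) * 16 bytes, 16-byte aligned, i.e. four int32 sums per pixel); passing width * 4 as the cumsum stride in int32 units is my reading of that layout rather than something the header states, so treat it as an assumption.

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/planar_functions.h"

int blur_frame(const uint8_t* src_argb, int src_stride_argb,
               uint8_t* dst_argb, int dst_stride_argb,
               int width, int height) {
  int32_t* cumsum =
      (int32_t*)aligned_alloc(16, (size_t)width * (height + 1) * 16);
  if (!cumsum) {
    return -1;
  }
  // Radius 2 blurs over a 5x5 window; radius 5 (11x11) is the tuned maximum.
  int ret = ARGBBlur(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
                     cumsum, width * 4, width, height, 2);
  free(cumsum);
  return ret;
}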
LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius); +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius); // Multiply ARGB image by ARGB value. LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value); +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value); // Interpolate between two images using specified amount of interpolation // (0 to 255) and store to destination. // 'interpolation' is specified as 8 bit fraction where 0 means 100% src0 // and 255 means 1% src0 and 99% src1. LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation); +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation); // Interpolate between two ARGB images using specified amount of interpolation // Internally calls InterpolatePlane with width * 4 (bpp). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation); +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation); // Interpolate between two YUV images using specified amount of interpolation // Internally calls InterpolatePlane on each plane where the U and V planes // are half width and half height. LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation); - -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) -#define LIBYUV_DISABLE_X86 -#endif -// MemorySanitizer does not support assembly code yet. 
http://crbug.com/344505 -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) -#define LIBYUV_DISABLE_X86 -#endif -#endif -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#define HAS_ARGBAFFINEROW_SSE2 -#endif +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation); // Row function for copying pixels from a source with a slope to a row // of destination. Useful for scaling, rotation, mirror, texture mapping. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); +// TODO(fbarchard): Move ARGBAffineRow_SSE2 to row.h LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. // shuffler is 16 bytes and must be aligned. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height); +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height); // Sobel ARGB effect with planar output. LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height); +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height); // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); // Sobel ARGB effect w/ Sobel X, Sobel, Sobel Y in ARGB. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ NOLINT +#endif // INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h index 8af60b89550..76b692be8b0 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_H_ #define INCLUDE_LIBYUV_ROTATE_H_ #include "libyuv/basic_types.h" @@ -20,8 +20,8 @@ extern "C" { // Supported rotation. typedef enum RotationMode { - kRotate0 = 0, // No rotation. - kRotate90 = 90, // Rotate 90 degrees clockwise. + kRotate0 = 0, // No rotation. + kRotate90 = 90, // Rotate 90 degrees clockwise. kRotate180 = 180, // Rotate 180 degrees. kRotate270 = 270, // Rotate 270 degrees clockwise. @@ -33,85 +33,132 @@ typedef enum RotationMode { // Rotate I420 frame. LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); // Rotate NV12 input and store in I420. LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_width, int src_height, enum RotationMode mode); +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int src_width, int src_height, enum RotationMode mode); +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, + enum RotationMode mode); // Rotate planes by 90, 180, 270. Deprecated. LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); // Rotations for when U and V are interleaved. // These functions take one input pointer and // split the data into two buffers while // rotating them. Deprecated. 
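A sketch of I420Rotate() above, with hypothetical buffers. For kRotate90 and kRotate270 the output frame is height pixels wide and width pixels tall, so the minimal destination strides are derived from the source height.

#include "libyuv/rotate.h"

void rotate_i420_90(const uint8_t* src_y, const uint8_t* src_u,
                    const uint8_t* src_v, uint8_t* dst_y, uint8_t* dst_u,
                    uint8_t* dst_v, int width, int height) {
  int half_w = (width + 1) / 2;
  int half_h = (height + 1) / 2;
  // width/height are the source dimensions; the output is height x width.
  I420Rotate(src_y, width, src_u, half_w, src_v, half_w,
             dst_y, height, dst_u, half_h, dst_v, half_h,
             width, height, kRotate90);
}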
LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); // The 90 and 270 functions are based on transposes. // Doing a transpose with reversing the read/write // order will result in a rotation by +- 90 degrees. // Deprecated. LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height); +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h index 660ff5573ec..20432949ab4 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ #define INCLUDE_LIBYUV_ROTATE_ARGB_H_ #include "libyuv/basic_types.h" @@ -21,13 +21,17 @@ extern "C" { // Rotate ARGB frame LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int src_width, int src_height, enum RotationMode mode); +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + enum RotationMode mode); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h index ebc487f9abf..5edc0fcf13a 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/rotate_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ #define INCLUDE_LIBYUV_ROTATE_ROW_H_ #include "libyuv/basic_types.h" @@ -18,10 +18,14 @@ namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) @@ -29,93 +33,162 @@ extern "C" { #endif #endif // The following are available for Visual C and clangcl 32 bit: -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #define HAS_TRANSPOSEWX8_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -// The following are available for GCC 32 or 64 bit but not NaCL for 64 bit: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +// The following are available for GCC 32 or 64 bit: +#if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__)) #define HAS_TRANSPOSEWX8_SSSE3 #endif -// The following are available for 64 bit GCC but not NaCL: -#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ - defined(__x86_64__) +// The following are available for 64 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) #define HAS_TRANSPOSEWX8_FAST_SSSE3 #define HAS_TRANSPOSEUVWX8_SSE2 #endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ +#if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_TRANSPOSEWX8_NEON #define HAS_TRANSPOSEUVWX8_NEON #endif -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_TRANSPOSEWX8_DSPR2 -#define HAS_TRANSPOSEUVWX8_DSPR2 -#endif // defined(__mips__) - -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height); - -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); - -void TransposeWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -void TransposeWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); - -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height); - -void 
TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); - -void TransposeUVWx8_Any_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); -void TransposeUVWx8_Any_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width); +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_TRANSPOSEWX16_MSA +#define HAS_TRANSPOSEUVWX16_MSA +#endif + +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height); + +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void TransposeWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); +void TransposeWx16_Any_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width); + +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height); + +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); + +void TransposeUVWx8_Any_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width); +void TransposeUVWx8_Any_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int 
dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);
+void TransposeUVWx16_Any_MSA(const uint8_t* src,
+                             int src_stride,
+                             uint8_t* dst_a,
+                             int dst_stride_a,
+                             uint8_t* dst_b,
+                             int dst_stride_b,
+                             int width);

 #ifdef __cplusplus
 } // extern "C"
 } // namespace libyuv
 #endif

-#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT
+#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_
diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/row.h b/libs/libvpx/third_party/libyuv/include/libyuv/row.h
index 013a7e53e32..65ef448b8ce 100644
--- a/libs/libvpx/third_party/libyuv/include/libyuv/row.h
+++ b/libs/libvpx/third_party/libyuv/include/libyuv/row.h
@@ -8,7 +8,7 @@
 * be found in the AUTHORS file in the root of the source tree.
 */

-#ifndef INCLUDE_LIBYUV_ROW_H_ // NOLINT
+#ifndef INCLUDE_LIBYUV_ROW_H_
 #define INCLUDE_LIBYUV_ROW_H_

 #include <stdlib.h> // For malloc.

 #include "libyuv/basic_types.h"

@@ -20,41 +20,20 @@ namespace libyuv {
 extern "C" {
 #endif

-#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
-
-#ifdef __cplusplus
-#define align_buffer_64(var, size)                                  \
-  uint8* var##_mem = reinterpret_cast<uint8*>(malloc((size) + 63)); \
-  uint8* var = reinterpret_cast<uint8*>                             \
-      ((reinterpret_cast<intptr_t>(var##_mem) + 63) & ~63)
-#else
-#define align_buffer_64(var, size)                                \
-  uint8* var##_mem = (uint8*)(malloc((size) + 63)); /* NOLINT */  \
-  uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */
-#endif
-
-#define free_aligned_buffer_64(var) \
-  free(var##_mem);                  \
-  var = 0
-
-#if defined(__pnacl__) || defined(__CLR_VER) || \
-    (defined(__i386__) && !defined(__SSE2__))
+#if defined(__pnacl__) || defined(__CLR_VER) ||            \
+    (defined(__native_client__) && defined(__x86_64__)) || \
+    (defined(__i386__) && !defined(__SSE__) && !defined(__clang__))
 #define LIBYUV_DISABLE_X86
 #endif
+#if defined(__native_client__)
+#define LIBYUV_DISABLE_NEON
+#endif
 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
 #define LIBYUV_DISABLE_X86
 #endif
 #endif
-// True if compiling for SSSE3 as a requirement.
-#if defined(__SSSE3__) || (defined(_M_IX86_FP) && (_M_IX86_FP >= 3))
-#define LIBYUV_SSSE3_ONLY
-#endif
-
-#if defined(__native_client__)
-#define LIBYUV_DISABLE_NEON
-#endif
 // clang >= 3.5.0 required for Arm64.
 #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
 #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
@@ -76,9 +55,19 @@ extern "C" {
 #endif // clang >= 3.4
 #endif // __clang__

+// clang >= 6.0.0 required for AVX512.
+// TODO(fbarchard): fix xcode 9 ios b/789.
+#if 0 // Build fails in libvpx on Mac
+#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__))
+#if (__clang_major__ >= 7) && !defined(__APPLE_EMBEDDED_SIMULATOR__)
+#define CLANG_HAS_AVX512 1
+#endif // clang >= 7
+#endif // __clang__
+#endif // 0
+
 // Visual C 2012 required for AVX2.
-#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -90,8 +79,8 @@ extern "C" { #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_ARGBSETROW_X86 -#define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 @@ -104,12 +93,12 @@ extern "C" { #define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYJROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 -#define HAS_ARGBEXTRACTALPHAROW_SSE2 #define HAS_BGRATOUVROW_SSSE3 #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 +#define HAS_HALFFLOATROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 @@ -126,8 +115,10 @@ extern "C" { #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV12TORGB24ROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV21TORGB24ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTORGB24ROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 @@ -180,11 +171,8 @@ extern "C" { // The following functions fail on gcc/clang 32 bit with fpic and framepointer. // caveat: clangcl uses row_win.cc which works. -#if defined(NDEBUG) || !(defined(_DEBUG) && defined(__i386__)) || \ - !defined(__i386__) || defined(_MSC_VER) -// TODO(fbarchard): fix build error on x86 debug -// https://code.google.com/p/libyuv/issues/detail?id=524 -#define HAS_I411TOARGBROW_SSSE3 +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 #define HAS_I422ALPHATOARGBROW_SSSE3 @@ -193,11 +181,12 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. -// The code supports NaCL but requires a new compiler and validator. 
-#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_ARGBCOPYALPHAROW_AVX2 #define HAS_ARGBCOPYYTOALPHAROW_AVX2 +#define HAS_ARGBEXTRACTALPHAROW_AVX2 #define HAS_ARGBMIRRORROW_AVX2 #define HAS_ARGBPOLYNOMIALROW_AVX2 #define HAS_ARGBSHUFFLEROW_AVX2 @@ -208,13 +197,9 @@ extern "C" { #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX #define HAS_H422TOARGBROW_AVX2 +#define HAS_HALFFLOATROW_AVX2 +// #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast #define HAS_I400TOARGBROW_AVX2 -#if !(defined(_DEBUG) && defined(__i386__)) -// TODO(fbarchard): fix build error on android_full_debug=1 -// https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I422ALPHATOARGBROW_AVX2 -#endif -#define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -227,8 +212,10 @@ extern "C" { #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB24ROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB24ROW_AVX2 #define HAS_SPLITUVROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 @@ -246,11 +233,18 @@ extern "C" { #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 #define HAS_BLENDPLANEROW_AVX2 + +#if defined(__x86_64__) || !defined(__pic__) || defined(__clang__) || \ + defined(_MSC_VER) +// TODO(fbarchard): fix build error on android_full_debug=1 +// https://code.google.com/p/libyuv/issues/detail?id=517 +#define HAS_I422ALPHATOARGBROW_AVX2 +#endif #endif // The following are available for AVX2 Visual C and clangcl 32 bit: // TODO(fbarchard): Port to gcc. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2)) #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 @@ -268,6 +262,51 @@ extern "C" { #define HAS_I422TOARGBROW_SSSE3 #endif +// The following are available for gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_CONVERT16TO8ROW_SSSE3 +#define HAS_CONVERT8TO16ROW_SSE2 +// I210 is for H010. 2 = 422. I for 601 vs H for 709. 
+#define HAS_I210TOAR30ROW_SSSE3 +#define HAS_I210TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 +#define HAS_MERGERGBROW_SSSE3 +#define HAS_SPLITRGBROW_SSSE3 +#endif + +// The following are available for AVX2 gcc/clang x86 platforms: +// TODO(fbarchard): Port to Visual C +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ABGRTOAR30ROW_AVX2 +#define HAS_ARGBTOAR30ROW_AVX2 +#define HAS_ARGBTORAWROW_AVX2 +#define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_CONVERT16TO8ROW_AVX2 +#define HAS_CONVERT8TO16ROW_AVX2 +#define HAS_I210TOAR30ROW_AVX2 +#define HAS_I210TOARGBROW_AVX2 +#define HAS_I422TOAR30ROW_AVX2 +#define HAS_I422TOUYVYROW_AVX2 +#define HAS_I422TOYUY2ROW_AVX2 +#define HAS_MERGEUVROW_16_AVX2 +#define HAS_MULTIPLYROW_16_AVX2 +#endif + +// The following are available for AVX512 clang x86 platforms: +// TODO(fbarchard): Port to GCC and Visual C +// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789 +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ + (defined(CLANG_HAS_AVX512)) +#define HAS_ARGBTORGB24ROW_AVX512VBMI +#endif + // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) @@ -279,6 +318,7 @@ extern "C" { #define HAS_ARGB4444TOARGBROW_NEON #define HAS_ARGB4444TOUVROW_NEON #define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_ARGBSETROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON @@ -286,18 +326,17 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOUV411ROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON -#define HAS_ARGBEXTRACTALPHAROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON +#define HAS_BYTETOFLOATROW_NEON #define HAS_COPYROW_NEON +#define HAS_HALFFLOATROW_NEON #define HAS_I400TOARGBROW_NEON -#define HAS_I411TOARGBROW_NEON #define HAS_I422ALPHATOARGBROW_NEON #define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TOARGB4444ROW_NEON @@ -313,8 +352,10 @@ extern "C" { #define HAS_MIRRORROW_NEON #define HAS_MIRRORUVROW_NEON #define HAS_NV12TOARGBROW_NEON +#define HAS_NV12TORGB24ROW_NEON #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TOARGBROW_NEON +#define HAS_NV21TORGB24ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTORGB24ROW_NEON #define HAS_RAWTOUVROW_NEON @@ -328,6 +369,7 @@ extern "C" { #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON +#define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON @@ -359,17 +401,87 @@ extern "C" { #define HAS_SOBELYROW_NEON #endif -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) -#define HAS_COPYROW_MIPS -#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_I422TOARGBROW_DSPR2 -#define HAS_INTERPOLATEROW_DSPR2 -#define HAS_MIRRORROW_DSPR2 -#define HAS_MIRRORUVROW_DSPR2 -#define HAS_SPLITUVROW_DSPR2 +// The following are available on AArch64 platforms: +#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_SCALESUMSAMPLES_NEON 
#endif +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_ABGRTOUVROW_MSA +#define HAS_ABGRTOYROW_MSA +#define HAS_ARGB1555TOARGBROW_MSA +#define HAS_ARGB1555TOUVROW_MSA +#define HAS_ARGB1555TOYROW_MSA +#define HAS_ARGB4444TOARGBROW_MSA +#define HAS_ARGBADDROW_MSA +#define HAS_ARGBATTENUATEROW_MSA +#define HAS_ARGBBLENDROW_MSA +#define HAS_ARGBCOLORMATRIXROW_MSA +#define HAS_ARGBEXTRACTALPHAROW_MSA +#define HAS_ARGBGRAYROW_MSA +#define HAS_ARGBMIRRORROW_MSA +#define HAS_ARGBMULTIPLYROW_MSA +#define HAS_ARGBQUANTIZEROW_MSA +#define HAS_ARGBSEPIAROW_MSA +#define HAS_ARGBSETROW_MSA +#define HAS_ARGBSHADEROW_MSA +#define HAS_ARGBSHUFFLEROW_MSA +#define HAS_ARGBSUBTRACTROW_MSA +#define HAS_ARGBTOARGB1555ROW_MSA +#define HAS_ARGBTOARGB4444ROW_MSA +#define HAS_ARGBTORAWROW_MSA +#define HAS_ARGBTORGB24ROW_MSA +#define HAS_ARGBTORGB565DITHERROW_MSA +#define HAS_ARGBTORGB565ROW_MSA +#define HAS_ARGBTOUV444ROW_MSA +#define HAS_ARGBTOUVJROW_MSA +#define HAS_ARGBTOUVROW_MSA +#define HAS_ARGBTOYJROW_MSA +#define HAS_ARGBTOYROW_MSA +#define HAS_BGRATOUVROW_MSA +#define HAS_BGRATOYROW_MSA +#define HAS_HALFFLOATROW_MSA +#define HAS_I400TOARGBROW_MSA +#define HAS_I422ALPHATOARGBROW_MSA +#define HAS_I422TOARGBROW_MSA +#define HAS_I422TORGB24ROW_MSA +#define HAS_I422TORGBAROW_MSA +#define HAS_I422TOUYVYROW_MSA +#define HAS_I422TOYUY2ROW_MSA +#define HAS_I444TOARGBROW_MSA +#define HAS_INTERPOLATEROW_MSA +#define HAS_J400TOARGBROW_MSA +#define HAS_MERGEUVROW_MSA +#define HAS_MIRRORROW_MSA +#define HAS_MIRRORUVROW_MSA +#define HAS_NV12TOARGBROW_MSA +#define HAS_NV12TORGB565ROW_MSA +#define HAS_NV21TOARGBROW_MSA +#define HAS_RAWTOARGBROW_MSA +#define HAS_RAWTORGB24ROW_MSA +#define HAS_RAWTOUVROW_MSA +#define HAS_RAWTOYROW_MSA +#define HAS_RGB24TOARGBROW_MSA +#define HAS_RGB24TOUVROW_MSA +#define HAS_RGB24TOYROW_MSA +#define HAS_RGB565TOARGBROW_MSA +#define HAS_RGB565TOUVROW_MSA +#define HAS_RGB565TOYROW_MSA +#define HAS_RGBATOUVROW_MSA +#define HAS_RGBATOYROW_MSA +#define HAS_SETROW_MSA +#define HAS_SOBELROW_MSA +#define HAS_SOBELTOPLANEROW_MSA +#define HAS_SOBELXROW_MSA +#define HAS_SOBELXYROW_MSA +#define HAS_SOBELYROW_MSA +#define HAS_SPLITUVROW_MSA +#define HAS_UYVYTOARGBROW_MSA +#define HAS_UYVYTOUVROW_MSA +#define HAS_UYVYTOYROW_MSA +#define HAS_YUY2TOARGBROW_MSA +#define HAS_YUY2TOUV422ROW_MSA +#define HAS_YUY2TOUVROW_MSA +#define HAS_YUY2TOYROW_MSA #endif #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__) @@ -378,18 +490,18 @@ extern "C" { #else #define SIMD_ALIGNED(var) __declspec(align(16)) var #endif -typedef __declspec(align(16)) int16 vec16[8]; -typedef __declspec(align(16)) int32 vec32[4]; -typedef __declspec(align(16)) int8 vec8[16]; -typedef __declspec(align(16)) uint16 uvec16[8]; -typedef __declspec(align(16)) uint32 uvec32[4]; -typedef __declspec(align(16)) uint8 uvec8[16]; -typedef __declspec(align(32)) int16 lvec16[16]; -typedef __declspec(align(32)) int32 lvec32[8]; -typedef __declspec(align(32)) int8 lvec8[32]; -typedef __declspec(align(32)) uint16 ulvec16[16]; -typedef __declspec(align(32)) uint32 ulvec32[8]; -typedef __declspec(align(32)) uint8 ulvec8[32]; +typedef __declspec(align(16)) int16_t vec16[8]; +typedef __declspec(align(16)) int32_t vec32[4]; +typedef __declspec(align(16)) int8_t vec8[16]; +typedef __declspec(align(16)) uint16_t uvec16[8]; +typedef __declspec(align(16)) uint32_t uvec32[4]; +typedef __declspec(align(16)) uint8_t uvec8[16]; +typedef __declspec(align(32)) int16_t lvec16[16]; +typedef __declspec(align(32)) int32_t lvec32[8]; 
+typedef __declspec(align(32)) int8_t lvec8[32]; +typedef __declspec(align(32)) uint16_t ulvec16[16]; +typedef __declspec(align(32)) uint32_t ulvec32[8]; +typedef __declspec(align(32)) uint8_t ulvec8[32]; #elif !defined(__pnacl__) && (defined(__GNUC__) || defined(__clang__)) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. #if defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2) @@ -397,32 +509,32 @@ typedef __declspec(align(32)) uint8 ulvec8[32]; #else #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) #endif -typedef int16 __attribute__((vector_size(16))) vec16; -typedef int32 __attribute__((vector_size(16))) vec32; -typedef int8 __attribute__((vector_size(16))) vec8; -typedef uint16 __attribute__((vector_size(16))) uvec16; -typedef uint32 __attribute__((vector_size(16))) uvec32; -typedef uint8 __attribute__((vector_size(16))) uvec8; -typedef int16 __attribute__((vector_size(32))) lvec16; -typedef int32 __attribute__((vector_size(32))) lvec32; -typedef int8 __attribute__((vector_size(32))) lvec8; -typedef uint16 __attribute__((vector_size(32))) ulvec16; -typedef uint32 __attribute__((vector_size(32))) ulvec32; -typedef uint8 __attribute__((vector_size(32))) ulvec8; +typedef int16_t __attribute__((vector_size(16))) vec16; +typedef int32_t __attribute__((vector_size(16))) vec32; +typedef int8_t __attribute__((vector_size(16))) vec8; +typedef uint16_t __attribute__((vector_size(16))) uvec16; +typedef uint32_t __attribute__((vector_size(16))) uvec32; +typedef uint8_t __attribute__((vector_size(16))) uvec8; +typedef int16_t __attribute__((vector_size(32))) lvec16; +typedef int32_t __attribute__((vector_size(32))) lvec32; +typedef int8_t __attribute__((vector_size(32))) lvec8; +typedef uint16_t __attribute__((vector_size(32))) ulvec16; +typedef uint32_t __attribute__((vector_size(32))) ulvec32; +typedef uint8_t __attribute__((vector_size(32))) ulvec8; #else #define SIMD_ALIGNED(var) var -typedef int16 vec16[8]; -typedef int32 vec32[4]; -typedef int8 vec8[16]; -typedef uint16 uvec16[8]; -typedef uint32 uvec32[4]; -typedef uint8 uvec8[16]; -typedef int16 lvec16[16]; -typedef int32 lvec32[8]; -typedef int8 lvec8[32]; -typedef uint16 ulvec16[16]; -typedef uint32 ulvec32[8]; -typedef uint8 ulvec8[32]; +typedef int16_t vec16[8]; +typedef int32_t vec32[4]; +typedef int8_t vec8[16]; +typedef uint16_t uvec16[8]; +typedef uint32_t uvec32[4]; +typedef uint8_t uvec8[16]; +typedef int16_t lvec16[16]; +typedef int32_t lvec32[8]; +typedef int8_t lvec8[32]; +typedef uint16_t ulvec16[16]; +typedef uint32_t ulvec32[8]; +typedef uint8_t ulvec8[32]; #endif #if defined(__aarch64__) @@ -446,23 +558,23 @@ struct YuvConstants { #else // This struct is for Intel color conversion. 
struct YuvConstants { - int8 kUVToB[32]; - int8 kUVToG[32]; - int8 kUVToR[32]; - int16 kUVBiasB[16]; - int16 kUVBiasG[16]; - int16 kUVBiasR[16]; - int16 kYToRgb[16]; + int8_t kUVToB[32]; + int8_t kUVToG[32]; + int8_t kUVToR[32]; + int16_t kUVBiasB[16]; + int16_t kUVBiasG[16]; + int16_t kUVBiasR[16]; + int16_t kYToRgb[16]; }; // Offsets into YuvConstants structure -#define KUVTOB 0 -#define KUVTOG 32 -#define KUVTOR 64 +#define KUVTOB 0 +#define KUVTOG 32 +#define KUVTOR 64 #define KUVBIASB 96 #define KUVBIASG 128 #define KUVBIASR 160 -#define KYTORGB 192 +#define KYTORGB 192 #endif // Conversion matrix for YUV to RGB @@ -475,6 +587,16 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants); // BT.601 extern const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants); // JPeg extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) + +#define align_buffer_64(var, size) \ + uint8_t* var##_mem = (uint8_t*)(malloc((size) + 63)); /* NOLINT */ \ + uint8_t* var = (uint8_t*)(((intptr_t)(var##_mem) + 63) & ~63) /* NOLINT */ + +#define free_aligned_buffer_64(var) \ + free(var##_mem); \ + var = 0 + #if defined(__APPLE__) || defined(__x86_64__) || defined(__llvm__) #define OMITFP #else @@ -487,1458 +609,2863 @@ extern const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants); // BT.709 #else #define LABELALIGN #endif -#if defined(__native_client__) && defined(__x86_64__) -// r14 is used for MEMOP macros. -#define NACL_R14 "r14", -#define BUNDLELOCK ".bundle_lock\n" -#define BUNDLEUNLOCK ".bundle_unlock\n" -#define MEMACCESS(base) "%%nacl:(%%r15,%q" #base ")" -#define MEMACCESS2(offset, base) "%%nacl:" #offset "(%%r15,%q" #base ")" -#define MEMLEA(offset, base) #offset "(%q" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%q" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%q" #base ",%q" #index "," #scale ")" -#define MEMMOVESTRING(s, d) "%%nacl:(%q" #s "),%%nacl:(%q" #d "), %%r15" -#define MEMSTORESTRING(reg, d) "%%" #reg ",%%nacl:(%q" #d "), %%r15" -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg "\n" \ - BUNDLEUNLOCK -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " %%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%" #arg "\n" \ - BUNDLEUNLOCK -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #opcode " (%%r15,%%r14),%%" #reg1 ",%%" #reg2 "\n" \ - BUNDLEUNLOCK -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - BUNDLELOCK \ - "lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \ - #op " $" #sel ",%%" #reg ",(%%r15,%%r14)\n" \ - BUNDLEUNLOCK -#else // defined(__native_client__) && defined(__x86_64__) -#define NACL_R14 -#define BUNDLEALIGN -#define MEMACCESS(base) "(%" #base ")" -#define MEMACCESS2(offset, base) #offset "(%" #base ")" -#define MEMLEA(offset, base) #offset "(%" #base ")" -#define MEMLEA3(offset, index, scale) \ - #offset "(,%" #index "," #scale ")" -#define MEMLEA4(offset, base, index, scale) \ - #offset "(%" #base ",%" #index "," 
#scale ")" -#define MEMMOVESTRING(s, d) -#define MEMSTORESTRING(reg, d) -#define MEMOPREG(opcode, offset, base, index, scale, reg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg "\n" -#define MEMOPMEM(opcode, reg, offset, base, index, scale) \ - #opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#define MEMOPARG(opcode, offset, base, index, scale, arg) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n" -#define VMEMOPREG(opcode, offset, base, index, scale, reg1, reg2) \ - #opcode " " #offset "(%" #base ",%" #index "," #scale "),%%" #reg1 ",%%" \ - #reg2 "\n" -#define VEXTOPMEM(op, sel, reg, offset, base, index, scale) \ - #op " $" #sel ",%%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n" -#endif // defined(__native_client__) && defined(__x86_64__) - -#if defined(__arm__) || defined(__aarch64__) -#undef MEMACCESS -#if defined(__native_client__) -#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" -#else -#define MEMACCESS(base) -#endif + +// Intel Code Analyzer markers. Insert IACA_START IACA_END around code to be +// measured and then run with iaca -64 libyuv_unittest. +// IACA_ASM_START and IACA_ASM_END are equivalents that can be used within +// inline assembly blocks. +// example of iaca: +// ~/iaca-lin64/bin/iaca.sh -64 -analysis LATENCY out/Release/libyuv_unittest + +#if defined(__x86_64__) || defined(__i386__) + +#define IACA_ASM_START \ + ".byte 0x0F, 0x0B\n" \ + " movl $111, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" + +#define IACA_ASM_END \ + " movl $222, %%ebx\n" \ + ".byte 0x64, 0x67, 0x90\n" \ + ".byte 0x0F, 0x0B\n" + +#define IACA_SSC_MARK(MARK_ID) \ + __asm__ __volatile__("\n\t movl $" #MARK_ID \ + ", %%ebx" \ + "\n\t .byte 0x64, 0x67, 0x90" \ + : \ + : \ + : "memory"); + +#define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); + +#else /* Visual C */ +#define IACA_UD_BYTES \ + { __asm _emit 0x0F __asm _emit 0x0B } + +#define IACA_SSC_MARK(x) \ + { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } + +#define IACA_VC64_START __writegsbyte(111, 111); +#define IACA_VC64_END __writegsbyte(222, 222); #endif -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +#define IACA_START \ + { \ + IACA_UD_BYTES \ + IACA_SSC_MARK(111) \ + } +#define IACA_END \ + { \ + IACA_SSC_MARK(222) \ + IACA_UD_BYTES \ + } + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, -
const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void I444ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const 
uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_SSSE3(const uint8* src_raw, uint8* dst_y, int width); -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width); -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_NEON(const uint8* 
src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width); -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width); -void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_C(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_C(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_C(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_C(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_C(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_C(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width); -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width); -void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_Any_SSSE3(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_Any_SSSE3(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_Any_SSSE3(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_Any_SSSE3(const uint8* src_raw, uint8* dst_y, int width); -void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); -void ARGBToYJRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int width); -void BGRAToYRow_Any_NEON(const uint8* src_bgra, uint8* dst_y, int width); -void ABGRToYRow_Any_NEON(const uint8* src_abgr, uint8* dst_y, int width); -void RGBAToYRow_Any_NEON(const uint8* src_rgba, uint8* dst_y, int width); -void RGB24ToYRow_Any_NEON(const uint8* src_rgb24, uint8* dst_y, int width); -void RAWToYRow_Any_NEON(const uint8* src_raw, uint8* dst_y, int width); -void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int width); -void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void 
ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width); +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width); +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width); +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width); +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width); +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void 
ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); +void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_y, int width); +void RAWToYRow_Any_SSSE3(const uint8_t* src_raw, uint8_t* dst_y, int width); +void ARGBToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, +void ARGB4444ToYRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYJRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGB1555ToYRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_AVX2(const uint8* src_argb, int src_stride_argb, - 
uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_AVX2(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_SSSE3(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_SSSE3(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_MSA(const uint8_t* 
src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVJRow_Any_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, - int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, - int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); 
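Note on the calling convention shared by the *ToUVRow functions above and below: each call reads `width` ARGB pixels from two adjacent rows (the source pointer and the source pointer plus the stride), box-averages each 2x2 block, and writes half-width U and V rows. A minimal caller sketch, assuming even width and height; ArgbToUVPlanes and its stride parameters are hypothetical names, not part of this patch, and ARGBToUVRow_C is the plain-C row function declared in this header:

/* Sketch only: drive ARGBToUVRow_C across a frame to build 2x2-subsampled
 * U and V planes. Assumes even dimensions; relies on the ARGBToUVRow_C
 * declaration from libyuv/row.h above. */
void ArgbToUVPlanes(const uint8_t* src_argb, int src_stride_argb,
                    uint8_t* dst_u, int dst_stride_u,
                    uint8_t* dst_v, int dst_stride_v,
                    int width, int height) {
  int y;
  for (y = 0; y < height; y += 2) {
    /* One call consumes two ARGB rows and emits one half-width U/V row. */
    ARGBToUVRow_C(src_argb, src_stride_argb, dst_u, dst_v, width);
    src_argb += 2 * src_stride_argb;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
}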
-void ARGBToUVJRow_C(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width); -void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width); -void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width); -void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width); -void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width); -void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width); -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width); -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width); -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width); - -void ARGBToUV444Row_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV444Row_Any_SSSE3(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width); - -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_NEON(const uint8* src, uint8* dst, int width); -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width); -void MirrorRow_C(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_SSSE3(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); -void MirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); - -void MirrorUVRow_SSSE3(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void BGRAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ABGRToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGBAToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB24ToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RAWToUVRow_C(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + +void 
ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUV444Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); - -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_AVX2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_SSE2(const uint8* src, uint8* dst, int width); -void ARGBMirrorRow_Any_NEON(const uint8* src, uint8* dst, int width); - -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width); -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBMirrorRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); -void SplitUVRow_Any_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int 
width); +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_Any_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_Any_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void SplitUVRow_Any_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width); +void SplitUVRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width); -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width); -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width); -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width); -void MergeUVRow_Any_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width); +void MergeUVRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void MergeUVRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); -void MergeUVRow_Any_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, int width); -void MergeUVRow_Any_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); +void SplitRGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width); + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); +void MergeRGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void MergeRGBRow_Any_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width); + +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, /* 64 for 10 bit */ + 
int width); +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width); + +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, int width); +void MultiplyRow_16_C(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width); -void CopyRow_SSE2(const uint8* src, uint8* dst, int count); -void CopyRow_AVX(const uint8* src, uint8* dst, int count); -void CopyRow_ERMS(const uint8* src, uint8* dst, int count); -void CopyRow_NEON(const uint8* src, uint8* dst, int count); -void CopyRow_MIPS(const uint8* src, uint8* dst, int count); -void CopyRow_C(const uint8* src, uint8* dst, int count); -void CopyRow_Any_SSE2(const uint8* src, uint8* dst, int count); -void CopyRow_Any_AVX(const uint8* src, uint8* dst, int count); -void CopyRow_Any_NEON(const uint8* src, uint8* dst, int count); +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width); +void Convert8To16Row_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); +void Convert8To16Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int scale, + int width); -void CopyRow_16_C(const uint16* src, uint16* dst, int count); +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width); +void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int width); +void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int width); -void ARGBCopyAlphaRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBCopyAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void CopyRow_MIPS(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count); +void CopyRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_AVX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void CopyRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); + +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count); + +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBCopyAlphaRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBCopyAlphaRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBCopyAlphaRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width); -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width); -void 
ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width);
-void ARGBExtractAlphaRow_Any_SSE2(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width);
+void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
+                              uint8_t* dst_a,
+                              int width);
+void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb,
+                             uint8_t* dst_a,
+                             int width);
+void ARGBExtractAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
-void ARGBExtractAlphaRow_Any_NEON(const uint8* src_argb, uint8* dst_a,
+void ARGBExtractAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
+void ARGBExtractAlphaRow_Any_NEON(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int width);
+void ARGBExtractAlphaRow_Any_MSA(const uint8_t* src_ptr,
+                                 uint8_t* dst_ptr,
+                                 int width);
-void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width);
-void ARGBCopyYToAlphaRow_Any_SSE2(const uint8* src_y, uint8* dst_argb,
+void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
+void ARGBCopyYToAlphaRow_Any_SSE2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
-void ARGBCopyYToAlphaRow_Any_AVX2(const uint8* src_y, uint8* dst_argb,
+void ARGBCopyYToAlphaRow_Any_AVX2(const uint8_t* src_ptr,
+                                  uint8_t* dst_ptr,
                                   int width);
-void SetRow_C(uint8* dst, uint8 v8, int count);
-void SetRow_X86(uint8* dst, uint8 v8, int count);
-void SetRow_ERMS(uint8* dst, uint8 v8, int count);
-void SetRow_NEON(uint8* dst, uint8 v8, int count);
-void SetRow_Any_X86(uint8* dst, uint8 v8, int count);
-void SetRow_Any_NEON(uint8* dst, uint8 v8, int count);
-
-void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count);
-void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count);
+void SetRow_C(uint8_t* dst, uint8_t v8, int width);
+void SetRow_MSA(uint8_t* dst, uint8_t v8, int width);
+void SetRow_X86(uint8_t* dst, uint8_t v8, int width);
+void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width);
+void SetRow_NEON(uint8_t* dst, uint8_t v8, int width);
+void SetRow_Any_X86(uint8_t* dst_ptr, uint8_t v32, int width);
+void SetRow_Any_NEON(uint8_t* dst_ptr, uint8_t v32, int width);
+
+void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width);
+void ARGBSetRow_Any_NEON(uint8_t* dst_ptr, uint32_t v32, int width);
+void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width);
+void ARGBSetRow_Any_MSA(uint8_t* dst_ptr, uint32_t v32, int width);

 // ARGBShufflers for BGRAToARGB etc.
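This hunk tracks upstream libyuv's migration from its private uint8/uint16/uint32 typedefs to the standard stdint.h types (uint8_t, uint16_t, uint32_t), with every declaration reflowed to one parameter per line; the same sync adds MIPS MSA and extra AVX2/AVX512VBMI row variants, new 10-bit paths (AR30, I210, and the 16-bit Merge/Convert rows, whose scale argument, 64 for 10-bit input, presumably shifts lsb-justified samples up to the full 16-bit range), and drops the old MIPS DSPR2 rows. The _Any_ declarations are width-agnostic wrappers over the SIMD rows: they process the bulk of the row at the vector step and handle the unaligned tail separately. For the ARGBShuffleRow family that follows, the shuffler argument is a byte-index table (a 16-byte pshufb-style mask for the SIMD rows; the reference row only needs the first four entries, one per output byte). A minimal sketch of that contract, with a hypothetical name, not the library's code:

#include <stdint.h>

/* For each 4-byte pixel, output byte k is input byte shuffler[k] of the
 * same pixel, so {3, 2, 1, 0, 7, 6, 5, 4, ...} reverses the channel
 * order of every pixel, converting BGRA <-> ARGB. */
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler,
                                  int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[4 * x + 0] = src_argb[4 * x + shuffler[0]];
    dst_argb[4 * x + 1] = src_argb[4 * x + shuffler[1]];
    dst_argb[4 * x + 2] = src_argb[4 * x + shuffler[2]];
    dst_argb[4 * x + 3] = src_argb[4 * x + shuffler[3]];
  }
}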
-void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); -void ARGBShuffleRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width); - -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width); - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int width); -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int width); -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width); -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width); -void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width); +void ARGBShuffleRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* 
dst_ptr, + const uint8_t* param, + int width); +void ARGBShuffleRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint8_t* param, + int width); + +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width); +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, uint8_t* dst_argb, int width); +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width); +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width); +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width); +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width); + +void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_Any_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width); +void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, +void RGB565ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB4444ToARGBRow_Any_SSE2(const uint8* 
src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, +void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, +void RGB24ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int width); -void RAWToRGB24Row_Any_NEON(const uint8* src_raw, uint8* dst_rgb24, int width); -void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, +void RGB24ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RAWToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RAWToRGB24Row_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB565ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGB1555ToARGBRow_Any_NEON(const uint8* src_argb1555, uint8* dst_argb, +void RGB565ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB1555ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGB4444ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGB4444ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width); +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width); + +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width); +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, int width); -void ARGB4444ToARGBRow_Any_NEON(const uint8* src_argb4444, uint8* dst_argb, + +void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width); +void ARGBToAR30Row_AVX2(const uint8_t* 
src, uint8_t* dst, int width); + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width); +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width); +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width); +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width); +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width); +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, int width); +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width); +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width); + +void ARGBToRGBARow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); + +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width); +void J400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void J400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); - -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); - -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); - -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_NEON(const uint8* 
src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); - -void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width); - -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); - -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_C(const uint8* src_y, - 
const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_C(const uint8* src_yuy2, - uint8* dst_argb, +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_C(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I444ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_SSSE3(const uint8* 
src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + +void I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB24Row_SSSE3(const uint8_t* 
src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, 
+ uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I444ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I444ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422ToAR30Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const 
uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void NV12ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToARGBRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV12ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_Any_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB565Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_Any_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct 
YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGBARow_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB4444Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB4444Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB1555Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToARGB1555Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGB565Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB24Row_Any_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* 
dst_argb, int width);
+void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width);
+void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width);
+void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width);
+void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr,
+                            uint8_t* dst_ptr,
+                            int width);
+void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);

 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
-                       uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
-                    uint8* dst_argb, int width);
+void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
+                        const uint8_t* src_argb1,
+                        uint8_t* dst_argb,
+                        int width);
+void ARGBBlendRow_NEON(const uint8_t* src_argb0,
+                       const uint8_t* src_argb1,
+                       uint8_t* dst_argb,
+                       int width);
+void ARGBBlendRow_MSA(const uint8_t* src_argb0,
+                      const uint8_t* src_argb1,
+                      uint8_t* dst_argb,
+                      int width);
+void ARGBBlendRow_C(const uint8_t* src_argb0,
+                    const uint8_t* src_argb1,
+                    uint8_t* dst_argb,
+                    int width);

 // Unattenuated planar alpha blend.
-void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1,
-                         const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_SSSE3(const uint8* src0, const uint8* src1,
-                             const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1,
-                        const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_Any_AVX2(const uint8* src0, const uint8* src1,
-                            const uint8* alpha, uint8* dst, int width);
-void BlendPlaneRow_C(const uint8* src0, const uint8* src1,
-                     const uint8* alpha, uint8* dst, int width);
+void BlendPlaneRow_SSSE3(const uint8_t* src0,
+                         const uint8_t* src1,
+                         const uint8_t* alpha,
+                         uint8_t* dst,
+                         int width);
+void BlendPlaneRow_Any_SSSE3(const uint8_t* y_buf,
+                             const uint8_t* u_buf,
+                             const uint8_t* v_buf,
+                             uint8_t* dst_ptr,
+                             int width);
+void BlendPlaneRow_AVX2(const uint8_t* src0,
+                        const uint8_t* src1,
+                        const uint8_t* alpha,
+                        uint8_t* dst,
+                        int width);
+void BlendPlaneRow_Any_AVX2(const uint8_t* y_buf,
+                            const uint8_t* u_buf,
+                            const uint8_t* v_buf,
+                            uint8_t* dst_ptr,
+                            int width);
+void BlendPlaneRow_C(const uint8_t* src0,
+                     const uint8_t* src1,
+                     const uint8_t* alpha,
+                     uint8_t* dst,
+                     int width);

 // ARGB multiply images. Same API as Blend, but these require
 // pointer and width alignment for SSE2.
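The multiply, add, and subtract rows that follow share the two-source, one-destination shape noted above, while the blend rows differ only in their math: ARGBBlendRow expects premultiplied alpha (roughly dst = fg + bg * (255 - fg_alpha) / 255), and BlendPlaneRow interpolates two single-channel planes through an explicit alpha plane (roughly dst = (src0 * a + src1 * (255 - a)) / 255). A sketch of the simplest pair, assuming byte-wise saturating semantics and hypothetical names (width is in pixels, 4 interleaved bytes per ARGB pixel):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Saturated per-byte add over one row of interleaved ARGB. */
static void ARGBAddRow_Sketch(const uint8_t* src_argb0,
                              const uint8_t* src_argb1,
                              uint8_t* dst_argb,
                              int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst_argb[i] = Clamp255(src_argb0[i] + src_argb1[i]);
  }
}

/* Saturated per-byte subtract, clamping at zero. */
static void ARGBSubtractRow_Sketch(const uint8_t* src_argb0,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb,
                                   int width) {
  int i;
  for (i = 0; i < width * 4; ++i) {
    dst_argb[i] = Clamp255(src_argb0[i] - src_argb1[i]);
  }
}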
-void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBMultiplyRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBMultiplyRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB add images. -void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBAddRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBAddRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); // ARGB subtract images. 
Same API as Blend, but these require // pointer and width alignment for SSE2. -void ARGBSubtractRow_C(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_AVX2(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); -void ARGBSubtractRow_Any_NEON(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width); - -void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width); +void ARGBSubtractRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void ARGBToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); - -void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); -void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); - -void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, +void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB24Row_Any_AVX512VBMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* 
src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); + +void ARGBToRGB565Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); +void ABGRToAR30Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR30Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); -void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, +void ARGBToRGB24Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, +void ARGBToARGB4444Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width); - -void I444ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void ARGBToRGB565DitherRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); +void ARGBToRGB24Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRAWRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToRGB565Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB1555Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToARGB4444Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToRGB565DitherRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const uint32_t param, + int width); + +void I444ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422AlphaToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I411ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width); -void I422ToRGBARow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGBARow_Any_NEON(const uint8_t* y_buf, + 
const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB24Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB4444Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB4444Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGB1555Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGB1555Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToRGB565Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToRGB565Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void NV12ToRGB565Row_Any_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void YUY2ToARGBRow_Any_NEON(const uint8* src_yuy2, - uint8* dst_argb, +void YUY2ToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void UYVYToARGBRow_Any_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); -void I422ToARGBRow_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, +void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGBARow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422AlphaToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_ptr, + const struct 
YuvConstants* yuvconstants, + int width); +void I422ToRGB24Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToRGB565Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB4444Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I422ToARGB1555Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV12ToRGB565Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void NV21ToARGBRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void YUY2ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void UYVYToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); + +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width); -void I422ToARGBRow_DSPR2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width); +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width); +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* 
dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void YUY2ToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void YUY2ToUV422Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_Any_AVX2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_Any_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int 
width); -void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int width); -void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); - -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_Any_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int width); -void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width); -void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width); - -void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_Any_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_Any_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* 
dst_yuy2, int width); -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); -void I422ToYUY2Row_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width); -void I422ToUYVYRow_Any_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width); +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width); +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_SSE2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_NEON(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToYRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void UYVYToUVRow_Any_MSA(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void UYVYToUV422Row_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); + +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width); +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width); +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + 
const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width); +void I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width); +void I422ToYUY2Row_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); +void I422ToUYVYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ptr, + int width); // Effects related row functions. -void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, - int width); -void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBAttenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, +void ARGBAttenuateRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); +void ARGBAttenuateRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); // Inverse table for unattenuate, shared by C and SSE2. 
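Context for the attenuate/unattenuate kernels being retyped above and below: attenuation premultiplies each RGB channel by its alpha, and unattenuation reverses it via the fixed_invtbl8 reciprocal table so the row loop avoids a per-pixel divide. A minimal sketch of the attenuate idea in C, assuming the shipped ARGBAttenuateRow_C differs in rounding details:

#include <stdint.h>

/* Sketch only: premultiply RGB by alpha, one ARGB pixel per 4 bytes
   (alpha at byte offset 3). Not the exact libyuv rounding. */
void AttenuateRowSketch(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t a = src_argb[4 * i + 3];
    dst_argb[4 * i + 0] = (uint8_t)((src_argb[4 * i + 0] * a) / 255);
    dst_argb[4 * i + 1] = (uint8_t)((src_argb[4 * i + 1] * a) / 255);
    dst_argb[4 * i + 2] = (uint8_t)((src_argb[4 * i + 2] * a) / 255);
    dst_argb[4 * i + 3] = (uint8_t)a;
  }
}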
-extern const uint32 fixed_invtbl8[256]; -void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBUnattenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, +extern const uint32_t fixed_invtbl8[256]; +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width); +void ARGBUnattenuateRow_Any_SSE2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBUnattenuateRow_Any_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, int width); -void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); - -void ARGBSepiaRow_C(uint8* dst_argb, int width); -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width); -void ARGBSepiaRow_NEON(uint8* dst_argb, int width); - -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width); - -void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); - -void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width); -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width); - -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width); - -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value); +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width); +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width); + +void ARGBSepiaRow_C(uint8_t* dst_argb, int width); +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width); +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width); + +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width); +void ARGBColorMatrixRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + 
int width); + +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); + +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width); + +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width); + +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value); // Used for blur. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); -void CumulativeSumToAverageRow_C(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count); -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width); +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count); +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width); LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width); LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width); +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width); // Used for I420Scale, ARGBScale, and ARGBInterpolate. 
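The InterpolateRow kernels that follow blend a row with the row one stride below it, weighted by source_y_fraction in 1/256 steps; this is the shared building block for I420Scale, ARGBScale, and ARGBInterpolate. A hedged sketch of the contract (the shipped kernels add rounding and fast paths for fractions 0 and 128):

#include <stdint.h>
#include <stddef.h>

/* Sketch only: dst = lerp(src row, src row + stride) by frac/256. */
void InterpolateRowSketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                          ptrdiff_t src_stride, int width,
                          int source_y_fraction) {
  const uint8_t* src1 = src_ptr + src_stride;
  for (int x = 0; x < width; ++x) {
    dst_ptr[x] = (uint8_t)((src_ptr[x] * (256 - source_y_fraction) +
                            src1[x] * source_y_fraction) >> 8);
  }
}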
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction); -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); -void InterpolateRow_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, +void InterpolateRow_Any_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, int source_y_fraction); -void InterpolateRow_Any_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride_ptr, int width, - int source_y_fraction); - -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride_ptr, - int width, int source_y_fraction); +void InterpolateRow_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride_ptr, + int width, + int source_y_fraction); + +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); // Sobel images. 
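The Sobel rows below take three consecutive luma rows (for X) or two (for Y) and emit per-pixel gradient magnitudes, which SobelRow/SobelXYRow then pack into grey or per-channel ARGB. A sketch of the X tap under the classic [-1 0 1; -2 0 2; -1 0 1] operator; operand ordering in the shipped SobelXRow_C may differ:

#include <stdint.h>
#include <stdlib.h>

/* Sketch only: horizontal Sobel gradient over rows y0 (above), y1, y2
   (below), clamped to 255. */
void SobelXRowSketch(const uint8_t* src_y0, const uint8_t* src_y1,
                     const uint8_t* src_y2, uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int g = (src_y0[i + 2] - src_y0[i]) + 2 * (src_y1[i + 2] - src_y1[i]) +
            (src_y2[i + 2] - src_y2[i]);
    g = abs(g);
    dst_sobelx[i] = (uint8_t)(g > 255 ? 255 : g);
  }
}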
-void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width); -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width); -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width); -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width); -void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); -void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width); - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width); -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width); -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width); - -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width); +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void 
SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width); +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width); +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width); +void SobelRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelToPlaneRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_SSE2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_NEON(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); +void SobelXYRow_Any_MSA(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_ptr, + int width); + +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width); + +// Scale and convert to half float. 
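The HalfFloatRow family introduced below scales 16-bit samples and stores them as IEEE binary16. One branch-free way to get there, which I believe matches the spirit of the C path: multiply by scale times 2^-112 so the binary32 exponent lands in half range, then take bits 28..13 of the float pattern. Sketch, with no NaN/overflow handling:

#include <stdint.h>
#include <string.h>

/* Sketch only: after the 2^-112 rebias, (float bits >> 13) truncated to 16
   bits is the binary16 pattern for values inside half's normal range. */
void HalfFloatRowSketch(const uint16_t* src, uint16_t* dst, float scale,
                        int width) {
  const float kRebias = 1.9259299444e-34f; /* 2^-112 */
  float mult = scale * kRebias;
  for (int i = 0; i < width; ++i) {
    float v = (float)src[i] * mult;
    uint32_t bits;
    memcpy(&bits, &v, sizeof bits); /* dodge strict-aliasing UB */
    dst[i] = (uint16_t)(bits >> 13);
  }
}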
+void HalfFloatRow_C(const uint16_t* src, uint16_t* dst, float scale, int width); +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_SSE2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_AVX2(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloat1Row_Any_NEON(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width); +void HalfFloatRow_Any_MSA(const uint16_t* src_ptr, + uint16_t* dst_ptr, + float param, + int width); +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width); +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width); +void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr, + float* dst_ptr, + float param, + int width); + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, uint32 lumacoeff); + const uint8_t* luma, + uint32_t lumacoeff); + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width); +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width); +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width); +float ScaleSumSamples_NEON(const float* src, + float* dst, + float scale, + int width); +void ScaleSamples_C(const float* src, float* dst, float scale, int width); +void ScaleSamples_NEON(const float* src, float* dst, float scale, int width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_ROW_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale.h index 102158d1ab2..b937d348cab 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/scale.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_H_ #define INCLUDE_LIBYUV_SCALE_H_ #include "libyuv/basic_types.h" @@ -20,25 +20,33 @@ extern "C" { // Supported filtering. typedef enum FilterMode { - kFilterNone = 0, // Point sample; Fastest. - kFilterLinear = 1, // Filter horizontally only. + kFilterNone = 0, // Point sample; Fastest. + kFilterLinear = 1, // Filter horizontally only. kFilterBilinear = 2, // Faster than box, but lower quality scaling down. - kFilterBox = 3 // Highest quality. 
+ kFilterBox = 3 // Highest quality. } FilterModeEnum; // Scale a YUV plane. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering); LIBYUV_API -void ScalePlane_16(const uint16* src, int src_stride, - int src_width, int src_height, - uint16* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering); // Scales a YUV 4:2:0 image from the src width and height to the @@ -52,44 +60,64 @@ void ScalePlane_16(const uint16* src, int src_stride, // Returns 0 if successful. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering); LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int src_height, - uint16* dst_y, int dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering); #ifdef __cplusplus // Legacy API. Deprecated. LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate); -// Legacy API. Deprecated. -LIBYUV_API -int ScaleOffset(const uint8* src_i420, int src_width, int src_height, - uint8* dst_i420, int dst_width, int dst_height, int dst_yoffset, - LIBYUV_BOOL interpolate); - // For testing, allow disabling of specialized scalers. 
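For reference, the retyped I420Scale reads naturally at call sites; a usage sketch with hypothetical buffer names and tight strides (width for Y, width/2 for the half-resolution U/V planes), halving 640x480 to 320x240:

#include <stdint.h>
#include "libyuv/scale.h"

/* Returns 0 on success, per the API contract above. */
int HalveI420(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v,
              uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v) {
  return I420Scale(src_y, 640, src_u, 320, src_v, 320,
                   640, 480,
                   dst_y, 320, dst_u, 160, dst_v, 160,
                   320, 240, kFilterBilinear);
}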
LIBYUV_API void SetUseReferenceImpl(LIBYUV_BOOL use); @@ -100,4 +128,4 @@ void SetUseReferenceImpl(LIBYUV_BOOL use); } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h index b56cf520993..7641f18e341 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale_argb.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ #define INCLUDE_LIBYUV_SCALE_ARGB_H_ #include "libyuv/basic_types.h" @@ -20,32 +20,52 @@ extern "C" { #endif LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering); // Clipped scale takes destination rectangle coordinates for clip values. LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering); // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering); #ifdef __cplusplus @@ -53,4 +73,4 @@ int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h b/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h index df699e6c228..7194ba09f84 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/scale_row.h @@ -8,7 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ #define INCLUDE_LIBYUV_SCALE_ROW_H_ #include "libyuv/basic_types.h" @@ -19,17 +19,20 @@ namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || \ - (defined(__i386__) && !defined(__SSE2__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__native_client__) && defined(__x86_64__)) || \ + (defined(__i386__) && !defined(__SSE__) && !defined(__clang__)) #define LIBYUV_DISABLE_X86 #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // MemorySanitizer does not support assembly code yet. http://crbug.com/344505 #if defined(__has_feature) #if __has_feature(memory_sanitizer) #define LIBYUV_DISABLE_X86 #endif #endif - // GCC >= 4.7.0 required for AVX2. #if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) #if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7)) @@ -45,8 +48,8 @@ extern "C" { #endif // __clang__ // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && !defined(__clang__) && \ - defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && defined(_MSC_VER) && \ + _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 @@ -72,15 +75,16 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. // The code supports NaCL but requires a new compiler and validator. -#if !defined(LIBYUV_DISABLE_X86) && (defined(VISUALC_HAS_AVX2) || \ - defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ + defined(GCC_HAS_AVX2)) #define HAS_SCALEADDROW_AVX2 #define HAS_SCALEROWDOWN2_AVX2 #define HAS_SCALEROWDOWN4_AVX2 #endif // The following are available on Neon platforms: -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ +#if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SCALEARGBCOLS_NEON #define HAS_SCALEARGBROWDOWN2_NEON @@ -93,33 +97,51 @@ extern "C" { #define HAS_SCALEARGBFILTERCOLS_NEON #endif -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_SCALEROWDOWN2_DSPR2 -#define HAS_SCALEROWDOWN4_DSPR2 -#define HAS_SCALEROWDOWN34_DSPR2 -#define HAS_SCALEROWDOWN38_DSPR2 +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#define HAS_SCALEADDROW_MSA +#define HAS_SCALEARGBCOLS_MSA +#define HAS_SCALEARGBFILTERCOLS_MSA +#define HAS_SCALEARGBROWDOWN2_MSA +#define HAS_SCALEARGBROWDOWNEVEN_MSA +#define HAS_SCALEFILTERCOLS_MSA +#define HAS_SCALEROWDOWN2_MSA +#define HAS_SCALEROWDOWN34_MSA +#define HAS_SCALEROWDOWN38_MSA +#define HAS_SCALEROWDOWN4_MSA #endif // Scale ARGB vertically with bilinear interpolation. 
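A note on the HAS_* blocks just above (DSPR2 gone, MSA in): they only advertise that a specialized row exists for the build target; the scaler still picks between the C and SIMD rows at runtime. A rough sketch of that dispatch shape, assuming the usual TestCpuFlag check from libyuv/cpu_id.h and eliding the alignment/width validation the real code does:

#include <stddef.h>
#include "libyuv/cpu_id.h"
#include "libyuv/scale_row.h"

/* Sketch only: choose a RowDown2 kernel once, then run it. */
static void ScaleRowDown2Dispatch(const uint8_t* src, ptrdiff_t stride,
                                  uint8_t* dst, int dst_width) {
  void (*row)(const uint8_t*, ptrdiff_t, uint8_t*, int) = ScaleRowDown2_C;
#if defined(HAS_SCALEROWDOWN2_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    row = ScaleRowDown2_Any_SSSE3;
  }
#endif
  row(src, stride, dst, dst_width);
}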
void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering); void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering); + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering); // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering); // Divide num by div and return as 16.16 fixed point result. @@ -137,367 +159,786 @@ int FixedDiv1_X86(int num, int div); #endif // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy); - -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width); -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int, int); -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int 
dst_width, int, int); -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx); -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width); -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, + int* x, + int* y, + int* dx, + int* dy); + +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width); +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); -void ScaleARGBRowDown2_C(const uint8* src_argb, + uint16_t* d, + int dst_width); +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int, + int); +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int, + int); +void ScaleFilterCols_C(uint8_t* 
dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx); +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_C(const uint8* src_argb, + uint16_t* dst, + int dst_width); +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width); +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width); +void ScaleARGBRowDown2_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int, int); -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); + uint8_t* dst_argb, + int dst_width); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int, + int); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx); // Specialized scalers for x86. 
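Before the x86 specializations: every RowDown2 "box" variant in this family implements the same contract, a rounded 2x2 average of two source rows. Minimal C sketch (edge and odd-width handling omitted; the _Odd_ variants above exist precisely for that case):

#include <stdint.h>
#include <stddef.h>

/* Sketch only: dst[x] = rounded mean of the 2x2 block at column 2x. */
void RowDown2BoxSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                       uint8_t* dst, int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = (uint8_t)((s[2 * x] + s[2 * x + 1] +
                        t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}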
-void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void 
ScaleRowDown2Linear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown2Box_Odd_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + 
uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); - -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); - + uint8_t* dst_ptr, + int dst_width); + +void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleAddRow_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); + +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Column functions -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); -void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx); +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx); +void ScaleARGBFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleARGBCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); // ARGB Row functions -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - 
uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + 
uint8_t* dst_ptr, + int dst_width); + +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width); +void ScaleARGBRowDownEven_Any_SSE2(const uint8_t* src_ptr, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEven_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_ptr, + int dst_width); +void ScaleARGBRowDownEvenBox_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_ptr, + int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. // Note - not static due to reuse in convert for 444 to 420. 
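Every Box-suffixed kernel declared above computes the same reduction: each destination pixel is the rounded average of a 2x2 source block spanning two rows. A plain-C sketch of that reduction (hypothetical helper name; the SIMD variants process 16 or 32 pixels per iteration but compute the same averages):

#include <stdint.h>
#include <stddef.h>

/* Illustrative 2x2 box downscale in the style of ScaleRowDown2Box_C. */
static void RowDown2BoxSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                              uint8_t* dst, int dst_width) {
  const uint8_t* next = src_ptr + src_stride; /* second source row */
  int x;
  for (x = 0; x < dst_width; ++x) {
    /* Average four source pixels with rounding (+2 before >> 2). */
    dst[x] = (uint8_t)((src_ptr[0] + src_ptr[1] + next[0] + next[1] + 2) >> 2);
    src_ptr += 2;
    next += 2;
  }
}

The NEON declarations that follow cover these same downscale reductions.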
-void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); - -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); + +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleRowDown2_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Odd_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_Odd_NEON(const uint8* src_ptr, ptrdiff_t 
src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32 -> 12 -void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - -void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); -void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); - -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); - -void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx); - -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width); -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); + +void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); + +void ScaleFilterCols_Any_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); + +void 
ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width); +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width); +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width); + +void ScaleRowDown2_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Linear_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown2Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown4Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_2_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown38_3_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleAddRow_Any_MSA(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width); +void ScaleFilterCols_Any_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx); +void ScaleRowDown34_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_0_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); +void ScaleRowDown34_1_Box_Any_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ NOLINT +#endif // INCLUDE_LIBYUV_SCALE_ROW_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/version.h b/libs/libvpx/third_party/libyuv/include/libyuv/version.h index 0fbdc022d57..7022785d8ca 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/version.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/version.h @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1616 +#define LIBYUV_VERSION 1711 -#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT +#endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h b/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h index ad934e42419..bcef378b5a4 100644 --- a/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h +++ b/libs/libvpx/third_party/libyuv/include/libyuv/video_common.h @@ -10,7 +10,7 @@ // Common definitions for video, including fourcc and VideoFormat. -#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT +#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ #define INCLUDE_LIBYUV_VIDEO_COMMON_H_ #include "libyuv/basic_types.h" @@ -28,13 +28,13 @@ extern "C" { // Needs to be a macro otherwise the OS X compiler complains when the kFormat* // constants are used in a switch. #ifdef __cplusplus -#define FOURCC(a, b, c, d) ( \ - (static_cast(a)) | (static_cast(b) << 8) | \ - (static_cast(c) << 16) | (static_cast(d) << 24)) +#define FOURCC(a, b, c, d) \ + ((static_cast(a)) | (static_cast(b) << 8) | \ + (static_cast(c) << 16) | (static_cast(d) << 24)) #else -#define FOURCC(a, b, c, d) ( \ - ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \ - ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */ +#define FOURCC(a, b, c, d) \ + (((uint32_t)(a)) | ((uint32_t)(b) << 8) | /* NOLINT */ \ + ((uint32_t)(c) << 16) | ((uint32_t)(d) << 24)) /* NOLINT */ #endif // Some pages discussing FourCC codes: @@ -53,38 +53,33 @@ enum FourCC { FOURCC_I420 = FOURCC('I', '4', '2', '0'), FOURCC_I422 = FOURCC('I', '4', '2', '2'), FOURCC_I444 = FOURCC('I', '4', '4', '4'), - FOURCC_I411 = FOURCC('I', '4', '1', '1'), FOURCC_I400 = FOURCC('I', '4', '0', '0'), FOURCC_NV21 = FOURCC('N', 'V', '2', '1'), FOURCC_NV12 = FOURCC('N', 'V', '1', '2'), FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'), FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'), + FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb - // 2 Secondary YUV formats: row biplanar. + // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated. - // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp. + // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), + FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. + FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), - FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), + FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE. FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE. FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE. - // 4 Secondary RGB formats: 4 Bayer Patterns. deprecated. - FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), - FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), - FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), - FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), - // 1 Primary Compressed YUV format. FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'), - // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. + // 7 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias. 
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'), FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), @@ -112,7 +107,13 @@ enum FourCC { FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP. FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO. - // 1 Auxiliary compressed YUV format set aside for capturer. + // deprecated formats. Not supported, but defined for backward compatibility. + FOURCC_I411 = FOURCC('I', '4', '1', '1'), + FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), + FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'), + FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'), + FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'), + FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'), FOURCC_H264 = FOURCC('H', '2', '6', '4'), // Match any fourcc. @@ -136,8 +137,10 @@ enum FourCCBpp { FOURCC_BPP_BGRA = 32, FOURCC_BPP_ABGR = 32, FOURCC_BPP_RGBA = 32, + FOURCC_BPP_AR30 = 32, + FOURCC_BPP_AB30 = 32, FOURCC_BPP_24BG = 24, - FOURCC_BPP_RAW = 24, + FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, FOURCC_BPP_RGBO = 16, FOURCC_BPP_R444 = 16, @@ -152,6 +155,7 @@ enum FourCCBpp { FOURCC_BPP_J420 = 12, FOURCC_BPP_J400 = 8, FOURCC_BPP_H420 = 12, + FOURCC_BPP_H010 = 24, FOURCC_BPP_MJPG = 0, // 0 means unknown. FOURCC_BPP_H264 = 0, FOURCC_BPP_IYUV = 12, @@ -170,15 +174,15 @@ enum FourCCBpp { FOURCC_BPP_CM24 = 24, // Match any fourcc. - FOURCC_BPP_ANY = 0, // 0 means unknown. + FOURCC_BPP_ANY = 0, // 0 means unknown. }; // Converts fourcc aliases into canonical ones. -LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc); +LIBYUV_API uint32_t CanonicalFourCC(uint32_t fourcc); #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif -#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT +#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ diff --git a/libs/libvpx/third_party/libyuv/source/compare.cc b/libs/libvpx/third_party/libyuv/source/compare.cc index e3846bdfdd1..50e3abd0556 100644 --- a/libs/libvpx/third_party/libyuv/source/compare.cc +++ b/libs/libvpx/third_party/libyuv/source/compare.cc @@ -29,10 +29,10 @@ extern "C" { // hash seed of 5381 recommended. LIBYUV_API -uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { +uint32_t HashDjb2(const uint8_t* src, uint64_t count, uint32_t seed) { const int kBlockSize = 1 << 15; // 32768; int remainder; - uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = + uint32_t (*HashDjb2_SSE)(const uint8_t* src, int count, uint32_t seed) = HashDjb2_C; #if defined(HAS_HASHDJB2_SSE41) if (TestCpuFlag(kCpuHasSSE41)) { @@ -45,25 +45,25 @@ uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) { } #endif - while (count >= (uint64)(kBlockSize)) { + while (count >= (uint64_t)(kBlockSize)) { seed = HashDjb2_SSE(src, kBlockSize, seed); src += kBlockSize; count -= kBlockSize; } - remainder = (int)(count) & ~15; + remainder = (int)count & ~15; if (remainder) { seed = HashDjb2_SSE(src, remainder, seed); src += remainder; count -= remainder; } - remainder = (int)(count) & 15; + remainder = (int)count & 15; if (remainder) { seed = HashDjb2_C(src, remainder, seed); } return seed; } -static uint32 ARGBDetectRow_C(const uint8* argb, int width) { +static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { int x; for (x = 0; x < width - 1; x += 2) { if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. @@ -94,8 +94,11 @@ static uint32 ARGBDetectRow_C(const uint8* argb, int width) { // Scan an opaque argb image and return fourcc based on alpha offset. // Returns FOURCC_ARGB, FOURCC_BGRA, or 0 if unknown. 
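The detection relies on the image being opaque: alpha is then the only channel guaranteed to read 255 in every pixel, so whichever byte offset stays at 255 across a row reveals the channel order (the row scanner above abandons the "ARGB" hypothesis as soon as byte 0 is not 255). A hypothetical standalone version of the same probe, for illustration only:

#include <stdint.h>

/* Illustrative alpha-offset probe for one row of 32-bit pixels.
 * Returns the byte offset of the alpha channel, or -1 if ambiguous. */
static int DetectAlphaOffsetSketch(const uint8_t* px, int width) {
  int x, alpha_first = 1, alpha_last = 1;
  for (x = 0; x < width; ++x) {
    if (px[4 * x + 0] != 255) alpha_first = 0; /* byte 0 is not alpha */
    if (px[4 * x + 3] != 255) alpha_last = 0;  /* byte 3 is not alpha */
  }
  if (alpha_first && !alpha_last) return 0;
  if (alpha_last && !alpha_first) return 3;
  return -1; /* e.g. fully white pixels leave both candidates standing */
}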
LIBYUV_API -uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { - uint32 fourcc = 0; +uint32_t ARGBDetect(const uint8_t* argb, + int stride_argb, + int width, + int height) { + uint32_t fourcc = 0; int h; // Coalesce rows. @@ -111,19 +114,80 @@ uint32 ARGBDetect(const uint8* argb, int stride_argb, int width, int height) { return fourcc; } +// NEON version accumulates in 16 bit shorts which overflow at 65536 bytes. +// So actual maximum is 1 less loop, which is 64436 - 32 bytes. + +LIBYUV_API +uint64_t ComputeHammingDistance(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + const int kBlockSize = 1 << 15; // 32768; + const int kSimdSize = 64; + // SIMD for multiple of 64, and C for remainder + int remainder = count & (kBlockSize - 1) & ~(kSimdSize - 1); + uint64_t diff = 0; + int i; + uint32_t (*HammingDistance)(const uint8_t* src_a, const uint8_t* src_b, + int count) = HammingDistance_C; +#if defined(HAS_HAMMINGDISTANCE_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HammingDistance = HammingDistance_NEON; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + HammingDistance = HammingDistance_SSSE3; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_SSE42) + if (TestCpuFlag(kCpuHasSSE42)) { + HammingDistance = HammingDistance_SSE42; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HammingDistance = HammingDistance_AVX2; + } +#endif +#if defined(HAS_HAMMINGDISTANCE_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HammingDistance = HammingDistance_MSA; + } +#endif +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : diff) +#endif + for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { + diff += HammingDistance(src_a + i, src_b + i, kBlockSize); + } + src_a += count & ~(kBlockSize - 1); + src_b += count & ~(kBlockSize - 1); + if (remainder) { + diff += HammingDistance(src_a, src_b, remainder); + src_a += remainder; + src_b += remainder; + } + remainder = count & (kSimdSize - 1); + if (remainder) { + diff += HammingDistance_C(src_a, src_b, remainder); + } + return diff; +} + // TODO(fbarchard): Refactor into row function. LIBYUV_API -uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, - int count) { +uint64_t ComputeSumSquareError(const uint8_t* src_a, + const uint8_t* src_b, + int count) { // SumSquareError returns values 0 to 65535 for each squared difference. - // Up to 65536 of those can be summed and remain within a uint32. - // After each block of 65536 pixels, accumulate into a uint64. + // Up to 65536 of those can be summed and remain within a uint32_t. + // After each block of 65536 pixels, accumulate into a uint64_t. 
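// Worked bound for the comment above: the largest per-pixel squared
// difference is 255 * 255 = 65025, and 65536 of those sum to at most
// 65536 * 65025 = 4,261,478,400, which still fits in a uint32_t
// (maximum 4,294,967,295); hence the 65536-pixel block size below.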
const int kBlockSize = 65536; int remainder = count & (kBlockSize - 1) & ~31; - uint64 sse = 0; + uint64_t sse = 0; int i; - uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = - SumSquareError_C; + uint32_t (*SumSquareError)(const uint8_t* src_a, const uint8_t* src_b, + int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) if (TestCpuFlag(kCpuHasNEON)) { SumSquareError = SumSquareError_NEON; @@ -141,8 +205,13 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, SumSquareError = SumSquareError_AVX2; } #endif +#if defined(HAS_SUMSQUAREERROR_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SumSquareError = SumSquareError_MSA; + } +#endif #ifdef _OPENMP -#pragma omp parallel for reduction(+: sse) +#pragma omp parallel for reduction(+ : sse) #endif for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) { sse += SumSquareError(src_a + i, src_b + i, kBlockSize); @@ -162,14 +231,16 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b, } LIBYUV_API -uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { - uint64 sse = 0; +uint64_t ComputeSumSquareErrorPlane(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + uint64_t sse = 0; int h; // Coalesce rows. - if (stride_a == width && - stride_b == width) { + if (stride_a == width && stride_b == width) { width *= height; height = 1; stride_a = stride_b = 0; @@ -183,66 +254,76 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, } LIBYUV_API -double SumSquareErrorToPsnr(uint64 sse, uint64 count) { +double SumSquareErrorToPsnr(uint64_t sse, uint64_t count) { double psnr; if (sse > 0) { - double mse = (double)(count) / (double)(sse); + double mse = (double)count / (double)sse; psnr = 10.0 * log10(255.0 * 255.0 * mse); } else { - psnr = kMaxPsnr; // Limit to prevent divide by 0 + psnr = kMaxPsnr; // Limit to prevent divide by 0 } - if (psnr > kMaxPsnr) + if (psnr > kMaxPsnr) { psnr = kMaxPsnr; + } return psnr; } LIBYUV_API -double CalcFramePsnr(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { - const uint64 samples = width * height; - const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a, - src_b, stride_b, - width, height); +double CalcFramePsnr(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { + const uint64_t samples = (uint64_t)width * (uint64_t)height; + const uint64_t sse = ComputeSumSquareErrorPlane(src_a, stride_a, src_b, + stride_b, width, height); return SumSquareErrorToPsnr(sse, samples); } LIBYUV_API -double I420Psnr(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a, - src_y_b, stride_y_b, - width, height); +double I420Psnr(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const uint64_t sse_y = ComputeSumSquareErrorPlane( + src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int 
width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a, - src_u_b, stride_u_b, - width_uv, height_uv); - const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a, - src_v_b, stride_v_b, - width_uv, height_uv); - const uint64 samples = width * height + 2 * (width_uv * height_uv); - const uint64 sse = sse_y + sse_u + sse_v; + const uint64_t sse_u = ComputeSumSquareErrorPlane( + src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); + const uint64_t sse_v = ComputeSumSquareErrorPlane( + src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); + const uint64_t samples = (uint64_t)width * (uint64_t)height + + 2 * ((uint64_t)width_uv * (uint64_t)height_uv); + const uint64_t sse = sse_y + sse_u + sse_v; return SumSquareErrorToPsnr(sse, samples); } -static const int64 cc1 = 26634; // (64^2*(.01*255)^2 -static const int64 cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double Ssim8x8_C(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) { - int64 sum_a = 0; - int64 sum_b = 0; - int64 sum_sq_a = 0; - int64 sum_sq_b = 0; - int64 sum_axb = 0; +static double Ssim8x8_C(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b) { + int64_t sum_a = 0; + int64_t sum_b = 0; + int64_t sum_sq_a = 0; + int64_t sum_sq_b = 0; + int64_t sum_axb = 0; int i; for (i = 0; i < 8; ++i) { @@ -260,22 +341,22 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, } { - const int64 count = 64; + const int64_t count = 64; // scale the constants by number of pixels - const int64 c1 = (cc1 * count * count) >> 12; - const int64 c2 = (cc2 * count * count) >> 12; + const int64_t c1 = (cc1 * count * count) >> 12; + const int64_t c2 = (cc2 * count * count) >> 12; - const int64 sum_a_x_sum_b = sum_a * sum_b; + const int64_t sum_a_x_sum_b = sum_a * sum_b; - const int64 ssim_n = (2 * sum_a_x_sum_b + c1) * - (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); + const int64_t ssim_n = (2 * sum_a_x_sum_b + c1) * + (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2); - const int64 sum_a_sq = sum_a*sum_a; - const int64 sum_b_sq = sum_b*sum_b; + const int64_t sum_a_sq = sum_a * sum_a; + const int64_t sum_b_sq = sum_b * sum_b; - const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) * - (count * sum_sq_a - sum_a_sq + - count * sum_sq_b - sum_b_sq + c2); + const int64_t ssim_d = + (sum_a_sq + sum_b_sq + c1) * + (count * sum_sq_a - sum_a_sq + count * sum_sq_b - sum_b_sq + c2); if (ssim_d == 0.0) { return DBL_MAX; @@ -288,13 +369,16 @@ static double Ssim8x8_C(const uint8* src_a, int stride_a, // on the 4x4 pixel grid. Such arrangement allows the windows to overlap // block boundaries to penalize blocking artifacts. 
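For reference, Ssim8x8_C above is the textbook SSIM ratio with means and (co)variances multiplied through by the window size n = 64 so it can stay in integer arithmetic: ssim_n / ssim_d = ((2*sum_a*sum_b + c1) * (2*n*sum_axb - 2*sum_a*sum_b + c2)) / ((sum_a^2 + sum_b^2 + c1) * (n*sum_sq_a - sum_a^2 + n*sum_sq_b - sum_b^2 + c2)). The constants check out against their comments: 64^2 * (0.01*255)^2 = 4096 * 6.5025 = 26634.24, truncated to cc1 = 26634, and 64^2 * (0.03*255)^2 = 4096 * 58.5225 = 239708.16, truncated to cc2 = 239708; with count = 64 the expression (cc * count * count) >> 12 multiplies by 4096 and shifts it straight back out, so c1 and c2 are simply cc1 and cc2.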
LIBYUV_API -double CalcFrameSsim(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b, - int width, int height) { +double CalcFrameSsim(const uint8_t* src_a, + int stride_a, + const uint8_t* src_b, + int stride_b, + int width, + int height) { int samples = 0; double ssim_total = 0; - double (*Ssim8x8)(const uint8* src_a, int stride_a, - const uint8* src_b, int stride_b) = Ssim8x8_C; + double (*Ssim8x8)(const uint8_t* src_a, int stride_a, const uint8_t* src_b, + int stride_b) = Ssim8x8_C; // sample point start with each 4x4 location int i; @@ -314,22 +398,27 @@ double CalcFrameSsim(const uint8* src_a, int stride_a, } LIBYUV_API -double I420Ssim(const uint8* src_y_a, int stride_y_a, - const uint8* src_u_a, int stride_u_a, - const uint8* src_v_a, int stride_v_a, - const uint8* src_y_b, int stride_y_b, - const uint8* src_u_b, int stride_u_b, - const uint8* src_v_b, int stride_v_b, - int width, int height) { - const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a, - src_y_b, stride_y_b, width, height); +double I420Ssim(const uint8_t* src_y_a, + int stride_y_a, + const uint8_t* src_u_a, + int stride_u_a, + const uint8_t* src_v_a, + int stride_v_a, + const uint8_t* src_y_b, + int stride_y_b, + const uint8_t* src_u_b, + int stride_u_b, + const uint8_t* src_v_b, + int stride_v_b, + int width, + int height) { + const double ssim_y = + CalcFrameSsim(src_y_a, stride_y_a, src_y_b, stride_y_b, width, height); const int width_uv = (width + 1) >> 1; const int height_uv = (height + 1) >> 1; - const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, - src_u_b, stride_u_b, + const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a, src_u_b, stride_u_b, width_uv, height_uv); - const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, - src_v_b, stride_v_b, + const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a, src_v_b, stride_v_b, width_uv, height_uv); return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v); } diff --git a/libs/libvpx/third_party/libyuv/source/compare_common.cc b/libs/libvpx/third_party/libyuv/source/compare_common.cc index 42fc5893543..d4b170ad986 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_common.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_common.cc @@ -17,20 +17,80 @@ namespace libyuv { extern "C" { #endif -uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse = 0u; +#if ORIGINAL_OPT +uint32_t HammingDistance_C1(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count; ++i) { + int x = src_a[i] ^ src_b[i]; + if (x & 1) + ++diff; + if (x & 2) + ++diff; + if (x & 4) + ++diff; + if (x & 8) + ++diff; + if (x & 16) + ++diff; + if (x & 32) + ++diff; + if (x & 64) + ++diff; + if (x & 128) + ++diff; + } + return diff; +} +#endif + +// Hakmem method for hamming distance. 
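The "Hakmem method" named above is the classic SWAR popcount: fold each 32-bit XOR word into 2-bit pair counts, then nibble counts, then multiply by 0x01010101 so the top byte accumulates the total. A small standalone check of the same reduction (hypothetical names, independent of the patch):

#include <stdint.h>
#include <stdio.h>

/* Same bit-count reduction as the word loop of HammingDistance_C. */
static uint32_t Popcount32(uint32_t x) {
  uint32_t u = x - ((x >> 1) & 0x55555555);       /* 2-bit pair counts */
  u = ((u >> 2) & 0x33333333) + (u & 0x33333333); /* nibble counts */
  return (((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24; /* byte sums */
}

int main(void) {
  /* 0xF0F0F0F0 has four set bits per byte, 16 in total; in
   * HammingDistance_C the input would be the XOR of two words. */
  printf("%u\n", (unsigned)Popcount32(0xF0F0F0F0u)); /* prints 16 */
  return 0;
}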
+uint32_t HammingDistance_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((const uint32_t*)src_a) ^ *((const uint32_t*)src_b); + uint32_t u = x - ((x >> 1) & 0x55555555); + u = ((u >> 2) & 0x33333333) + (u & 0x33333333); + diff += ((((u + (u >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24); + src_a += 4; + src_b += 4; + } + + for (; i < count; ++i) { + uint32_t x = *src_a ^ *src_b; + uint32_t u = x - ((x >> 1) & 0x55); + u = ((u >> 2) & 0x33) + (u & 0x33); + diff += (u + (u >> 4)) & 0x0f; + src_a += 1; + src_b += 1; + } + + return diff; +} + +uint32_t SumSquareError_C(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; int i; for (i = 0; i < count; ++i) { int diff = src_a[i] - src_b[i]; - sse += (uint32)(diff * diff); + sse += (uint32_t)(diff * diff); } return sse; } // hash seed of 5381 recommended. // Internal C version of HashDjb2 with int sized count for efficiency. -uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) { - uint32 hash = seed; +uint32_t HashDjb2_C(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash = seed; int i; for (i = 0; i < count; ++i) { hash += (hash << 5) + src[i]; diff --git a/libs/libvpx/third_party/libyuv/source/compare_gcc.cc b/libs/libvpx/third_party/libyuv/source/compare_gcc.cc index 1b83edb166a..676527c1b1b 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_gcc.cc @@ -22,124 +22,334 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { - uint32 sse; - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10, 1) ",%1 \n" - "movdqa %%xmm1,%%xmm3 \n" - "psubusb %%xmm2,%%xmm1 \n" - "psubusb %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm2 \n" - "pmaddwd %%xmm1,%%xmm1 \n" - "pmaddwd %%xmm2,%%xmm2 \n" - "paddd %%xmm1,%%xmm0 \n" - "paddd %%xmm2,%%xmm0 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - - "pshufd $0xee,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "pshufd $0x1,%%xmm0,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "movd %%xmm0,%3 \n" - - : "+r"(src_a), // %0 - "+r"(src_b), // %1 - "+r"(count), // %2 - "=g"(sse) // %3 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +#if defined(__x86_64__) +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint64_t diff = 0u; + + asm volatile( + "xor %3,%3 \n" + "xor %%r8,%%r8 \n" + "xor %%r9,%%r9 \n" + "xor %%r10,%%r10 \n" + + // Process 32 bytes per loop. 
+ LABELALIGN + "1: \n" + "mov (%0),%%rcx \n" + "mov 0x8(%0),%%rdx \n" + "xor (%1),%%rcx \n" + "xor 0x8(%1),%%rdx \n" + "popcnt %%rcx,%%rcx \n" + "popcnt %%rdx,%%rdx \n" + "mov 0x10(%0),%%rsi \n" + "mov 0x18(%0),%%rdi \n" + "xor 0x10(%1),%%rsi \n" + "xor 0x18(%1),%%rdi \n" + "popcnt %%rsi,%%rsi \n" + "popcnt %%rdi,%%rdi \n" + "add $0x20,%0 \n" + "add $0x20,%1 \n" + "add %%rcx,%3 \n" + "add %%rdx,%%r8 \n" + "add %%rsi,%%r9 \n" + "add %%rdi,%%r10 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "add %%r8, %3 \n" + "add %%r9, %3 \n" + "add %%r10, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : + : "memory", "cc", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10"); + + return static_cast(diff); +} +#else +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + // Process 16 bytes per loop. + LABELALIGN + "1: \n" + "mov (%0),%%ecx \n" + "mov 0x4(%0),%%edx \n" + "xor (%1),%%ecx \n" + "xor 0x4(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "mov 0x8(%0),%%ecx \n" + "mov 0xc(%0),%%edx \n" + "xor 0x8(%1),%%ecx \n" + "xor 0xc(%1),%%edx \n" + "popcnt %%ecx,%%ecx \n" + "add %%ecx,%3 \n" + "popcnt %%edx,%%edx \n" + "add %%edx,%3 \n" + "add $0x10,%0 \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "+r"(diff) // %3 + : + : "memory", "cc", "ecx", "edx"); + + return diff; +} +#endif + +static const vec8 kNibbleMask = {15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15}; +static const vec8 kBitCount = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4}; + +uint32_t HammingDistance_SSSE3(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "movdqa %4,%%xmm2 \n" + "movdqa %5,%%xmm3 \n" + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm4 \n" + "movdqa 0x10(%0), %%xmm5 \n" + "pxor (%0,%1), %%xmm4 \n" + "movdqa %%xmm4,%%xmm6 \n" + "pand %%xmm2,%%xmm6 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm6,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "paddb %%xmm7,%%xmm6 \n" + "pxor 0x10(%0,%1),%%xmm5 \n" + "add $0x20,%0 \n" + "movdqa %%xmm5,%%xmm4 \n" + "pand %%xmm2,%%xmm5 \n" + "psrlw $0x4,%%xmm4 \n" + "movdqa %%xmm3,%%xmm7 \n" + "pshufb %%xmm5,%%xmm7 \n" + "pand %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm4,%%xmm5 \n" + "paddb %%xmm7,%%xmm5 \n" + "paddb %%xmm5,%%xmm6 \n" + "psadbw %%xmm1,%%xmm6 \n" + "paddd %%xmm6,%%xmm0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + "pshufd $0xaa,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0, %3 \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); + + return diff; +} + +#ifdef HAS_HAMMINGDISTANCE_AVX2 +uint32_t HammingDistance_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + asm volatile( + "vbroadcastf128 %4,%%ymm2 \n" + "vbroadcastf128 %5,%%ymm3 \n" + "vpxor %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm1,%%ymm1,%%ymm1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqa (%0),%%ymm4 \n" + "vmovdqa 0x20(%0), %%ymm5 \n" + "vpxor (%0,%1), %%ymm4, %%ymm4 \n" + "vpand %%ymm2,%%ymm4,%%ymm6 \n" + "vpsrlw 
$0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm6 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm4,%%ymm6,%%ymm6 \n" + "vpxor 0x20(%0,%1),%%ymm5,%%ymm4 \n" + "add $0x40,%0 \n" + "vpand %%ymm2,%%ymm4,%%ymm5 \n" + "vpsrlw $0x4,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm5 \n" + "vpand %%ymm2,%%ymm4,%%ymm4 \n" + "vpshufb %%ymm4,%%ymm3,%%ymm4 \n" + "vpaddb %%ymm5,%%ymm4,%%ymm4 \n" + "vpaddb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsadbw %%ymm1,%%ymm4,%%ymm4 \n" + "vpaddd %%ymm0,%%ymm4,%%ymm0 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + + "vpermq $0xb1,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xaa,%%ymm0,%%ymm1 \n" + "vpaddd %%ymm1,%%ymm0,%%ymm0 \n" + "vmovd %%xmm0, %3 \n" + "vzeroupper \n" + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=r"(diff) // %3 + : "m"(kNibbleMask), // %4 + "m"(kBitCount) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + + return diff; +} +#endif // HAS_HAMMINGDISTANCE_AVX2 + +uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "psubusb %%xmm2,%%xmm1 \n" + "psubusb %%xmm3,%%xmm2 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm2 \n" + "pmaddwd %%xmm1,%%xmm1 \n" + "pmaddwd %%xmm2,%%xmm2 \n" + "paddd %%xmm1,%%xmm0 \n" + "paddd %%xmm2,%%xmm0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "pshufd $0xee,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "pshufd $0x1,%%xmm0,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "movd %%xmm0,%3 \n" + + : "+r"(src_a), // %0 + "+r"(src_b), // %1 + "+r"(count), // %2 + "=g"(sse) // %3 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); return sse; } -static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 -static uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 +static const uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 +static const uvec32 kHashMul0 = { + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; -static uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 +static const uvec32 kHashMul1 = { + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; -static uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 +static const uvec32 kHashMul2 = { + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; -static uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 +static const uvec32 kHashMul3 = { + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { - uint32 hash; - asm volatile ( - "movd %2,%%xmm0 \n" - "pxor %%xmm7,%%xmm7 \n" - "movdqa %4,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "lea " MEMLEA(0x10, 0) ",%0 \n" - "pmulld %%xmm6,%%xmm0 \n" - "movdqa %5,%%xmm5 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm7,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 
\n" - "punpcklwd %%xmm7,%%xmm3 \n" - "pmulld %%xmm5,%%xmm3 \n" - "movdqa %6,%%xmm5 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpckhwd %%xmm7,%%xmm4 \n" - "pmulld %%xmm5,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "punpckhbw %%xmm7,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm7,%%xmm2 \n" - "pmulld %%xmm5,%%xmm2 \n" - "movdqa %8,%%xmm5 \n" - "punpckhwd %%xmm7,%%xmm1 \n" - "pmulld %%xmm5,%%xmm1 \n" - "paddd %%xmm4,%%xmm3 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm1 \n" - "pshufd $0xe,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "pshufd $0x1,%%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm1 \n" - "paddd %%xmm1,%%xmm0 \n" - "sub $0x10,%1 \n" - "jg 1b \n" - "movd %%xmm0,%3 \n" - : "+r"(src), // %0 - "+r"(count), // %1 - "+rm"(seed), // %2 - "=g"(hash) // %3 - : "m"(kHash16x33), // %4 - "m"(kHashMul0), // %5 - "m"(kHashMul1), // %6 - "m"(kHashMul2), // %7 - "m"(kHashMul3) // %8 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { + uint32_t hash; + asm volatile( + "movd %2,%%xmm0 \n" + "pxor %%xmm7,%%xmm7 \n" + "movdqa %4,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pmulld %%xmm6,%%xmm0 \n" + "movdqa %5,%%xmm5 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm7,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm7,%%xmm3 \n" + "pmulld %%xmm5,%%xmm3 \n" + "movdqa %6,%%xmm5 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpckhwd %%xmm7,%%xmm4 \n" + "pmulld %%xmm5,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "punpckhbw %%xmm7,%%xmm1 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm7,%%xmm2 \n" + "pmulld %%xmm5,%%xmm2 \n" + "movdqa %8,%%xmm5 \n" + "punpckhwd %%xmm7,%%xmm1 \n" + "pmulld %%xmm5,%%xmm1 \n" + "paddd %%xmm4,%%xmm3 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm1 \n" + "pshufd $0xe,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "pshufd $0x1,%%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm1 \n" + "paddd %%xmm1,%%xmm0 \n" + "sub $0x10,%1 \n" + "jg 1b \n" + "movd %%xmm0,%3 \n" + : "+r"(src), // %0 + "+r"(count), // %1 + "+rm"(seed), // %2 + "=g"(hash) // %3 + : "m"(kHash16x33), // %4 + "m"(kHashMul0), // %5 + "m"(kHashMul1), // %6 + "m"(kHashMul2), // %7 + "m"(kHashMul3) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); return hash; } #endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))) @@ -148,4 +358,3 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { } // extern "C" } // namespace libyuv #endif - diff --git a/libs/libvpx/third_party/libyuv/source/compare_msa.cc b/libs/libvpx/third_party/libyuv/source/compare_msa.cc new file mode 100644 index 00000000000..0b807d37bee --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/compare_msa.cc @@ -0,0 +1,97 @@ +/* + * Copyright 2017 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/basic_types.h" + +#include "libyuv/compare_row.h" +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +uint32_t HammingDistance_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + int i; + v16u8 src0, src1, src2, src3; + v2i64 vec0 = {0}, vec1 = {0}; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + src0 ^= src2; + src1 ^= src3; + vec0 += __msa_pcnt_d((v2i64)src0); + vec1 += __msa_pcnt_d((v2i64)src1); + src_a += 32; + src_b += 32; + } + + vec0 += vec1; + diff = (uint32_t)__msa_copy_u_w((v4i32)vec0, 0); + diff += (uint32_t)__msa_copy_u_w((v4i32)vec0, 2); + return diff; +} + +uint32_t SumSquareError_MSA(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse = 0u; + int i; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2, vec3; + v4i32 reg0 = {0}, reg1 = {0}, reg2 = {0}, reg3 = {0}; + v2i64 tmp0; + + for (i = 0; i < count; i += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_a, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_a, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_b, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)src_b, 16); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + reg0 = __msa_dpadd_s_w(reg0, vec0, vec0); + reg1 = __msa_dpadd_s_w(reg1, vec1, vec1); + reg2 = __msa_dpadd_s_w(reg2, vec2, vec2); + reg3 = __msa_dpadd_s_w(reg3, vec3, vec3); + src_a += 32; + src_b += 32; + } + + reg0 += reg1; + reg2 += reg3; + reg0 += reg2; + tmp0 = __msa_hadd_s_d(reg0, reg0); + sse = (uint32_t)__msa_copy_u_w((v4i32)tmp0, 0); + sse += (uint32_t)__msa_copy_u_w((v4i32)tmp0, 2); + return sse; +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/compare_neon.cc b/libs/libvpx/third_party/libyuv/source/compare_neon.cc index 49aa3b4eefe..2a2181e0cb3 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_neon.cc @@ -21,40 +21,70 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "vmov.u8 q8, #0 \n" - "vmov.u8 q10, #0 \n" - "vmov.u8 q9, #0 \n" - "vmov.u8 q11, #0 \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! 
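// The MSA loops above consume 32 bytes per iteration and carry no scalar
// tail, so the caller is assumed to pass a count that is a multiple of 32.
// A hypothetical caller-side wrapper (not in libyuv) that lifts that
// restriction:

#include <stdint.h>

uint32_t HammingDistance_MSA(const uint8_t* src_a, const uint8_t* src_b,
                             int count);  // kernel from the diff above

uint32_t HammingDistance_WithTail(const uint8_t* a, const uint8_t* b,
                                  int count) {
  static const uint8_t kBits[16] = {0, 1, 1, 2, 1, 2, 2, 3,
                                    1, 2, 2, 3, 2, 3, 3, 4};
  const int main_bytes = count & ~31;  // largest multiple of 32
  uint32_t diff = main_bytes ? HammingDistance_MSA(a, b, main_bytes) : 0u;
  for (int i = main_bytes; i < count; ++i) {
    uint8_t x = a[i] ^ b[i];
    diff += kBits[x & 0x0f] + kBits[x >> 4];
  }
  return diff;
}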
\n" - "subs %2, %2, #16 \n" - "vsubl.u8 q2, d0, d2 \n" - "vsubl.u8 q3, d1, d3 \n" - "vmlal.s16 q8, d4, d4 \n" - "vmlal.s16 q9, d6, d6 \n" - "vmlal.s16 q10, d5, d5 \n" - "vmlal.s16 q11, d7, d7 \n" - "bgt 1b \n" + asm volatile( + "vmov.u16 q4, #0 \n" // accumulator - "vadd.u32 q8, q8, q9 \n" - "vadd.u32 q10, q10, q11 \n" - "vadd.u32 q11, q8, q10 \n" - "vpaddl.u32 q1, q11 \n" - "vadd.u64 d0, d2, d3 \n" - "vmov.32 %3, d0[0] \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" + "vld1.8 {q2, q3}, [%1]! \n" + "veor.32 q0, q0, q2 \n" + "veor.32 q1, q1, q3 \n" + "vcnt.i8 q0, q0 \n" + "vcnt.i8 q1, q1 \n" + "subs %2, %2, #32 \n" + "vadd.u8 q0, q0, q1 \n" // 16 byte counts + "vpadal.u8 q4, q0 \n" // 8 shorts + "bgt 1b \n" + + "vpaddl.u16 q0, q4 \n" // 4 ints + "vpadd.u32 d0, d0, d1 \n" + "vpadd.u32 d0, d0, d0 \n" + "vmov.32 %3, d0[0] \n" + + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "q0", "q1", "q2", "q3", "q4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "vmov.u8 q8, #0 \n" + "vmov.u8 q10, #0 \n" + "vmov.u8 q9, #0 \n" + "vmov.u8 q11, #0 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" + "subs %2, %2, #16 \n" + "vsubl.u8 q2, d0, d2 \n" + "vsubl.u8 q3, d1, d3 \n" + "vmlal.s16 q8, d4, d4 \n" + "vmlal.s16 q9, d6, d6 \n" + "vmlal.s16 q10, d5, d5 \n" + "vmlal.s16 q11, d7, d7 \n" + "bgt 1b \n" + + "vadd.u32 q8, q8, q9 \n" + "vadd.u32 q10, q10, q11 \n" + "vadd.u32 q11, q8, q10 \n" + "vpaddl.u32 q1, q11 \n" + "vadd.u64 d0, d2, d3 \n" + "vmov.32 %3, d0[0] \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); return sse; } diff --git a/libs/libvpx/third_party/libyuv/source/compare_neon64.cc b/libs/libvpx/third_party/libyuv/source/compare_neon64.cc index f9c7df98c89..6e8f672ab73 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_neon64.cc @@ -20,39 +20,65 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { - volatile uint32 sse; - asm volatile ( - "eor v16.16b, v16.16b, v16.16b \n" - "eor v18.16b, v18.16b, v18.16b \n" - "eor v17.16b, v17.16b, v17.16b \n" - "eor v19.16b, v19.16b, v19.16b \n" +// 256 bits at a time +// uses short accumulator which restricts count to 131 KB +uint32_t HammingDistance_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff; + asm volatile( + "movi v4.8h, #0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - "subs %w2, %w2, #16 \n" - "usubl v2.8h, v0.8b, v1.8b \n" - "usubl2 v3.8h, v0.16b, v1.16b \n" - "smlal v16.4s, v2.4h, v2.4h \n" - "smlal v17.4s, v3.4h, v3.4h \n" - "smlal2 v18.4s, v2.8h, v2.8h \n" - "smlal2 v19.4s, v3.8h, v3.8h \n" - "b.gt 1b \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" + "eor v0.16b, v0.16b, v2.16b \n" + "eor v1.16b, v1.16b, v3.16b \n" + "cnt v0.16b, v0.16b \n" + "cnt v1.16b, v1.16b \n" + "subs %w2, %w2, #32 \n" + "add v0.16b, v0.16b, v1.16b \n" + "uadalp v4.8h, v0.16b \n" + "b.gt 1b \n" - "add v16.4s, v16.4s, v17.4s \n" - "add v18.4s, v18.4s, v19.4s \n" - "add v19.4s, v16.4s, v18.4s \n" - "addv s0, v19.4s \n" - "fmov %w3, 
s0 \n" - : "+r"(src_a), - "+r"(src_b), - "+r"(count), - "=r"(sse) - : - : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + "uaddlv s4, v4.8h \n" + "fmov %w3, s4 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(diff) + : + : "cc", "v0", "v1", "v2", "v3", "v4"); + return diff; +} + +uint32_t SumSquareError_NEON(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t sse; + asm volatile( + "eor v16.16b, v16.16b, v16.16b \n" + "eor v18.16b, v18.16b, v18.16b \n" + "eor v17.16b, v17.16b, v17.16b \n" + "eor v19.16b, v19.16b, v19.16b \n" + + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" + "ld1 {v1.16b}, [%1], #16 \n" + "subs %w2, %w2, #16 \n" + "usubl v2.8h, v0.8b, v1.8b \n" + "usubl2 v3.8h, v0.16b, v1.16b \n" + "smlal v16.4s, v2.4h, v2.4h \n" + "smlal v17.4s, v3.4h, v3.4h \n" + "smlal2 v18.4s, v2.8h, v2.8h \n" + "smlal2 v19.4s, v3.8h, v3.8h \n" + "b.gt 1b \n" + + "add v16.4s, v16.4s, v17.4s \n" + "add v18.4s, v18.4s, v19.4s \n" + "add v19.4s, v16.4s, v18.4s \n" + "addv s0, v19.4s \n" + "fmov %w3, s0 \n" + : "+r"(src_a), "+r"(src_b), "+r"(count), "=r"(sse) + : + : "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); return sse; } diff --git a/libs/libvpx/third_party/libyuv/source/compare_win.cc b/libs/libvpx/third_party/libyuv/source/compare_win.cc index dc86fe25b19..d57d3d9d1c8 100644 --- a/libs/libvpx/third_party/libyuv/source/compare_win.cc +++ b/libs/libvpx/third_party/libyuv/source/compare_win.cc @@ -13,20 +13,39 @@ #include "libyuv/compare_row.h" #include "libyuv/row.h" +#if defined(_MSC_VER) +#include // For __popcnt +#endif + #ifdef __cplusplus namespace libyuv { extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) + +uint32_t HammingDistance_SSE42(const uint8_t* src_a, + const uint8_t* src_b, + int count) { + uint32_t diff = 0u; + + int i; + for (i = 0; i < count - 3; i += 4) { + uint32_t x = *((uint32_t*)src_a) ^ *((uint32_t*)src_b); // NOLINT + src_a += 4; + src_b += 4; + diff += __popcnt(x); + } + return diff; +} -__declspec(naked) -uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { +__declspec(naked) uint32_t + SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count pxor xmm0, xmm0 pxor xmm5, xmm5 @@ -61,13 +80,13 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { // Visual C 2012 required for AVX2. #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. 
-#pragma warning(disable: 4752) -__declspec(naked) -uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { +#pragma warning(disable : 4752) +__declspec(naked) uint32_t + SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { __asm { - mov eax, [esp + 4] // src_a - mov edx, [esp + 8] // src_b - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src_a + mov edx, [esp + 8] // src_b + mov ecx, [esp + 12] // count vpxor ymm0, ymm0, ymm0 // sum vpxor ymm5, ymm5, ymm5 // constant 0 for unpck sub edx, eax @@ -101,65 +120,65 @@ uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { } #endif // _MSC_VER >= 1700 -uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16 +uvec32 kHash16x33 = {0x92d9e201, 0, 0, 0}; // 33 ^ 16 uvec32 kHashMul0 = { - 0x0c3525e1, // 33 ^ 15 - 0xa3476dc1, // 33 ^ 14 - 0x3b4039a1, // 33 ^ 13 - 0x4f5f0981, // 33 ^ 12 + 0x0c3525e1, // 33 ^ 15 + 0xa3476dc1, // 33 ^ 14 + 0x3b4039a1, // 33 ^ 13 + 0x4f5f0981, // 33 ^ 12 }; uvec32 kHashMul1 = { - 0x30f35d61, // 33 ^ 11 - 0x855cb541, // 33 ^ 10 - 0x040a9121, // 33 ^ 9 - 0x747c7101, // 33 ^ 8 + 0x30f35d61, // 33 ^ 11 + 0x855cb541, // 33 ^ 10 + 0x040a9121, // 33 ^ 9 + 0x747c7101, // 33 ^ 8 }; uvec32 kHashMul2 = { - 0xec41d4e1, // 33 ^ 7 - 0x4cfa3cc1, // 33 ^ 6 - 0x025528a1, // 33 ^ 5 - 0x00121881, // 33 ^ 4 + 0xec41d4e1, // 33 ^ 7 + 0x4cfa3cc1, // 33 ^ 6 + 0x025528a1, // 33 ^ 5 + 0x00121881, // 33 ^ 4 }; uvec32 kHashMul3 = { - 0x00008c61, // 33 ^ 3 - 0x00000441, // 33 ^ 2 - 0x00000021, // 33 ^ 1 - 0x00000001, // 33 ^ 0 + 0x00008c61, // 33 ^ 3 + 0x00000441, // 33 ^ 2 + 0x00000021, // 33 ^ 1 + 0x00000001, // 33 ^ 0 }; -__declspec(naked) -uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count movd xmm0, [esp + 12] // seed - pxor xmm7, xmm7 // constant 0 for unpck + pxor xmm7, xmm7 // constant 0 for unpck movdqa xmm6, xmmword ptr kHash16x33 wloop: - movdqu xmm1, [eax] // src[0-15] + movdqu xmm1, [eax] // src[0-15] lea eax, [eax + 16] - pmulld xmm0, xmm6 // hash *= 33 ^ 16 + pmulld xmm0, xmm6 // hash *= 33 ^ 16 movdqa xmm5, xmmword ptr kHashMul0 movdqa xmm2, xmm1 - punpcklbw xmm2, xmm7 // src[0-7] + punpcklbw xmm2, xmm7 // src[0-7] movdqa xmm3, xmm2 - punpcklwd xmm3, xmm7 // src[0-3] + punpcklwd xmm3, xmm7 // src[0-3] pmulld xmm3, xmm5 movdqa xmm5, xmmword ptr kHashMul1 movdqa xmm4, xmm2 - punpckhwd xmm4, xmm7 // src[4-7] + punpckhwd xmm4, xmm7 // src[4-7] pmulld xmm4, xmm5 movdqa xmm5, xmmword ptr kHashMul2 - punpckhbw xmm1, xmm7 // src[8-15] + punpckhbw xmm1, xmm7 // src[8-15] movdqa xmm2, xmm1 - punpcklwd xmm2, xmm7 // src[8-11] + punpcklwd xmm2, xmm7 // src[8-11] pmulld xmm2, xmm5 movdqa xmm5, xmmword ptr kHashMul3 - punpckhwd xmm1, xmm7 // src[12-15] + punpckhwd xmm1, xmm7 // src[12-15] pmulld xmm1, xmm5 - paddd xmm3, xmm4 // add 16 results + paddd xmm3, xmm4 // add 16 results paddd xmm1, xmm2 paddd xmm1, xmm3 @@ -171,18 +190,18 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - movd eax, xmm0 // return hash + movd eax, xmm0 // return hash ret } } // Visual C 2012 required for AVX2. 
#if _MSC_VER >= 1700 -__declspec(naked) -uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { +__declspec(naked) uint32_t + HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { __asm { - mov eax, [esp + 4] // src - mov ecx, [esp + 8] // count + mov eax, [esp + 4] // src + mov ecx, [esp + 8] // count vmovd xmm0, [esp + 12] // seed wloop: @@ -196,7 +215,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { vpmulld xmm2, xmm2, xmmword ptr kHashMul2 lea eax, [eax + 16] vpmulld xmm1, xmm1, xmmword ptr kHashMul3 - vpaddd xmm3, xmm3, xmm4 // add 16 results + vpaddd xmm3, xmm3, xmm4 // add 16 results vpaddd xmm1, xmm1, xmm2 vpaddd xmm1, xmm1, xmm3 vpshufd xmm2, xmm1, 0x0e // upper 2 dwords @@ -207,7 +226,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { sub ecx, 16 jg wloop - vmovd eax, xmm0 // return hash + vmovd eax, xmm0 // return hash vzeroupper ret } diff --git a/libs/libvpx/third_party/libyuv/source/convert.cc b/libs/libvpx/third_party/libyuv/source/convert.cc index a33742d24d9..375cc732c1d 100644 --- a/libs/libvpx/third_party/libyuv/source/convert.cc +++ b/libs/libvpx/third_party/libyuv/source/convert.cc @@ -14,8 +14,8 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" -#include "libyuv/scale.h" // For ScalePlane() #include "libyuv/row.h" +#include "libyuv/scale.h" // For ScalePlane() #ifdef __cplusplus namespace libyuv { @@ -28,14 +28,22 @@ static __inline int Abs(int v) { } // Any I4xx To I420 format with mirroring. -static int I4xxToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int src_uv_width, int src_uv_height) { +static int I4xxToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int src_uv_width, + int src_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1); @@ -44,35 +52,37 @@ static int I4xxToI420(const uint8* src_y, int src_stride_y, return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); return 0; } -// Copy I420 with optional flipping +// Copy I420 with optional flipping. // TODO(fbarchard): Use Scale plane which supports mirroring, but ensure // is does row coalescing. 
LIBYUV_API -int I420Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -96,79 +106,152 @@ int I420Copy(const uint8* src_y, int src_stride_y, return 0; } -// 422 chroma is 1/2 width, 1x height -// 420 chroma is 1/2 width, 1/2 height +// Copy I010 with optional flipping. LIBYUV_API -int I422ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int src_uv_width = SUBSAMPLE(width, 1, 1); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); +int I010Copy(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + if (dst_y) { + CopyPlane_16(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + // Copy UV planes. + CopyPlane_16(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); + CopyPlane_16(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); + return 0; } -// 444 chroma is 1x width, 1x height +// Convert 10 bit YUV to 8 bit. +LIBYUV_API +int I010ToI420(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
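// A sketch: "Negative height means invert the image" is the idiom used by
// every converter in this file -- point at the last row and negate the
// stride so the row loop walks bottom-up without copying any pixel twice.
// In isolation:

#include <stdint.h>

static const uint8_t* FlipPlane(const uint8_t* src, int* stride,
                                int* height) {
  if (*height < 0) {
    *height = -*height;
    src += (*height - 1) * *stride;  // start at the last row
    *stride = -*stride;              // walk upwards
  }
  return src;
}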
+ if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert16To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, 16384, width, + height); + // Convert UV planes. + Convert16To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, 16384, halfwidth, + halfheight); + Convert16To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, 16384, halfwidth, + halfheight); + return 0; +} + +// 422 chroma is 1/2 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I444ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - width, height); +int I422ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + const int src_uv_width = SUBSAMPLE(width, 1, 1); + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, src_uv_width, height); } -// 411 chroma is 1/4 width, 1x height +// 444 chroma is 1x width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API -int I411ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - const int src_uv_width = SUBSAMPLE(width, 3, 2); - return I4xxToI420(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - src_uv_width, height); +int I444ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return I4xxToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, width, height); } // I400 is greyscale typically used in MJPG LIBYUV_API -int I400ToI420(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I400ToI420(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!dst_u || !dst_v || - width <= 0 || height == 0) { + if (!dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
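// On the 16384 constant above: assuming Convert16To8Plane follows the
// usual fixed-point scheme dst = clamp((src * scale) >> 16), a scale of
// 16384 = 2^14 reduces to src >> 2 -- exactly the shift that turns a
// 10-bit sample into an 8-bit one. Illustrative scalar form:

#include <stdint.h>

static inline uint8_t Convert16To8(uint16_t v, int scale) {
  const uint32_t out = ((uint32_t)v * (uint32_t)scale) >> 16;
  return out > 255u ? 255u : (uint8_t)out;  // clamp wider-than-10-bit input
}
// e.g. v = 1023 (10-bit max): (1023 * 16384) >> 16 = 255.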
@@ -186,11 +269,15 @@ int I400ToI420(const uint8* src_y, int src_stride_y, return 0; } -static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, - uint8* dst, int dst_stride, - int width, int height) { +static void CopyPlane2(const uint8_t* src, + int src_stride_0, + int src_stride_1, + uint8_t* dst, + int dst_stride, + int width, + int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -211,11 +298,6 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height - 1; y += 2) { @@ -238,17 +320,22 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // src_stride_m420 is row planar. Normally this will be the width in pixels. // The UV plane is half width, but 2 values, so src_stride_m420 applies to // this as well as the two Y planes. -static int X420ToI420(const uint8* src_y, - int src_stride_y0, int src_stride_y1, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +static int X420ToI420(const uint8_t* src_y, + int src_stride_y0, + int src_stride_y1, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_uv || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_uv || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -265,16 +352,14 @@ static int X420ToI420(const uint8* src_y, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_y0 == width && - src_stride_y1 == width && + if (src_stride_y0 == width && src_stride_y1 == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = dst_stride_y = 0; } // Coalesce rows. - if (src_stride_uv == halfwidth * 2 && - dst_stride_u == halfwidth && + if (src_stride_uv == halfwidth * 2 && dst_stride_u == halfwidth && dst_stride_v == halfwidth) { halfwidth *= halfheight; halfheight = 1; @@ -299,63 +384,78 @@ static int X420ToI420(const uint8* src_y, // Convert NV12 to I420. LIBYUV_API -int NV12ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); +int NV12ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_uv, src_stride_uv, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, width, height); } // Convert NV21 to I420. 
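// An illustration of the "Coalesce rows" blocks above: when every stride
// equals the row width there is no padding between rows, so a 2-D plane
// copy degenerates to a single 1-D pass and the row kernel runs exactly
// once.

#include <stdint.h>
#include <string.h>

void CopyPlane_Coalesced(const uint8_t* src, int src_stride, uint8_t* dst,
                         int dst_stride, int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;  // treat the whole plane as one long row
    height = 1;
    src_stride = dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, (size_t)width);
    src += src_stride;
    dst += dst_stride;
  }
}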
Same as NV12 but u and v pointers swapped. LIBYUV_API -int NV21ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_vu, int src_stride_vu, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - return X420ToI420(src_y, src_stride_y, src_stride_y, - src_vu, src_stride_vu, - dst_y, dst_stride_y, - dst_v, dst_stride_v, - dst_u, dst_stride_u, - width, height); +int NV21ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return X420ToI420(src_y, src_stride_y, src_stride_y, src_vu, src_stride_vu, + dst_y, dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, width, height); } // Convert M420 to I420. LIBYUV_API -int M420ToI420(const uint8* src_m420, int src_stride_m420, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int M420ToI420(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2, - src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + src_m420 + src_stride_m420 * 2, src_stride_m420 * 3, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); } // Convert YUY2 to I420. LIBYUV_API -int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI420(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) = YUY2ToUVRow_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, - uint8* dst_y, int width) = YUY2ToYRow_C; + void (*YUY2ToUVRow)(const uint8_t* src_yuy2, int src_stride_yuy2, + uint8_t* dst_u, uint8_t* dst_v, int width) = + YUY2ToUVRow_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -392,6 +492,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUVRow = YUY2ToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUVRow = YUY2ToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); @@ -411,16 +521,22 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I420. 
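// A scalar sketch of the row kernels dispatched above: YUY2 packs two
// pixels into four bytes, Y0 U Y1 V. The UV kernel takes a stride so it
// can average chroma over a row pair, which is how 4:2:2 input becomes
// 4:2:0 output. Width is assumed even here.

#include <stdint.h>

void YUY2ToYRow_Sketch(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_y[x + 0] = src_yuy2[0];  // Y0
    dst_y[x + 1] = src_yuy2[2];  // Y1
    src_yuy2 += 4;
  }
}

void YUY2ToUVRow_Sketch(const uint8_t* src_yuy2, int stride, uint8_t* dst_u,
                        uint8_t* dst_v, int width) {
  const uint8_t* next = src_yuy2 + stride;  // second row of the pair
  for (int x = 0; x < width; x += 2) {
    *dst_u++ = (uint8_t)((src_yuy2[1] + next[1] + 1) >> 1);  // averaged U
    *dst_v++ = (uint8_t)((src_yuy2[3] + next[3] + 1) >> 1);  // averaged V
    src_yuy2 += 4;
    next += 4;
  }
}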
LIBYUV_API -int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI420(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) = UYVYToUVRow_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUVRow)(const uint8_t* src_uyvy, int src_stride_uyvy, + uint8_t* dst_u, uint8_t* dst_v, int width) = + UYVYToUVRow_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -457,6 +573,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUVRow = UYVYToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUVRow = UYVYToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width); @@ -476,19 +602,23 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, // Convert ARGB to I420. LIBYUV_API -int ARGBToI420(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -533,6 +663,22 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -552,19 +698,23 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, // Convert BGRA to I420. 
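// A sketch of the dispatch idiom the MSA hunks above keep repeating,
// assuming libyuv's row.h and cpu_id.h are included: start from the
// portable C kernel, upgrade to the _Any_ variant when the CPU feature is
// present (it copes with odd widths), and to the full-width variant only
// when the width meets the kernel's alignment requirement.

#include <stdint.h>

typedef void (*ARGBToYRowFunc)(const uint8_t* src, uint8_t* dst, int width);

static ARGBToYRowFunc SelectARGBToYRow(int width) {
  ARGBToYRowFunc fn = ARGBToYRow_C;
#if defined(HAS_ARGBTOYROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    fn = ARGBToYRow_Any_MSA;
    if (IS_ALIGNED(width, 16)) {
      fn = ARGBToYRow_MSA;
    }
  }
#endif
  return fn;
}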
LIBYUV_API -int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int BGRAToI420(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C; - void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int width) = + void (*BGRAToUVRow)(const uint8_t* src_bgra0, int src_stride_bgra, + uint8_t* dst_u, uint8_t* dst_v, int width) = + BGRAToUVRow_C; + void (*BGRAToYRow)(const uint8_t* src_bgra, uint8_t* dst_y, int width) = BGRAToYRow_C; - if (!src_bgra || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_bgra || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -592,12 +742,28 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, } #endif #if defined(HAS_BGRATOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - BGRAToUVRow = BGRAToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - BGRAToUVRow = BGRAToUVRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + BGRAToUVRow = BGRAToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_NEON; } + } +#endif +#if defined(HAS_BGRATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToYRow = BGRAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToYRow = BGRAToYRow_MSA; + } + } +#endif +#if defined(HAS_BGRATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + BGRAToUVRow = BGRAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + BGRAToUVRow = BGRAToUVRow_MSA; + } + } #endif for (y = 0; y < height - 1; y += 2) { @@ -618,19 +784,23 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra, // Convert ABGR to I420. LIBYUV_API -int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ABGRToI420(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C; - void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int width) = + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = ABGRToYRow_C; - if (!src_abgr || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_abgr || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -665,6 +835,22 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width); @@ -684,19 +870,23 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr, // Convert RGBA to I420. LIBYUV_API -int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGBAToI420(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C; - void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int width) = + void (*RGBAToUVRow)(const uint8_t* src_rgba0, int src_stride_rgba, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGBAToUVRow_C; + void (*RGBAToYRow)(const uint8_t* src_rgba, uint8_t* dst_y, int width) = RGBAToYRow_C; - if (!src_rgba || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgba || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -731,6 +921,22 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, } } #endif +#if defined(HAS_RGBATOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToYRow = RGBAToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToYRow = RGBAToYRow_MSA; + } + } +#endif +#if defined(HAS_RGBATOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGBAToUVRow = RGBAToUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGBAToUVRow = RGBAToUVRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width); @@ -750,27 +956,33 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba, // Convert RGB24 to I420. 
LIBYUV_API -int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB24ToI420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB24TOYROW_NEON) - void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C; - void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int width) = +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) + void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVRow_C; + void (*RGB24ToYRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = RGB24ToYRow_C; #else - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb24 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -792,6 +1004,15 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } } +#elif defined(HAS_RGB24TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVRow = RGB24ToUVRow_Any_MSA; + RGB24ToYRow = RGB24ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYRow = RGB24ToYRow_MSA; + RGB24ToUVRow = RGB24ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB24 to ARGB. #else #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -822,14 +1043,17 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#endif + { +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -846,7 +1070,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RGB24TOYROW_NEON) +#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYRow(src_rgb24, dst_y, width); #else @@ -855,36 +1079,41 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RGB24TOYROW_NEON) +#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert RAW to I420. 
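// On platforms with no native RGB24 row kernels, the code above stages two
// rows into an aligned ARGB scratch buffer and lets the existing ARGB
// kernels finish. The sizing rounds each row up to a 32-byte multiple so
// full-width vector stores stay in bounds (an illustration):

static int ArgbScratchRowBytes(int width) {
  return (width * 4 + 31) & ~31;  // e.g. width 100: 400 -> 416 bytes
}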
LIBYUV_API -int RAWToI420(const uint8* src_raw, int src_stride_raw, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RAWToI420(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RAWTOYROW_NEON) - void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C; - void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int width) = +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) + void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, + uint8_t* dst_v, int width) = RAWToUVRow_C; + void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYRow_C; #else - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_raw || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_raw || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -906,6 +1135,15 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } } +#elif defined(HAS_RAWTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToUVRow = RAWToUVRow_Any_MSA; + RAWToYRow = RAWToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToYRow = RAWToYRow_MSA; + RAWToUVRow = RAWToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RAW to ARGB. #else #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -936,14 +1174,17 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } #endif +#endif + { +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RAWTOYROW_NEON) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); @@ -960,7 +1201,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RAWTOYROW_NEON) +#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); #else @@ -969,36 +1210,42 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RAWTOYROW_NEON) +#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert RGB565 to I420. 
LIBYUV_API -int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int RGB565ToI420(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_RGB565TOYROW_NEON) - void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C; - void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int width) = +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) + void (*RGB565ToUVRow)(const uint8_t* src_rgb565, int src_stride_rgb565, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB565ToUVRow_C; + void (*RGB565ToYRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width) = RGB565ToYRow_C; #else - void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*RGB565ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_rgb565 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_rgb565 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1020,6 +1267,15 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } } } +#elif defined(HAS_RGB565TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToUVRow = RGB565ToUVRow_Any_MSA; + RGB565ToYRow = RGB565ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToYRow = RGB565ToYRow_MSA; + RGB565ToUVRow = RGB565ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from RGB565 to ARGB. #else #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -1057,15 +1313,16 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, ARGBToYRow = ARGBToYRow_AVX2; } } +#endif #endif { +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) // Allocate 2 rows of ARGB. 
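// A scalar sketch: the RGB565ToARGB fallback path above widens 5/6-bit
// channels by replicating their top bits into the bottom, so 0x1f maps to
// 0xff and 0x00 stays 0x00. Bytes follow libyuv's little-endian ARGB
// memory order, B G R A.

#include <stdint.h>

static void Rgb565ToArgbPixel(uint16_t p, uint8_t argb[4]) {
  const uint8_t b5 = p & 0x1f;
  const uint8_t g6 = (p >> 5) & 0x3f;
  const uint8_t r5 = (p >> 11) & 0x1f;
  argb[0] = (uint8_t)((b5 << 3) | (b5 >> 2));  // B: 5 -> 8 bits
  argb[1] = (uint8_t)((g6 << 2) | (g6 >> 4));  // G: 6 -> 8 bits
  argb[2] = (uint8_t)((r5 << 3) | (r5 >> 2));  // R: 5 -> 8 bits
  argb[3] = 0xff;                              // A: opaque
}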
const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif - for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width); @@ -1082,7 +1339,7 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_RGB565TOYROW_NEON) +#if (defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width); RGB565ToYRow(src_rgb565, dst_y, width); #else @@ -1091,36 +1348,43 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_RGB565TOYROW_NEON) +#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert ARGB1555 to I420. LIBYUV_API -int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB1555ToI420(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; -#if defined(HAS_ARGB1555TOYROW_NEON) - void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; - void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int width) = - ARGB1555ToYRow_C; +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) + void (*ARGB1555ToUVRow)(const uint8_t* src_argb1555, int src_stride_argb1555, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width) = ARGB1555ToYRow_C; #else - void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB1555ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB1555ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb1555 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb1555 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -1142,6 +1406,15 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } } +#elif defined(HAS_ARGB1555TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA; + ARGB1555ToYRow = ARGB1555ToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToYRow = ARGB1555ToYRow_MSA; + ARGB1555ToUVRow = ARGB1555ToUVRow_MSA; + } + } // Other platforms do intermediate conversion from ARGB1555 to ARGB. 
#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -1179,15 +1452,17 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGBToYRow = ARGBToYRow_AVX2; } } +#endif #endif { +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif for (y = 0; y < height - 1; y += 2) { -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, @@ -1206,7 +1481,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, dst_v += dst_stride_v; } if (height & 1) { -#if defined(HAS_ARGB1555TOYROW_NEON) +#if (defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else @@ -1215,36 +1490,43 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGBToYRow(row, dst_y, width); #endif } -#if !defined(HAS_ARGB1555TOYROW_NEON) +#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA)) free_aligned_buffer_64(row); - } #endif + } return 0; } // Convert ARGB4444 to I420. LIBYUV_API -int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGB4444ToI420(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; #if defined(HAS_ARGB4444TOYROW_NEON) - void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; - void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int width) = - ARGB4444ToYRow_C; + void (*ARGB4444ToUVRow)(const uint8_t* src_argb4444, int src_stride_argb4444, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width) = ARGB4444ToYRow_C; #else - void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = - ARGB4444ToARGBRow_C; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGB4444ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; #endif - if (!src_argb4444 || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb4444 || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. 
@@ -1284,6 +1566,14 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; @@ -1304,7 +1594,22 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } + } +#endif +#endif + { +#if !defined(HAS_ARGB4444TOYROW_NEON) // Allocate 2 rows of ARGB. const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); @@ -1341,13 +1646,15 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #if !defined(HAS_ARGB4444TOYROW_NEON) free_aligned_buffer_64(row); - } #endif + } return 0; } -static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv, - uint8* dst_u, int width) { +static void SplitPixels(const uint8_t* src_u, + int src_pixel_stride_uv, + uint8_t* dst_u, + int width) { int i; for (i = 0; i < width; ++i) { *dst_u = *src_u; @@ -1358,21 +1665,26 @@ static void SplitPixels(const uint8* src_u, int src_pixel_stride_uv, // Convert Android420 to I420. LIBYUV_API -int Android420ToI420(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, +int Android420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, int src_pixel_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - const int vu_off = src_v - src_u; + const ptrdiff_t vu_off = src_v - src_u; int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
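// On the vu_off type change above (int -> ptrdiff_t): C and C++ define
// pointer subtraction to yield ptrdiff_t, and on 64-bit builds the U and V
// base pointers of a large frame can legitimately sit more than INT_MAX
// bytes apart, so narrowing to int could truncate. A sketch:

#include <stddef.h>
#include <stdint.h>

static ptrdiff_t ChromaOffset(const uint8_t* src_u, const uint8_t* src_v) {
  return src_v - src_u;  // -1 for NV21, +1 for NV12, plane-sized for I420
}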
@@ -1396,15 +1708,16 @@ int Android420ToI420(const uint8* src_y, int src_stride_y, CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight); CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight); return 0; - // Split UV planes - NV21 - } else if (src_pixel_stride_uv == 2 && vu_off == -1 && - src_stride_u == src_stride_v) { + // Split UV planes - NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { SplitUVPlane(src_v, src_stride_v, dst_v, dst_stride_v, dst_u, dst_stride_u, halfwidth, halfheight); return 0; - // Split UV planes - NV12 - } else if (src_pixel_stride_uv == 2 && vu_off == 1 && - src_stride_u == src_stride_v) { + // Split UV planes - NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { SplitUVPlane(src_u, src_stride_u, dst_u, dst_stride_u, dst_v, dst_stride_v, halfwidth, halfheight); return 0; diff --git a/libs/libvpx/third_party/libyuv/source/convert_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_argb.cc index fb9582d627e..f2fe474f704 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_argb.cc @@ -26,11 +26,13 @@ extern "C" { // Copy ARGB with optional flipping LIBYUV_API -int ARGBCopy(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - if (!src_argb || !dst_argb || - width <= 0 || height == 0) { +int ARGBCopy(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -40,27 +42,29 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } - CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, - width * 4, height); + CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width * 4, + height); return 0; } -// Convert I422 to ARGB with matrix -static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +// Convert I420 to ARGB with matrix +static int I420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
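// Aside: the "negative height means invert the image" blocks that recur in
// every converter in this file all follow one idiom: aim the pointer at the
// last row and negate the stride so the loop walks bottom-up. A standalone
// sketch, with an illustrative name that is not part of the libyuv API:
#include <stdint.h>
static void FlipVertical(uint8_t** plane, int* stride, int* height) {
  if (*height < 0) {
    *height = -*height;
    *plane = *plane + (*height - 1) * *stride;  // point at the last row
    *stride = -*stride;                         // step upwards row by row
  }
}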
@@ -93,13 +97,12 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif @@ -117,111 +120,130 @@ static int I420ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to ARGB. LIBYUV_API -int I420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I420 to ABGR. LIBYUV_API -int I420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J420 to ARGB. LIBYUV_API -int J420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J420 to ABGR. 
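// Aside: every wrapper in this family is the same matrix call with two knobs:
// which YuvConstants to use (I601 = BT.601, JPEG = full-range BT.601, H709 =
// BT.709) and whether U and V are swapped. Swapping U/V while selecting the
// mirrored kYvu* constants is what turns the ARGB writer into an ABGR writer
// with no dedicated kernel. A hedged usage sketch against the public API (an
// 8x2 zeroed test image; the sizes and strides are chosen for the example):
#include <stdint.h>
#include "libyuv/convert_argb.h"
static void ConvertBothOrders(void) {
  uint8_t y[8 * 2] = {0}, u[4] = {0}, v[4] = {0};
  uint8_t argb[8 * 2 * 4], abgr[8 * 2 * 4];
  I420ToARGB(y, 8, u, 4, v, 4, argb, 8 * 4, 8, 2);  // BT.601 -> ARGB
  I420ToABGR(y, 8, u, 4, v, 4, abgr, 8 * 4, 8, 2);  // same planes -> ABGR
}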
LIBYUV_API -int J420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H420 to ARGB. LIBYUV_API -int H420ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H420 to ABGR. LIBYUV_API -int H420ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I420ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I420ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } // Convert I422 to ARGB with matrix -static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +static int I422ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -231,10 +253,8 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; @@ -263,13 +283,12 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif @@ -285,225 +304,380 @@ static int I422ToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to ARGB. LIBYUV_API -int I422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } // Convert I422 to ABGR. LIBYUV_API -int I422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert J422 to ARGB. 
LIBYUV_API -int J422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int J422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); } // Convert J422 to ABGR. LIBYUV_API -int J422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int J422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuJPEGConstants, // Use Yvu matrix width, height); } // Convert H422 to ARGB. LIBYUV_API -int H422ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvH709Constants, - width, height); +int H422ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } // Convert H422 to ABGR. LIBYUV_API -int H422ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I422ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int H422ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I422ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuH709Constants, // Use Yvu matrix width, height); } -// Convert I444 to ARGB with matrix -static int I444ToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, +// Convert 10 bit YUV to ARGB with matrix +// TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to +// multiply 10 bit yuv into high bits to allow any number of bits. 
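// Aside: in the I010/H010 paths that follow, each sample is a uint16_t
// carrying 10 significant bits in its low bits (0..1023). A scalar sketch of
// the 10-bit/8-bit relationship the TODO above alludes to, using the common
// truncate-down and replicate-up conventions (an assumption, not a quote of
// libyuv's row code):
#include <stdint.h>
static uint8_t TenBitToEightBit(uint16_t v10) {
  return (uint8_t)(v10 >> 2);  // keep the top 8 of 10 bits
}
static uint16_t EightBitToTenBit(uint8_t v8) {
  return (uint16_t)((v8 << 2) | (v8 >> 6));  // replicate high bits downward
}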
+static int I010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I444ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I444ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I210ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - dst_argb = dst_argb + (height - 1) * dst_stride_argb; - dst_stride_argb = -dst_stride_argb; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; } - // Coalesce rows. - if (src_stride_y == width && - src_stride_u == width && - src_stride_v == width && - dst_stride_argb == width * 4) { - width *= height; - height = 1; - src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; - } -#if defined(HAS_I444TOARGBROW_SSSE3) +#if defined(HAS_I210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I444ToARGBRow = I444ToARGBRow_Any_SSSE3; + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_SSSE3; + I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif -#if defined(HAS_I444TOARGBROW_AVX2) +#if defined(HAS_I210TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I444ToARGBRow = I444ToARGBRow_Any_AVX2; + I210ToAR30Row = I210ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I444ToARGBRow = I444ToARGBRow_AVX2; + I210ToAR30Row = I210ToAR30Row_AVX2; } } #endif -#if defined(HAS_I444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I444ToARGBRow = I444ToARGBRow_Any_NEON; + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H010 to AR30. +LIBYUV_API +int H010ToAR30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvH709Constants, width, height); +} + +// Convert I010 to AB30. 
+LIBYUV_API +int I010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuI601Constants, width, height); +} + +// Convert H010 to AB30. +LIBYUV_API +int H010ToAB30(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_ab30, dst_stride_ab30, + &kYvuH709Constants, width, height); +} + +// Convert 10 bit YUV to ARGB with matrix +static int I010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I210ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I210ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210ToARGBRow = I210ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I444ToARGBRow = I444ToARGBRow_NEON; + I210ToARGBRow = I210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210ToARGBRow = I210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210ToARGBRow = I210ToARGBRow_AVX2; } } #endif - for (y = 0; y < height; ++y) { - I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; - src_u += src_stride_u; - src_v += src_stride_v; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } } return 0; } -// Convert I444 to ARGB. +// Convert I010 to ARGB. LIBYUV_API -int I444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height); +int I010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); } -// Convert I444 to ABGR. +// Convert I010 to ABGR. 
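// Aside: the row loops above advance the chroma pointers only on odd rows
// (if (y & 1)) because 4:2:0 shares one U/V row between two Y rows. A small
// self-contained sketch of that traversal (illustrative helper, not libyuv):
#include <stddef.h>
static int DistinctChromaRows(int height) {
  int y, count = 0;
  ptrdiff_t chroma_row = 0, last = -1;
  for (y = 0; y < height; ++y) {
    if (chroma_row != last) {  // first luma row touching this chroma row
      ++count;
      last = chroma_row;
    }
    if (y & 1) {
      ++chroma_row;  // same advance rule as the loops above
    }
  }
  return count;  // equals (height + 1) / 2
}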
LIBYUV_API -int I444ToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_abgr, dst_stride_abgr, +int I010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, &kYvuI601Constants, // Use Yvu matrix width, height); } -// Convert J444 to ARGB. +// Convert H010 to ARGB. LIBYUV_API -int J444ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return I444ToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_argb, dst_stride_argb, - &kYuvJPEGConstants, - width, height); +int H010ToARGB(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvH709Constants, width, height); } -// Convert I411 to ARGB. +// Convert H010 to ABGR. LIBYUV_API -int I411ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int H010ToABGR(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I010ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + +// Convert I444 to ARGB with matrix +static int I444ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I411ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I411ToARGBRow_C; - if (!src_y || !src_u || !src_v || - !dst_argb || - width <= 0 || height == 0) { + void (*I444ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I444ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -513,41 +687,47 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
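// Aside: "coalesce rows" collapses a tightly packed image into one very wide
// row so the row kernel is dispatched once instead of `height` times. It is
// only legal when every plane's stride equals its row width in bytes, i.e.
// there is no padding between rows. A generic sketch of the transform:
#include <stdint.h>
static void CoalesceRows(int* width, int* height,
                         int src_stride, int src_bpp,
                         int dst_stride, int dst_bpp) {
  if (src_stride == *width * src_bpp && dst_stride == *width * dst_bpp) {
    *width *= *height;  // one row covering the whole buffer
    *height = 1;        // strides no longer matter (libyuv zeroes them)
  }
}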
- if (src_stride_y == width && - src_stride_u * 4 == width && - src_stride_v * 4 == width && + if (src_stride_y == width && src_stride_u == width && src_stride_v == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0; } -#if defined(HAS_I411TOARGBROW_SSSE3) +#if defined(HAS_I444TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I411ToARGBRow = I411ToARGBRow_Any_SSSE3; + I444ToARGBRow = I444ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_SSSE3; + I444ToARGBRow = I444ToARGBRow_SSSE3; } } #endif -#if defined(HAS_I411TOARGBROW_AVX2) +#if defined(HAS_I444TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I411ToARGBRow = I411ToARGBRow_Any_AVX2; + I444ToARGBRow = I444ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - I411ToARGBRow = I411ToARGBRow_AVX2; + I444ToARGBRow = I444ToARGBRow_AVX2; } } #endif -#if defined(HAS_I411TOARGBROW_NEON) +#if defined(HAS_I444TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I411ToARGBRow = I411ToARGBRow_Any_NEON; + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I444ToARGBRow = I444ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I444ToARGBRow = I444ToARGBRow_Any_MSA; if (IS_ALIGNED(width, 8)) { - I411ToARGBRow = I411ToARGBRow_NEON; + I444ToARGBRow = I444ToARGBRow_MSA; } } #endif for (y = 0; y < height; ++y) { - I411ToARGBRow(src_y, src_u, src_v, dst_argb, &kYuvI601Constants, width); + I444ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; src_u += src_stride_u; @@ -556,26 +736,83 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert I444 to ARGB. +LIBYUV_API +int I444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + +// Convert I444 to ABGR. +LIBYUV_API +int I444ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height); +} + +// Convert J444 to ARGB. +LIBYUV_API +int J444ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I444ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + &kYuvJPEGConstants, width, height); +} + // Convert I420 with Alpha to preattenuated ARGB. 
-static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, +static int I420AlphaToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, const struct YuvConstants* yuvconstants, - int width, int height, int attenuate) { + int width, + int height, + int attenuate) { int y; - void (*I422AlphaToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, + void (*I422AlphaToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) = I422AlphaToARGBRow_C; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; - if (!src_y || !src_u || !src_v || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -608,13 +845,12 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422ALPHATOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422AlphaToARGBRow = I422AlphaToARGBRow_DSPR2; +#if defined(HAS_I422ALPHATOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422AlphaToARGBRow = I422AlphaToARGBRow_MSA; + } } #endif #if defined(HAS_ARGBATTENUATEROW_SSSE3) @@ -641,6 +877,14 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, @@ -661,49 +905,59 @@ static int I420AlphaToARGBMatrix(const uint8* src_y, int src_stride_y, // Convert I420 with Alpha to ARGB. 
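// Aside: when `attenuate` is nonzero, the matrix path above runs an
// ARGBAttenuateRow pass that premultiplies color by alpha. A scalar sketch of
// the idea; the exact rounding of libyuv's C fallback may differ:
#include <stdint.h>
static void AttenuatePixel(uint8_t* bgra /* B,G,R,A byte order in memory */) {
  uint32_t a = bgra[3];
  bgra[0] = (uint8_t)((bgra[0] * a) / 255);  // premultiplied blue
  bgra[1] = (uint8_t)((bgra[1] * a) / 255);  // premultiplied green
  bgra[2] = (uint8_t)((bgra[2] * a) / 255);  // premultiplied red
}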
LIBYUV_API -int I420AlphaToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_a, src_stride_a, - dst_argb, dst_stride_argb, - &kYuvI601Constants, - width, height, attenuate); +int I420AlphaToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_a, src_stride_a, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height, attenuate); } // Convert I420 with Alpha to ABGR. LIBYUV_API -int I420AlphaToABGR(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - const uint8* src_a, int src_stride_a, - uint8* dst_abgr, int dst_stride_abgr, - int width, int height, int attenuate) { - return I420AlphaToARGBMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - src_a, src_stride_a, - dst_abgr, dst_stride_abgr, - &kYvuI601Constants, // Use Yvu matrix - width, height, attenuate); +int I420AlphaToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + const uint8_t* src_a, + int src_stride_a, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height, + int attenuate) { + return I420AlphaToARGBMatrix( + src_y, src_stride_y, src_v, src_stride_v, // Swap U and V + src_u, src_stride_u, src_a, src_stride_a, dst_abgr, dst_stride_abgr, + &kYvuI601Constants, // Use Yvu matrix + width, height, attenuate); } // Convert I400 to ARGB. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*I400ToARGBRow)(const uint8* y_buf, - uint8* rgb_buf, - int width) = I400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + I400ToARGBRow_C; + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -713,8 +967,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -743,6 +996,14 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I400ToARGBRow = I400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I400ToARGBRow = I400ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); @@ -754,14 +1015,16 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, // Convert J400 to ARGB. 
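// Aside: the TestCpuFlag blocks repeated throughout this file follow one
// dispatch idiom: start from the portable _C row function, move to the _Any_
// SIMD variant (which handles ragged tail widths), and use the full-width
// SIMD kernel only when the width meets its alignment requirement. A
// condensed sketch with illustrative names (not real libyuv symbols):
#include <stdint.h>
typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);
static RowFn PickRowFn(int have_simd, int width,
                       RowFn c_fn, RowFn any_fn, RowFn aligned_fn) {
  RowFn fn = c_fn;            // always-correct scalar fallback
  if (have_simd) {
    fn = any_fn;              // SIMD body plus scalar handling of the tail
    if ((width & 15) == 0) {  // e.g. IS_ALIGNED(width, 16)
      fn = aligned_fn;        // pure SIMD, no tail handling needed
    }
  }
  return fn;
}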
LIBYUV_API -int J400ToARGB(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int J400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int width) = + void (*J400ToARGBRow)(const uint8_t* src_y, uint8_t* dst_argb, int width) = J400ToARGBRow_C; - if (!src_y || !dst_argb || - width <= 0 || height == 0) { + if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -771,8 +1034,7 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -800,6 +1062,14 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, J400ToARGBRow = J400ToARGBRow_NEON; } } +#endif +#if defined(HAS_J400TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + J400ToARGBRow = J400ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { J400ToARGBRow(src_y, dst_argb, width); @@ -810,85 +1080,89 @@ int J400ToARGB(const uint8* src_y, int src_stride_y, } // Shuffle table for converting BGRA to ARGB. -static uvec8 kShuffleMaskBGRAToARGB = { - 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u -}; +static const uvec8 kShuffleMaskBGRAToARGB = { + 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u}; // Shuffle table for converting ABGR to ARGB. -static uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u}; // Shuffle table for converting RGBA to ARGB. -static uvec8 kShuffleMaskRGBAToARGB = { - 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u -}; +static const uvec8 kShuffleMaskRGBAToARGB = { + 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; // Convert BGRA to ARGB. LIBYUV_API -int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int BGRAToARGB(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). LIBYUV_API -int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_bgra, src_stride_bgra, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskBGRAToARGB), - width, height); +int ARGBToBGRA(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); } // Convert ABGR to ARGB. 
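// Aside: the shuffle tables above encode, for each output byte in a 16-byte
// group, which input byte to copy; ARGBShuffle applies them with PSHUFB/NEON
// TBL-style kernels. The BGRA and ABGR masks are their own inverses, which is
// why ARGBToBGRA and ARGBToABGR reuse the same tables. A scalar reference for
// one 16-byte (4 pixel) group:
#include <stdint.h>
static void ShuffleGroup16(const uint8_t src[16], const uint8_t mask[16],
                           uint8_t dst[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = src[mask[i]];  // mask[i] selects the source byte for lane i
  }
}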
LIBYUV_API -int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ABGRToARGB(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). LIBYUV_API -int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_abgr, src_stride_abgr, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskABGRToARGB), - width, height); +int ARGBToABGR(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); } // Convert RGBA to ARGB. LIBYUV_API -int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - return ARGBShuffle(src_rgba, src_stride_rgba, - dst_argb, dst_stride_argb, - (const uint8*)(&kShuffleMaskRGBAToARGB), - width, height); +int RGBAToARGB(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, + (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); } // Convert RGB24 to ARGB. LIBYUV_API -int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB24ToARGB(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RGB24ToARGBRow_C; - if (!src_rgb24 || !dst_argb || - width <= 0 || height == 0) { + if (!src_rgb24 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -898,8 +1172,7 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_stride_rgb24 = -src_stride_rgb24; } // Coalesce rows. - if (src_stride_rgb24 == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_rgb24 == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb24 = dst_stride_argb = 0; @@ -920,6 +1193,14 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, } } #endif +#if defined(HAS_RGB24TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); @@ -931,14 +1212,16 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, // Convert RAW to ARGB. 
LIBYUV_API -int RAWToARGB(const uint8* src_raw, int src_stride_raw, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RAWToARGB(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int width) = + void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = RAWToARGBRow_C; - if (!src_raw || !dst_argb || - width <= 0 || height == 0) { + if (!src_raw || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -948,8 +1231,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. - if (src_stride_raw == width * 3 && - dst_stride_argb == width * 4) { + if (src_stride_raw == width * 3 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_raw = dst_stride_argb = 0; @@ -970,6 +1252,14 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToARGBRow = RAWToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToARGBRow = RAWToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToARGBRow(src_raw, dst_argb, width); @@ -981,14 +1271,16 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, // Convert RGB565 to ARGB. LIBYUV_API -int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int RGB565ToARGB(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int width) = - RGB565ToARGBRow_C; - if (!src_rgb565 || !dst_argb || - width <= 0 || height == 0) { + void (*RGB565ToARGBRow)(const uint8_t* src_rgb565, uint8_t* dst_argb, + int width) = RGB565ToARGBRow_C; + if (!src_rgb565 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -998,8 +1290,7 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_stride_rgb565 = -src_stride_rgb565; } // Coalesce rows. - if (src_stride_rgb565 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_rgb565 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_rgb565 = dst_stride_argb = 0; @@ -1028,6 +1319,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = RGB565ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RGB565ToARGBRow(src_rgb565, dst_argb, width); @@ -1039,14 +1338,16 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, // Convert ARGB1555 to ARGB. 
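// Aside: ARGB1555 packs 1 bit of alpha and 5 bits per color channel into 16
// bits. Widening 5 bits to 8 is typically done by bit replication,
// (v << 3) | (v >> 2), so 0 maps to 0 and 31 maps to 255. A scalar sketch of
// one pixel, assuming the little-endian layout with blue in the low bits:
#include <stdint.h>
static void Argb1555ToArgb8888(uint16_t p, uint8_t out_bgra[4]) {
  uint8_t b = (uint8_t)(p & 0x1f);
  uint8_t g = (uint8_t)((p >> 5) & 0x1f);
  uint8_t r = (uint8_t)((p >> 10) & 0x1f);
  out_bgra[0] = (uint8_t)((b << 3) | (b >> 2));
  out_bgra[1] = (uint8_t)((g << 3) | (g >> 2));
  out_bgra[2] = (uint8_t)((r << 3) | (r >> 2));
  out_bgra[3] = (uint8_t)((p >> 15) ? 255 : 0);  // 1-bit alpha
}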
LIBYUV_API -int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB1555ToARGB(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, - int width) = ARGB1555ToARGBRow_C; - if (!src_argb1555 || !dst_argb || - width <= 0 || height == 0) { + void (*ARGB1555ToARGBRow)(const uint8_t* src_argb1555, uint8_t* dst_argb, + int width) = ARGB1555ToARGBRow_C; + if (!src_argb1555 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1056,8 +1357,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_stride_argb1555 = -src_stride_argb1555; } // Coalesce rows. - if (src_stride_argb1555 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb1555 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb1555 = dst_stride_argb = 0; @@ -1086,6 +1386,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB1555ToARGBRow(src_argb1555, dst_argb, width); @@ -1097,14 +1405,16 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, // Convert ARGB4444 to ARGB. LIBYUV_API -int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGB4444ToARGB(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, - int width) = ARGB4444ToARGBRow_C; - if (!src_argb4444 || !dst_argb || - width <= 0 || height == 0) { + void (*ARGB4444ToARGBRow)(const uint8_t* src_argb4444, uint8_t* dst_argb, + int width) = ARGB4444ToARGBRow_C; + if (!src_argb4444 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1114,8 +1424,7 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_stride_argb4444 = -src_stride_argb4444; } // Coalesce rows. - if (src_stride_argb4444 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_argb4444 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb4444 = dst_stride_argb = 0; @@ -1144,6 +1453,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGB4444ToARGBRow(src_argb4444, dst_argb, width); @@ -1153,20 +1470,117 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, return 0; } -// Convert NV12 to ARGB. +// Convert AR30 to ARGB. 
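// Aside: AR30 here is a 32-bit 2:10:10:10 format. Assuming the little-endian
// layout the C row functions below imply (blue in the ten low bits, two alpha
// bits on top), one pixel unpacks to 8-bit ARGB roughly like this:
#include <stdint.h>
static void Ar30ToArgbPixel(uint32_t ar30, uint8_t out_bgra[4]) {
  out_bgra[0] = (uint8_t)((ar30 >> 2) & 0xff);   // top 8 of 10 blue bits
  out_bgra[1] = (uint8_t)((ar30 >> 12) & 0xff);  // top 8 of 10 green bits
  out_bgra[2] = (uint8_t)((ar30 >> 22) & 0xff);  // top 8 of 10 red bits
  out_bgra[3] = (uint8_t)((ar30 >> 30) * 0x55);  // replicate 2 alpha bits to 8
}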
LIBYUV_API -int NV12ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int AR30ToARGB(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + if (!src_ar30 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_argb = 0; + } + for (y = 0; y < height; ++y) { + AR30ToARGBRow_C(src_ar30, dst_argb, width); + src_ar30 += src_stride_ar30; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AR30 to ABGR. +LIBYUV_API +int AR30ToABGR(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + int y; + if (!src_ar30 || !dst_abgr || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_abgr == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_abgr = 0; + } + for (y = 0; y < height; ++y) { + AR30ToABGRRow_C(src_ar30, dst_abgr, width); + src_ar30 += src_stride_ar30; + dst_abgr += dst_stride_abgr; + } + return 0; +} + +// Convert AR30 to AB30. +LIBYUV_API +int AR30ToAB30(const uint8_t* src_ar30, + int src_stride_ar30, + uint8_t* dst_ab30, + int dst_stride_ab30, + int width, + int height) { + int y; + if (!src_ar30 || !dst_ab30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar30 = src_ar30 + (height - 1) * src_stride_ar30; + src_stride_ar30 = -src_stride_ar30; + } + // Coalesce rows. + if (src_stride_ar30 == width * 4 && dst_stride_ab30 == width * 4) { + width *= height; + height = 1; + src_stride_ar30 = dst_stride_ab30 = 0; + } + for (y = 0; y < height; ++y) { + AR30ToAB30Row_C(src_ar30, dst_ab30, width); + src_ar30 += src_stride_ar30; + dst_ab30 += dst_stride_ab30; + } + return 0; +} + +// Convert NV12 to ARGB with matrix +static int NV12ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -1199,9 +1613,17 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); + NV12ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1211,20 +1633,21 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, return 0; } -// Convert NV21 to ARGB. -LIBYUV_API -int NV21ToARGB(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +// Convert NV21 to ARGB with matrix +static int NV21ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*NV21ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV21ToARGBRow_C; - if (!src_y || !src_uv || !dst_argb || - width <= 0 || height == 0) { + void (*NV21ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToARGBRow_C; + if (!src_y || !src_vu || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1257,11 +1680,136 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV21TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV21ToARGBRow = NV21ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { - NV21ToARGBRow(src_y, src_uv, dst_argb, &kYuvI601Constants, width); + NV21ToARGBRow(src_y, src_vu, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert NV12 to ARGB. +LIBYUV_API +int NV12ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV21 to ARGB. +LIBYUV_API +int NV21ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, height); +} + +// Convert NV12 to ABGR. +// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. +// To swap the UV use NV12 instead of NV21. +LIBYUV_API +int NV12ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_uv, src_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// Convert NV21 to ABGR.
+LIBYUV_API +int NV21ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_vu, src_stride_vu, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, height); +} + +// TODO(fbarchard): Consider SSSE3 2 step conversion. +// Convert NV12 to RGB24 with matrix +static int NV12ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV12ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB24Row_C; + if (!src_y || !src_uv || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV12TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB24Row = NV12ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV12ToRGB24Row = NV12ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV12TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV12ToRGB24Row = NV12ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV12ToRGB24Row = NV12ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV12ToRGB24Row(src_y, src_uv, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; if (y & 1) { src_uv += src_stride_uv; } @@ -1269,19 +1817,109 @@ int NV21ToARGB(const uint8* src_y, int src_stride_y, return 0; } +// Convert NV21 to RGB24 with matrix +static int NV21ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*NV21ToRGB24Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV21ToRGB24Row_C; + if (!src_y || !src_vu || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24; + dst_stride_rgb24 = -dst_stride_rgb24; + } +#if defined(HAS_NV21TORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + NV21ToRGB24Row = NV21ToRGB24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToRGB24Row = NV21ToRGB24Row_SSSE3; + } + } +#endif +#if defined(HAS_NV21TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToRGB24Row = NV21ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToRGB24Row = NV21ToRGB24Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + NV21ToRGB24Row(src_y, src_vu, dst_rgb24, yuvconstants, width); + dst_rgb24 += dst_stride_rgb24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// TODO(fbarchard): NV12ToRAW can be implemented by mirrored matrix. +// Convert NV12 to RGB24. +LIBYUV_API +int NV12ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV12ToRGB24Matrix(src_y, src_stride_y, src_uv, src_stride_uv, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + +// Convert NV21 to RGB24. +LIBYUV_API +int NV21ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return NV21ToRGB24Matrix(src_y, src_stride_y, src_vu, src_stride_vu, + dst_rgb24, dst_stride_rgb24, &kYuvI601Constants, + width, height); +} + // Convert M420 to ARGB. LIBYUV_API -int M420ToARGB(const uint8* src_m420, int src_stride_m420, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int M420ToARGB(const uint8_t* src_m420, + int src_stride_m420, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToARGBRow_C; - if (!src_m420 || !dst_argb || - width <= 0 || height == 0) { + void (*NV12ToARGBRow)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToARGBRow_C; + if (!src_m420 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1314,6 +1952,14 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, } } #endif +#if defined(HAS_NV12TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToARGBRow = NV12ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, @@ -1332,17 +1978,17 @@ int M420ToARGB(const uint8* src_m420, int src_stride_m420, // Convert YUY2 to ARGB. 
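+// YUY2 is packed 4:2:2: every 4 bytes hold Y0 U Y1 V for two pixels, so a
+// single row carries both luma and chroma.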
LIBYUV_API -int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int YUY2ToARGB(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*YUY2ToARGBRow)(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*YUY2ToARGBRow)(const uint8_t* src_yuy2, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = YUY2ToARGBRow_C; - if (!src_yuy2 || !dst_argb || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1352,8 +1998,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_yuy2 == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_yuy2 = dst_stride_argb = 0; @@ -1381,6 +2026,14 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToARGBRow = YUY2ToARGBRow_NEON; } } +#endif +#if defined(HAS_YUY2TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToARGBRow = YUY2ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + YUY2ToARGBRow = YUY2ToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { YUY2ToARGBRow(src_yuy2, dst_argb, &kYuvI601Constants, width); @@ -1392,17 +2045,17 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to ARGB. LIBYUV_API -int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int UYVYToARGB(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*UYVYToARGBRow)(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) = + void (*UYVYToARGBRow)(const uint8_t* src_uyvy, uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, int width) = UYVYToARGBRow_C; - if (!src_uyvy || !dst_argb || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1412,8 +2065,7 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. - if (src_stride_uyvy == width * 2 && - dst_stride_argb == width * 4) { + if (src_stride_uyvy == width * 2 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_uyvy = dst_stride_argb = 0; @@ -1441,6 +2093,14 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, UYVYToARGBRow = UYVYToARGBRow_NEON; } } +#endif +#if defined(HAS_UYVYTOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToARGBRow = UYVYToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + UYVYToARGBRow = UYVYToARGBRow_MSA; + } + } #endif for (y = 0; y < height; ++y) { UYVYToARGBRow(src_uyvy, dst_argb, &kYuvI601Constants, width); @@ -1449,6 +2109,121 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, } return 0; } +static void WeavePixels(const uint8_t* src_u, + const uint8_t* src_v, + int src_pixel_stride_uv, + uint8_t* dst_uv, + int width) { + int i; + for (i = 0; i < width; ++i) { + dst_uv[0] = *src_u; + dst_uv[1] = *src_v; + dst_uv += 2; + src_u += src_pixel_stride_uv; + src_v += src_pixel_stride_uv; + } +} + +// Convert Android420 to ARGB. 
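+// Android YUV_420_888 exposes a chroma pixel stride: 1 means planar chroma
+// (I420), 2 means interleaved chroma (NV12 or NV21, depending on whether U
+// or V comes first); anything else takes the generic weave fallback below.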
+LIBYUV_API +int Android420ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + uint8_t* dst_uv; + const ptrdiff_t vu_off = src_v - src_u; + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + + // I420 + if (src_pixel_stride_uv == 1) { + return I420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_argb, dst_stride_argb, + yuvconstants, width, height); + // NV21 + } + if (src_pixel_stride_uv == 2 && vu_off == -1 && + src_stride_u == src_stride_v) { + return NV21ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, dst_argb, + dst_stride_argb, yuvconstants, width, height); + // NV12 + } + if (src_pixel_stride_uv == 2 && vu_off == 1 && src_stride_u == src_stride_v) { + return NV12ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, dst_argb, + dst_stride_argb, yuvconstants, width, height); + } + + // General case fallback creates NV12 + align_buffer_64(plane_uv, halfwidth * 2 * halfheight); + dst_uv = plane_uv; + for (y = 0; y < halfheight; ++y) { + WeavePixels(src_u, src_v, src_pixel_stride_uv, dst_uv, halfwidth); + src_u += src_stride_u; + src_v += src_stride_v; + dst_uv += halfwidth * 2; + } + NV12ToARGBMatrix(src_y, src_stride_y, plane_uv, halfwidth * 2, dst_argb, + dst_stride_argb, yuvconstants, width, height); + free_aligned_buffer_64(plane_uv); + return 0; +} + +// Convert Android420 to ARGB. +LIBYUV_API +int Android420ToARGB(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_pixel_stride_uv, dst_argb, + dst_stride_argb, &kYuvI601Constants, width, + height); +} + +// Convert Android420 to ABGR. 
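+// Passing V where U is expected and using the YVU matrix yields ABGR output
+// from the same ARGB conversion path.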
+LIBYUV_API +int Android420ToABGR(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_pixel_stride_uv, + uint8_t* dst_abgr, + int dst_stride_abgr, + int width, + int height) { + return Android420ToARGBMatrix(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, src_pixel_stride_uv, dst_abgr, + dst_stride_abgr, &kYvuI601Constants, width, + height); +} #ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/third_party/libyuv/source/convert_from.cc b/libs/libvpx/third_party/libyuv/source/convert_from.cc index 3b2dca8163a..6fa253237ee 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_from.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_from.cc @@ -15,9 +15,9 @@ #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() #include "libyuv/video_common.h" -#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -30,109 +30,144 @@ static __inline int Abs(int v) { } // I420 To any I4xx YUV format with mirroring. -static int I420ToI4xx(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int src_y_width, int src_y_height, - int dst_uv_width, int dst_uv_height) { +static int I420ToI4xx(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_y_width, + int src_y_height, + int dst_uv_width, + int dst_uv_height) { const int dst_y_width = Abs(src_y_width); const int dst_y_height = Abs(src_y_height); const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1); const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1); - if (src_y_width == 0 || src_y_height == 0 || - dst_uv_width <= 0 || dst_uv_height <= 0) { + if (src_y_width == 0 || src_y_height == 0 || dst_uv_width <= 0 || + dst_uv_height <= 0) { return -1; } if (dst_y) { - ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, - dst_y, dst_stride_y, dst_y_width, dst_y_height, - kFilterBilinear); - } - ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, - dst_u, dst_stride_u, dst_uv_width, dst_uv_height, - kFilterBilinear); - ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, - dst_v, dst_stride_v, dst_uv_width, dst_uv_height, - kFilterBilinear); + ScalePlane(src_y, src_stride_y, src_y_width, src_y_height, dst_y, + dst_stride_y, dst_y_width, dst_y_height, kFilterBilinear); + } + ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height, dst_u, + dst_stride_u, dst_uv_width, dst_uv_height, kFilterBilinear); + ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height, dst_v, + dst_stride_v, dst_uv_width, dst_uv_height, kFilterBilinear); + return 0; +} + +// Convert 8 bit YUV to 10 bit. 
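+// The scale of 1024 requests a two-bit widening (x4), mapping the 8-bit
+// range 0..255 onto 0..1020 of the 10-bit range.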
+LIBYUV_API +int I420ToI010(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (halfheight - 1) * src_stride_u; + src_v = src_v + (halfheight - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + Convert8To16Plane(src_y, src_stride_y, dst_y, dst_stride_y, 1024, width, + height); + // Convert UV planes. + Convert8To16Plane(src_u, src_stride_u, dst_u, dst_stride_u, 1024, halfwidth, + halfheight); + Convert8To16Plane(src_v, src_stride_v, dst_v, dst_stride_v, 1024, halfwidth, + halfheight); return 0; } // 420 chroma is 1/2 width, 1/2 height // 422 chroma is 1/2 width, 1x height LIBYUV_API -int I420ToI422(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI422(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = (Abs(width) + 1) >> 1; const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // 420 chroma is 1/2 width, 1/2 height // 444 chroma is 1x width, 1x height LIBYUV_API -int I420ToI444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420ToI444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { const int dst_uv_width = Abs(width); const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); -} - -// 420 chroma is 1/2 width, 1/2 height -// 411 chroma is 1/4 width, 1x height -LIBYUV_API -int I420ToI411(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, 
- int width, int height) { - const int dst_uv_width = (Abs(width) + 3) >> 2; - const int dst_uv_height = Abs(height); - return I420ToI4xx(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height, - dst_uv_width, dst_uv_height); + return I420ToI4xx(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, width, height, dst_uv_width, + dst_uv_height); } // Copy to I400. Source can be I420,422,444,400,NV12,NV21 LIBYUV_API -int I400Copy(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Copy(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -146,17 +181,21 @@ int I400Copy(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I422ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -166,10 +205,8 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. 
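+// When every stride equals the row width, the planes are contiguous in
+// memory and the whole image can be processed as one long row.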
- if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_yuy2 == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0; @@ -182,6 +219,14 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -202,17 +247,21 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToYUY2(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int I420ToYUY2(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = I422ToYUY2Row_C; - if (!src_y || !src_u || !src_v || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -229,6 +278,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -237,6 +294,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width); @@ -254,17 +319,21 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I422ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I422ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -274,10 +343,8 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. - if (src_stride_y == width && - src_stride_u * 2 == width && - src_stride_v * 2 == width && - dst_stride_uyvy == width * 2) { + if (src_stride_y == width && src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0; @@ -290,6 +357,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -298,6 +373,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -310,17 +393,21 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, } LIBYUV_API -int I420ToUYVY(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int I420ToUYVY(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = I422ToUYVYRow_C; - if (!src_y || !src_u || !src_v || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -337,6 +424,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -345,6 +440,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width); @@ -363,14 +466,20 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, // TODO(fbarchard): test negative height for invert. 
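+// I420ToNV12 copies the Y plane and interleaves the half-resolution U and V
+// planes into a single UV plane.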
LIBYUV_API -int I420ToNV12(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { - if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || - width <= 0 || height == 0) { +int I420ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 || + height == 0) { return -1; } int halfwidth = (width + 1) / 2; @@ -378,44 +487,47 @@ int I420ToNV12(const uint8* src_y, int src_stride_y, if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } - MergeUVPlane(src_u, src_stride_u, - src_v, src_stride_v, - dst_uv, dst_stride_uv, + MergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; } LIBYUV_API -int I420ToNV21(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_vu, int dst_stride_vu, - int width, int height) { - return I420ToNV12(src_y, src_stride_y, - src_v, src_stride_v, - src_u, src_stride_u, - dst_y, dst_stride_y, - dst_vu, dst_stride_vu, +int I420ToNV21(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + return I420ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u, + src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu, width, height); } // Convert I422 to RGBA with matrix -static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I420ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -448,13 +560,12 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } } #endif @@ -472,50 +583,58 @@ static int I420ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGBA. LIBYUV_API -int I420ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I420ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I420 to BGRA. LIBYUV_API -int I420ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I420ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I420ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I420ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert I420 to RGB24 with matrix -static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, +static int I420ToRGB24Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGB24Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB24Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb24 || - width <= 0 || height == 0) { + void (*I422ToRGB24Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB24Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -548,6 +667,14 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB24Row = I422ToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, yuvconstants, width); @@ -563,50 +690,95 @@ static int I420ToRGB24Matrix(const uint8* src_y, int src_stride_y, // Convert I420 to RGB24. LIBYUV_API -int I420ToRGB24(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgb24, dst_stride_rgb24, - &kYuvI601Constants, - width, height); +int I420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvI601Constants, width, height); } // Convert I420 to RAW. LIBYUV_API -int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { - return I420ToRGB24Matrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_raw, dst_stride_raw, +int I420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, &kYvuI601Constants, // Use Yvu matrix width, height); } +// Convert H420 to RGB24. +LIBYUV_API +int H420ToRGB24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgb24, dst_stride_rgb24, + &kYuvH709Constants, width, height); +} + +// Convert H420 to RAW. +LIBYUV_API +int H420ToRAW(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { + return I420ToRGB24Matrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_raw, dst_stride_raw, + &kYvuH709Constants, // Use Yvu matrix + width, height); +} + // Convert I420 to ARGB1555. 
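+// ARGB1555 packs each pixel into 16 bits: 1 alpha bit plus 5 bits per color
+// channel.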
LIBYUV_API -int I420ToARGB1555(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int I420ToARGB1555(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*I422ToARGB1555Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB1555Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB1555Row_C; - if (!src_y || !src_u || !src_v || !dst_argb1555 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb1555 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -639,6 +811,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB1555Row = I422ToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB1555Row = I422ToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, &kYuvI601Constants, @@ -653,23 +833,25 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, return 0; } - // Convert I420 to ARGB4444. LIBYUV_API -int I420ToARGB4444(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int I420ToARGB4444(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*I422ToARGB4444Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, + void (*I422ToARGB4444Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) = I422ToARGB4444Row_C; - if (!src_y || !src_u || !src_v || !dst_argb4444 || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_argb4444 || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -702,6 +884,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGB4444Row = I422ToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGB4444Row = I422ToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, &kYuvI601Constants, @@ -718,20 +908,22 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, // Convert I420 to RGB565. 
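+// RGB565 packs each pixel into 16 bits: 5 bits red, 6 bits green, 5 bits
+// blue, with no alpha.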
LIBYUV_API -int I420ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int I420ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*I422ToRGB565Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGB565Row_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -764,6 +956,14 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); @@ -777,32 +977,102 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, return 0; } +// Convert I422 to RGB565. +LIBYUV_API +int I422ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { + int y; + void (*I422ToRGB565Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGB565Row_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } +#if defined(HAS_I422TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB565Row = I422ToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB565Row = I422ToRGB565Row_AVX2; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToRGB565Row = I422ToRGB565Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_NEON; + } + } +#endif +#if defined(HAS_I422TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGB565Row = I422ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGB565Row = I422ToRGB565Row_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, &kYuvI601Constants, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. 
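+// The 4x4 table below is applied per pixel: each entry is added to the
+// 8-bit channel values before they are truncated to 5 or 6 bits.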
-static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert I420 to RGB565 with dithering. LIBYUV_API -int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int I420ToRGB565Dither(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToARGBRow_C; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; - if (!src_y || !src_u || !src_v || !dst_rgb565 || - width <= 0 || height == 0) { + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -838,12 +1108,12 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) @@ -869,6 +1139,14 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; } } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } #endif { // Allocate a row of argb. 
@@ -876,7 +1154,8 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, for (y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row_argb, &kYuvI601Constants, width); ARGBToRGB565DitherRow(row_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -889,220 +1168,254 @@ int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, return 0; } +// Convert I420 to AR30 with matrix +static int I420ToAR30Matrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I422ToAR30Row)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToAR30Row_C; + + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } + +#if defined(HAS_I422TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToAR30Row = I422ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I422TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToAR30Row = I422ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToAR30Row = I422ToAR30Row_AVX2; + } + } +#endif + + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I420 to AR30. +LIBYUV_API +int I420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + +// Convert H420 to AR30. 
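+// H420 is I420 with the same 4:2:0 sampling but BT.709 (HD) color
+// coefficients instead of BT.601.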
+LIBYUV_API +int H420ToAR30(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I420ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYvuH709Constants, width, height); +} + // Convert I420 to specified format LIBYUV_API -int ConvertFromI420(const uint8* y, int y_stride, - const uint8* u, int u_stride, - const uint8* v, int v_stride, - uint8* dst_sample, int dst_sample_stride, - int width, int height, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); +int ConvertFromI420(const uint8_t* y, + int y_stride, + const uint8_t* u, + int u_stride, + const uint8_t* v, + int v_stride, + uint8_t* dst_sample, + int dst_sample_stride, + int width, + int height, + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int r = 0; - if (!y || !u|| !v || !dst_sample || - width <= 0 || height == 0) { + if (!y || !u || !v || !dst_sample || width <= 0 || height == 0) { return -1; } switch (format) { // Single plane formats case FOURCC_YUY2: - r = I420ToYUY2(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToYUY2(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_UYVY: - r = I420ToUYVY(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToUYVY(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBP: - r = I420ToRGB565(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 2, - width, height); + r = I420ToRGB565(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 2, width, + height); break; case FOURCC_RGBO: - r = I420ToARGB1555(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB1555(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_R444: - r = I420ToARGB4444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, + r = I420ToARGB4444(y, y_stride, u, u_stride, v, v_stride, dst_sample, dst_sample_stride ? dst_sample_stride : width * 2, width, height); break; case FOURCC_24BG: - r = I420ToRGB24(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRGB24(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_RAW: - r = I420ToRAW(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 3, - width, height); + r = I420ToRAW(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 3, width, + height); break; case FOURCC_ARGB: - r = I420ToARGB(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToARGB(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? 
dst_sample_stride : width * 4, width, + height); break; case FOURCC_BGRA: - r = I420ToBGRA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToBGRA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_ABGR: - r = I420ToABGR(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToABGR(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_RGBA: - r = I420ToRGBA(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width * 4, - width, height); + r = I420ToRGBA(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); + break; + case FOURCC_AR30: + r = I420ToAR30(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width * 4, width, + height); break; case FOURCC_I400: - r = I400Copy(y, y_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - width, height); + r = I400Copy(y, y_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; case FOURCC_NV12: { - uint8* dst_uv = dst_sample + width * height; - r = I420ToNV12(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_uv, - dst_sample_stride ? dst_sample_stride : width, - width, height); + uint8_t* dst_uv = dst_sample + width * height; + r = I420ToNV12(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_uv, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } case FOURCC_NV21: { - uint8* dst_vu = dst_sample + width * height; - r = I420ToNV21(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, - dst_sample_stride ? dst_sample_stride : width, - dst_vu, - dst_sample_stride ? dst_sample_stride : width, - width, height); + uint8_t* dst_vu = dst_sample + width * height; + r = I420ToNV21(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride ? dst_sample_stride : width, dst_vu, + dst_sample_stride ? dst_sample_stride : width, width, + height); break; } // TODO(fbarchard): Add M420. // Triplanar formats - // TODO(fbarchard): halfstride instead of halfwidth case FOURCC_I420: case FOURCC_YV12: { - int halfwidth = (width + 1) / 2; + dst_sample_stride = dst_sample_stride ? 
dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; int halfheight = (height + 1) / 2; - uint8* dst_u; - uint8* dst_v; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV12) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * halfheight; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * halfheight; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * halfheight; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * halfheight; } - r = I420Copy(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420Copy(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I422: case FOURCC_YV16: { - int halfwidth = (width + 1) / 2; - uint8* dst_u; - uint8* dst_v; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + int halfstride = (dst_sample_stride + 1) / 2; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV16) { - dst_v = dst_sample + width * height; - dst_u = dst_v + halfwidth * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + halfstride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + halfwidth * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + halfstride * height; } - r = I420ToI422(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, halfwidth, - dst_v, halfwidth, + r = I420ToI422(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, halfstride, dst_v, halfstride, width, height); break; } case FOURCC_I444: case FOURCC_YV24: { - uint8* dst_u; - uint8* dst_v; + dst_sample_stride = dst_sample_stride ? dst_sample_stride : width; + uint8_t* dst_u; + uint8_t* dst_v; if (format == FOURCC_YV24) { - dst_v = dst_sample + width * height; - dst_u = dst_v + width * height; + dst_v = dst_sample + dst_sample_stride * height; + dst_u = dst_v + dst_sample_stride * height; } else { - dst_u = dst_sample + width * height; - dst_v = dst_u + width * height; + dst_u = dst_sample + dst_sample_stride * height; + dst_v = dst_u + dst_sample_stride * height; } - r = I420ToI444(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, width, - dst_v, width, - width, height); + r = I420ToI444(y, y_stride, u, u_stride, v, v_stride, dst_sample, + dst_sample_stride, dst_u, dst_sample_stride, dst_v, + dst_sample_stride, width, height); break; } - case FOURCC_I411: { - int quarterwidth = (width + 3) / 4; - uint8* dst_u = dst_sample + width * height; - uint8* dst_v = dst_u + quarterwidth * height; - r = I420ToI411(y, y_stride, - u, u_stride, - v, v_stride, - dst_sample, width, - dst_u, quarterwidth, - dst_v, quarterwidth, - width, height); - break; - } - // Formats not supported - MJPG, biplanar, some rgb formats. default: return -1; // unknown fourcc - return failure code. 
diff --git a/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc index 2a8682b7eb4..c8d91252e9b 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_from_argb.cc @@ -22,16 +22,21 @@ extern "C" { // ARGB little endian (bgra in memory) to I444 LIBYUV_API -int ARGBToI444(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV444Row_C; + void (*ARGBToUV444Row)(const uint8_t* src_argb, uint8_t* dst_u, + uint8_t* dst_v, int width) = ARGBToUV444Row_C; if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } @@ -41,20 +46,18 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u == width && - dst_stride_v == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; } #if defined(HAS_ARGBTOUV444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToUV444Row = ARGBToUV444Row_SSSE3; - } + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_SSSE3; + } } #endif #if defined(HAS_ARGBTOUV444ROW_NEON) @@ -65,6 +68,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOUV444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUV444Row = ARGBToUV444Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToUV444Row = ARGBToUV444Row_MSA; + } + } +#endif #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -89,6 +100,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUV444Row(src_argb, dst_u, dst_v, width); @@ -103,19 +122,23 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, // ARGB little endian (bgra in memory) to I422 LIBYUV_API -int ARGBToI422(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToI422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void 
(*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - if (!src_argb || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -125,10 +148,8 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -170,81 +191,25 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } #endif - for (y = 0; y < height; ++y) { - ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); - ARGBToYRow(src_argb, dst_y, width); - src_argb += src_stride_argb; - dst_y += dst_stride_y; - dst_u += dst_stride_u; - dst_v += dst_stride_v; - } - return 0; -} - -// ARGB little endian (bgra in memory) to I411 -LIBYUV_API -int ARGBToI411(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - int y; - void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) = ARGBToUV411Row_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = - ARGBToYRow_C; - if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { - return -1; - } - if (height < 0) { - height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; - src_stride_argb = -src_stride_argb; - } - // Coalesce rows. 
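// [Editor's note, not part of the patch] The "coalesce rows" idiom that
// recurs throughout this file: when every stride exactly equals its row's
// byte count, the planes are contiguous in memory, so the whole image can be
// handed to the row kernel as a single run of width * height pixels. That
// pays the per-row call and SIMD tail-handling overhead once instead of
// height times. Sketch of the transform, assuming tightly packed buffers:
//
//   width *= height;              // one giant "row"
//   height = 1;
//   src_stride = dst_stride = 0;  // the single loop pass never re-advances
//
// For ARGBToI422 here the check also requires dst_stride_u * 2 == width,
// since the U and V planes are half width.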
- if (src_stride_argb == width * 4 && - dst_stride_y == width && - dst_stride_u * 4 == width && - dst_stride_v * 4 == width) { - width *= height; - height = 1; - src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0; - } -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; + ARGBToYRow = ARGBToYRow_MSA; } } #endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToYRow = ARGBToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; - } - } -#endif -#if defined(HAS_ARGBTOUV411ROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUV411Row = ARGBToUV411Row_Any_NEON; +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; if (IS_ALIGNED(width, 32)) { - ARGBToUV411Row = ARGBToUV411Row_NEON; + ARGBToUVRow = ARGBToUVRow_MSA; } } #endif for (y = 0; y < height; ++y) { - ARGBToUV411Row(src_argb, dst_u, dst_v, width); + ARGBToUVRow(src_argb, 0, dst_u, dst_v, width); ARGBToYRow(src_argb, dst_y, width); src_argb += src_stride_argb; dst_y += dst_stride_y; @@ -255,21 +220,24 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, } LIBYUV_API -int ARGBToNV12(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV12(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
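[Editor's note, not part of the patch] Most of the additions in this file, including the MSA hunks that follow, repeat one dispatch idiom: start from the portable C row function, switch to the "Any"-width SIMD variant when TestCpuFlag reports the feature, and switch again to the full-throughput kernel only when the width meets its alignment requirement. A standalone sketch of that idiom; the row functions and the 16-pixel requirement are illustrative, not libyuv symbols:

#include <stdint.h>
#include <string.h>

typedef void (*RowFn)(const uint8_t* src, uint8_t* dst, int width);

// Illustrative kernels: a portable fallback, an any-width SIMD version
// (vector body plus scalar tail), and an aligned-only fast path.
static void RowC(const uint8_t* s, uint8_t* d, int w)       { memcpy(d, s, (size_t)w); }
static void RowAnySimd(const uint8_t* s, uint8_t* d, int w) { memcpy(d, s, (size_t)w); }
static void RowSimd(const uint8_t* s, uint8_t* d, int w)    { memcpy(d, s, (size_t)w); }

static int CpuHasFeature(void) { return 0; }  // stand-in for TestCpuFlag()

static RowFn PickRowFn(int width) {
  RowFn fn = RowC;             // always-correct default
  if (CpuHasFeature()) {
    fn = RowAnySimd;           // handles any width
    if (width % 16 == 0) {     // e.g. IS_ALIGNED(width, 16)
      fn = RowSimd;            // fastest, width-restricted kernel
    }
  }
  return fn;
}

The alignment constant differs per kernel (8, 16, or 32 in the hunks here), which is why each #if block repeats the pattern with its own IS_ALIGNED check.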
@@ -314,6 +282,22 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -337,11 +321,19 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, MergeUVRow_ = MergeUVRow_NEON; } } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } #endif { // Allocate a rows of uv. align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -364,21 +356,24 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, // Same as NV12 but U and V swapped. LIBYUV_API -int ARGBToNV21(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int ARGBToNV21(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; - if (!src_argb || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_argb || !dst_y || !dst_vu || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -423,6 +418,22 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_MERGEUVROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { MergeUVRow_ = MergeUVRow_Any_SSE2; @@ -446,24 +457,32 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, MergeUVRow_ = MergeUVRow_NEON; } } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } #endif { // Allocate a rows of uv. 
align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); - uint8* row_v = row_u + ((halfwidth + 31) & ~31); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ARGBToYRow(src_argb, dst_y, width); ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; - dst_uv += dst_stride_uv; + dst_vu += dst_stride_vu; } if (height & 1) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); - MergeUVRow_(row_v, row_u, dst_uv, halfwidth); + MergeUVRow_(row_v, row_u, dst_vu, halfwidth); ARGBToYRow(src_argb, dst_y, width); } free_aligned_buffer_64(row_u); @@ -473,19 +492,23 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, // Convert ARGB to YUY2. LIBYUV_API -int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, - uint8* dst_yuy2, int dst_stride_yuy2, - int width, int height) { +int ARGBToYUY2(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yuy2, + int dst_stride_yuy2, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C; + void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_yuy2, int width) = + I422ToYUY2Row_C; - if (!src_argb || !dst_yuy2 || - width <= 0 || height == 0) { + if (!src_argb || !dst_yuy2 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -495,8 +518,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_stride_yuy2 = -dst_stride_yuy2; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_yuy2 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_yuy2 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_yuy2 = 0; @@ -537,6 +559,22 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToYUY2Row = I422ToYUY2Row_Any_SSE2; @@ -545,6 +583,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToYUY2Row = I422ToYUY2Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_AVX2; + } + } +#endif #if defined(HAS_I422TOYUY2ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToYUY2Row = I422ToYUY2Row_Any_NEON; @@ -553,12 +599,20 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOYUY2ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToYUY2Row = I422ToYUY2Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToYUY2Row = I422ToYUY2Row_MSA; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -575,19 +629,23 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, // Convert ARGB to UYVY. LIBYUV_API -int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, - uint8* dst_uyvy, int dst_stride_uyvy, - int width, int height) { +int ARGBToUYVY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_uyvy, + int dst_stride_uyvy, + int width, + int height) { int y; - void (*ARGBToUVRow)(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToUVRow)(const uint8_t* src_argb, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVRow_C; + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; - void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u, - const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C; + void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, + const uint8_t* src_v, uint8_t* dst_uyvy, int width) = + I422ToUYVYRow_C; - if (!src_argb || !dst_uyvy || - width <= 0 || height == 0) { + if (!src_argb || !dst_uyvy || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -597,8 +655,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_stride_uyvy = -dst_stride_uyvy; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_uyvy == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_uyvy == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_uyvy = 0; @@ -639,6 +696,22 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVRow = ARGBToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_MSA; + } + } +#endif #if defined(HAS_I422TOUYVYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { I422ToUYVYRow = I422ToUYVYRow_Any_SSE2; @@ -647,6 +720,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToUYVYRow = I422ToUYVYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_AVX2; + } + } +#endif #if defined(HAS_I422TOUYVYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToUYVYRow = I422ToUYVYRow_Any_NEON; @@ -655,12 +736,20 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_I422TOUYVYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToUYVYRow = I422ToUYVYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + I422ToUYVYRow = I422ToUYVYRow_MSA; + } + } +#endif { // Allocate a rows of yuv. align_buffer_64(row_y, ((width + 63) & ~63) * 2); - uint8* row_u = row_y + ((width + 63) & ~63); - uint8* row_v = row_u + ((width + 63) & ~63) / 2; + uint8_t* row_u = row_y + ((width + 63) & ~63); + uint8_t* row_v = row_u + ((width + 63) & ~63) / 2; for (y = 0; y < height; ++y) { ARGBToUVRow(src_argb, 0, row_u, row_v, width); @@ -677,11 +766,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, // Convert ARGB to I400. LIBYUV_API -int ARGBToI400(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int ARGBToI400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int width) = + void (*ARGBToYRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = ARGBToYRow_C; if (!src_argb || !dst_y || width <= 0 || height == 0) { return -1; @@ -692,8 +784,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_y == width) { + if (src_stride_argb == width * 4 && dst_stride_y == width) { width *= height; height = 1; src_stride_argb = dst_stride_y = 0; @@ -722,6 +813,14 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYRow = ARGBToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYRow = ARGBToYRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYRow(src_argb, dst_y, width); @@ -732,28 +831,31 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, } // Shuffle table for converting ARGB to RGBA. -static uvec8 kShuffleMaskARGBToRGBA = { - 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u -}; +static const uvec8 kShuffleMaskARGBToRGBA = { + 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u}; // Convert ARGB to RGBA. 
LIBYUV_API -int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return ARGBShuffle(src_argb, src_stride_argb, - dst_rgba, dst_stride_rgba, - (const uint8*)(&kShuffleMaskARGBToRGBA), - width, height); +int ARGBToRGBA(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return ARGBShuffle(src_argb, src_stride_argb, dst_rgba, dst_stride_rgba, + (const uint8_t*)(&kShuffleMaskARGBToRGBA), width, height); } // Convert ARGB To RGB24. LIBYUV_API -int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int ARGBToRGB24(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; - void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRGB24Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRGB24Row_C; if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) { return -1; @@ -764,8 +866,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_rgb24 == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_rgb24 = 0; @@ -778,6 +879,22 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) + if (TestCpuFlag(kCpuHasAVX512VBMI)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX512VBMI; + if (IS_ALIGNED(width, 32)) { + ARGBToRGB24Row = ARGBToRGB24Row_AVX512VBMI; + } + } +#endif #if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; @@ -786,6 +903,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); @@ -797,11 +922,14 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RAW. LIBYUV_API -int ARGBToRAW(const uint8* src_argb, int src_stride_argb, - uint8* dst_raw, int dst_stride_raw, - int width, int height) { +int ARGBToRAW(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_raw, + int dst_stride_raw, + int width, + int height) { int y; - void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int width) = + void (*ARGBToRAWRow)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = ARGBToRAWRow_C; if (!src_argb || !dst_raw || width <= 0 || height == 0) { return -1; @@ -812,8 +940,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_raw == width * 3) { + if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { width *= height; height = 1; src_stride_argb = dst_stride_raw = 0; @@ -826,6 +953,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRAWRow = ARGBToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToRAWRow = ARGBToRAWRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRAWRow = ARGBToRAWRow_Any_NEON; @@ -834,6 +969,14 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRAWRow = ARGBToRAWRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); @@ -844,21 +987,23 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } // Ordered 8x8 dither for 888 to 565. Values from 0 to 7. -static const uint8 kDither565_4x4[16] = { - 0, 4, 1, 5, - 6, 2, 7, 3, - 1, 5, 0, 4, - 7, 3, 6, 2, +static const uint8_t kDither565_4x4[16] = { + 0, 4, 1, 5, 6, 2, 7, 3, 1, 5, 0, 4, 7, 3, 6, 2, }; // Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). LIBYUV_API -int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither4x4, int width, int height) { +int ARGBToRGB565Dither(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + const uint8_t* dither4x4, + int width, + int height) { int y; - void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) = ARGBToRGB565DitherRow_C; + void (*ARGBToRGB565DitherRow)(const uint8_t* src_argb, uint8_t* dst_rgb, + const uint32_t dither4, int width) = + ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -894,9 +1039,19 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_MSA; + } + } +#endif + for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + *(const uint32_t*)(dither4x4 + ((y & 3) << 2)), + width); src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } @@ -906,12 +1061,15 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, // Convert ARGB To RGB565. // TODO(fbarchard): Consider using dither function low level with zeros. LIBYUV_API -int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int ARGBToRGB565(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToRGB565Row_C; + void (*ARGBToRGB565Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToRGB565Row_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -921,8 +1079,7 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_rgb565 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_rgb565 = 0; @@ -951,6 +1108,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565Row(src_argb, dst_rgb565, width); @@ -962,12 +1127,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB1555. LIBYUV_API -int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb1555, int dst_stride_argb1555, - int width, int height) { +int ARGBToARGB1555(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb1555, + int dst_stride_argb1555, + int width, + int height) { int y; - void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB1555Row_C; + void (*ARGBToARGB1555Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB1555Row_C; if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) { return -1; } @@ -977,8 +1145,7 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb1555 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb1555 = 0; @@ -1007,6 +1174,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB1555Row(src_argb, dst_argb1555, width); @@ -1018,12 +1193,15 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, // Convert ARGB To ARGB4444. LIBYUV_API -int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb4444, int dst_stride_argb4444, - int width, int height) { +int ARGBToARGB4444(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb4444, + int dst_stride_argb4444, + int width, + int height) { int y; - void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToARGB4444Row_C; + void (*ARGBToARGB4444Row)(const uint8_t* src_argb, uint8_t* dst_rgb, + int width) = ARGBToARGB4444Row_C; if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) { return -1; } @@ -1033,8 +1211,7 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_argb4444 == width * 2) { + if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { width *= height; height = 1; src_stride_argb = dst_stride_argb4444 = 0; @@ -1063,6 +1240,14 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB4444ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToARGB4444Row(src_argb, dst_argb4444, width); @@ -1072,21 +1257,123 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_rgb, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce rows. + if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + +// Convert ARGB To AR30. +LIBYUV_API +int ARGBToAR30(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ARGBToAR30Row)(const uint8_t* src_argb, uint8_t* dst_rgb, int width) = + ARGBToAR30Row_C; + if (!src_argb || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar30 = 0; + } +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR30Row = ARGBToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR30Row = ARGBToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ARGBToAR30Row(src_argb, dst_ar30, width); + src_argb += src_stride_argb; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + // Convert ARGB to J420. (JPeg full range I420). 
LIBYUV_API -int ARGBToJ420(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ420(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1129,6 +1416,22 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height - 1; y += 2) { ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width); @@ -1148,19 +1451,23 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J422. (JPeg full range I422). LIBYUV_API -int ARGBToJ422(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int ARGBToJ422(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; - if (!src_argb || - !dst_yj || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_argb || !dst_yj || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1170,10 +1477,8 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_yj == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = dst_stride_u = dst_stride_v = 0; @@ -1212,6 +1517,22 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif +#if defined(HAS_ARGBTOUVJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToUVJRow = ARGBToUVJRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width); @@ -1226,11 +1547,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, // Convert ARGB to J400. LIBYUV_API -int ARGBToJ400(const uint8* src_argb, int src_stride_argb, - uint8* dst_yj, int dst_stride_yj, - int width, int height) { +int ARGBToJ400(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = ARGBToYJRow_C; if (!src_argb || !dst_yj || width <= 0 || height == 0) { return -1; @@ -1241,8 +1565,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_yj == width) { + if (src_stride_argb == width * 4 && dst_stride_yj == width) { width *= height; height = 1; src_stride_argb = dst_stride_yj = 0; @@ -1271,6 +1594,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToYJRow(src_argb, dst_yj, width); diff --git a/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc b/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc index 90f550a26ad..ae3cc18cd24 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_jpeg.cc @@ -22,28 +22,24 @@ extern "C" { #ifdef HAVE_JPEG struct I420Buffers { - uint8* y; + uint8_t* y; int y_stride; - uint8* u; + uint8_t* u; int u_stride; - uint8* v; + uint8_t* v; int v_stride; int w; int h; }; static void JpegCopyI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I420Copy(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I420Copy(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -51,17 +47,13 @@ static void JpegCopyI420(void* opaque, } static void JpegI422ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = 
(I420Buffers*)(opaque); - I422ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I422ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -69,35 +61,13 @@ static void JpegI422ToI420(void* opaque, } static void JpegI444ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I444ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); - dest->y += rows * dest->y_stride; - dest->u += ((rows + 1) >> 1) * dest->u_stride; - dest->v += ((rows + 1) >> 1) * dest->v_stride; - dest->h -= rows; -} - -static void JpegI411ToI420(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - I420Buffers* dest = (I420Buffers*)(opaque); - I411ToI420(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I444ToI420(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->y, dest->y_stride, dest->u, dest->u_stride, dest->v, + dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -105,15 +75,12 @@ static void JpegI411ToI420(void* opaque, } static void JpegI400ToI420(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { I420Buffers* dest = (I420Buffers*)(opaque); - I400ToI420(data[0], strides[0], - dest->y, dest->y_stride, - dest->u, dest->u_stride, - dest->v, dest->v_stride, - dest->w, rows); + I400ToI420(data[0], strides[0], dest->y, dest->y_stride, dest->u, + dest->u_stride, dest->v, dest->v_stride, dest->w, rows); dest->y += rows * dest->y_stride; dest->u += ((rows + 1) >> 1) * dest->u_stride; dest->v += ((rows + 1) >> 1) * dest->v_stride; @@ -122,8 +89,10 @@ static void JpegI400ToI420(void* opaque, // Query size of MJPG in pixels. LIBYUV_API -int MJPGSize(const uint8* sample, size_t sample_size, - int* width, int* height) { +int MJPGSize(const uint8_t* sample, + size_t sample_size, + int* width, + int* height) { MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { @@ -135,15 +104,21 @@ int MJPGSize(const uint8* sample, size_t sample_size, } // MJPG (Motion JPeg) to I420 -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. 
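// [Editor's sketch, not part of the patch] MJPGToI420 below picks its decode
// callback from the per-component sampling factors libjpeg reports: 2x2 luma
// over 1x1 chroma is 4:2:0, 2x1 is 4:2:2, 1x1 everywhere is 4:4:4, and a
// single 1x1 component is grayscale. The 4x1 (I411 / 4:1:1) branch is what
// this patch deletes. A hypothetical classifier capturing those rules:
//
//   // comps = component count; h0/v0 = luma factors; h1/v1, h2/v2 = chroma
//   if (comps == 1 && h0 == 1 && v0 == 1)            return YUV400;
//   if (comps == 3 && h1 == 1 && v1 == 1 && h2 == 1 && v2 == 1) {
//     if (h0 == 2 && v0 == 2)                        return YUV420;
//     if (h0 == 2 && v0 == 1)                        return YUV422;
//     if (h0 == 1 && v0 == 1)                        return YUV444;
//   }
//   return Unsupported;  // including the removed 4:1:1 case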
LIBYUV_API -int MJPGToI420(const uint8* sample, +int MJPGToI420(const uint8_t* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int w, int h, - int dw, int dh) { + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int src_width, + int src_height, + int dst_width, + int dst_height) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -152,17 +127,17 @@ int MJPGToI420(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh }; + I420Buffers bufs = {dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, dst_width, dst_height}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -170,8 +145,9 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh); - // YUV422 + ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dst_width, + dst_height); + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -181,8 +157,9 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh); - // YUV444 + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dst_width, + dst_height); + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -192,28 +169,19 @@ int MJPGToI420(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh); - // YUV400 + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dst_width, + dst_height); + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, 
&bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dst_width, + dst_height); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. // ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; @@ -224,88 +192,67 @@ int MJPGToI420(const uint8* sample, #ifdef HAVE_JPEG struct ARGBBuffers { - uint8* argb; + uint8_t* argb; int argb_stride; int w; int h; }; static void JpegI420ToARGB(void* opaque, - const uint8* const* data, - const int* strides, - int rows) { - ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I420ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); - dest->argb += rows * dest->argb_stride; - dest->h -= rows; -} - -static void JpegI422ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I422ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I420ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } -static void JpegI444ToARGB(void* opaque, - const uint8* const* data, +static void JpegI422ToARGB(void* opaque, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I444ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I422ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } -static void JpegI411ToARGB(void* opaque, - const uint8* const* data, +static void JpegI444ToARGB(void* opaque, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I411ToARGB(data[0], strides[0], - data[1], strides[1], - data[2], strides[2], - dest->argb, dest->argb_stride, - dest->w, rows); + I444ToARGB(data[0], strides[0], data[1], strides[1], data[2], strides[2], + dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } static void JpegI400ToARGB(void* opaque, - const uint8* const* data, + const uint8_t* const* data, const int* strides, int rows) { ARGBBuffers* dest = (ARGBBuffers*)(opaque); - I400ToARGB(data[0], strides[0], - dest->argb, dest->argb_stride, - dest->w, rows); + I400ToARGB(data[0], strides[0], dest->argb, dest->argb_stride, dest->w, rows); dest->argb += rows * dest->argb_stride; dest->h -= rows; } // MJPG (Motion JPeg) to ARGB -// TODO(fbarchard): review w and h requirement. dw and dh may be enough. +// TODO(fbarchard): review src_width and src_height requirement. dst_width and +// dst_height may be enough. LIBYUV_API -int MJPGToARGB(const uint8* sample, +int MJPGToARGB(const uint8_t* sample, size_t sample_size, - uint8* argb, int argb_stride, - int w, int h, - int dw, int dh) { + uint8_t* dst_argb, + int dst_stride_argb, + int src_width, + int src_height, + int dst_width, + int dst_height) { if (sample_size == kUnknownDataSize) { // ERROR: MJPEG frame size unknown return -1; @@ -314,17 +261,16 @@ int MJPGToARGB(const uint8* sample, // TODO(fbarchard): Port MJpeg to C. 
MJpegDecoder mjpeg_decoder; LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size); - if (ret && (mjpeg_decoder.GetWidth() != w || - mjpeg_decoder.GetHeight() != h)) { + if (ret && (mjpeg_decoder.GetWidth() != src_width || + mjpeg_decoder.GetHeight() != src_height)) { // ERROR: MJPEG frame has unexpected dimensions mjpeg_decoder.UnloadFrame(); return 1; // runtime failure } if (ret) { - ARGBBuffers bufs = { argb, argb_stride, dw, dh }; + ARGBBuffers bufs = {dst_argb, dst_stride_argb, dst_width, dst_height}; // YUV420 - if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && + if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && mjpeg_decoder.GetVertSampFactor(0) == 2 && mjpeg_decoder.GetHorizSampFactor(0) == 2 && @@ -332,8 +278,9 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh); - // YUV422 + ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dst_width, + dst_height); + // YUV422 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -343,8 +290,9 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh); - // YUV444 + ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dst_width, + dst_height); + // YUV444 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceYCbCr && mjpeg_decoder.GetNumComponents() == 3 && @@ -354,28 +302,19 @@ int MJPGToARGB(const uint8* sample, mjpeg_decoder.GetHorizSampFactor(1) == 1 && mjpeg_decoder.GetVertSampFactor(2) == 1 && mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh); - // YUV411 - } else if (mjpeg_decoder.GetColorSpace() == - MJpegDecoder::kColorSpaceYCbCr && - mjpeg_decoder.GetNumComponents() == 3 && - mjpeg_decoder.GetVertSampFactor(0) == 1 && - mjpeg_decoder.GetHorizSampFactor(0) == 4 && - mjpeg_decoder.GetVertSampFactor(1) == 1 && - mjpeg_decoder.GetHorizSampFactor(1) == 1 && - mjpeg_decoder.GetVertSampFactor(2) == 1 && - mjpeg_decoder.GetHorizSampFactor(2) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh); - // YUV400 + ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dst_width, + dst_height); + // YUV400 } else if (mjpeg_decoder.GetColorSpace() == MJpegDecoder::kColorSpaceGrayscale && mjpeg_decoder.GetNumComponents() == 1 && mjpeg_decoder.GetVertSampFactor(0) == 1 && mjpeg_decoder.GetHorizSampFactor(0) == 1) { - ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh); + ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dst_width, + dst_height); } else { // TODO(fbarchard): Implement conversion for any other colorspace/sample - // factors that occur in practice. 411 is supported by libjpeg + // factors that occur in practice. 
// ERROR: Unable to convert MJPEG frame because format is not supported mjpeg_decoder.UnloadFrame(); return 1; diff --git a/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc b/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc index aecdc80fde2..67484522c0f 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_to_argb.cc @@ -28,36 +28,50 @@ extern "C" { // src_height is used to compute location of planes, and indicate inversion // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. + +// TODO(fbarchard): Add the following: +// H010ToARGB +// H420ToARGB +// H422ToARGB +// I010ToARGB +// J400ToARGB +// J422ToARGB +// J444ToARGB + LIBYUV_API -int ConvertToARGB(const uint8* sample, size_t sample_size, - uint8* crop_argb, int argb_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, +int ConvertToARGB(const uint8_t* sample, + size_t sample_size, + uint8_t* dst_argb, + int dst_stride_argb, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; int abs_src_height = (src_height < 0) ? -src_height : src_height; int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; // One pass rotation is available for some formats. For the rest, convert - // to I420 (with optional vertical flipping) into a temporary I420 buffer, - // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination crop_argb is same as source sample, + // to ARGB (with optional vertical flipping) into a temporary ARGB buffer, + // and then rotate the ARGB to the final destination buffer. + // For in-place conversion, if destination dst_argb is same as source sample, // also enable temporary buffer. - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) || - crop_argb == sample; - uint8* dest_argb = crop_argb; - int dest_argb_stride = argb_stride; - uint8* rotate_buffer = NULL; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_ARGB) || dst_argb == sample; + uint8_t* dest_argb = dst_argb; + int dest_dst_stride_argb = dst_stride_argb; + uint8_t* rotate_buffer = NULL; int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; - if (crop_argb == NULL || sample == NULL || - src_width <= 0 || crop_width <= 0 || + if (dst_argb == NULL || sample == NULL || src_width <= 0 || crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } @@ -67,187 +81,174 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { int argb_size = crop_width * 4 * abs_crop_height; - rotate_buffer = (uint8*)malloc(argb_size); + rotate_buffer = (uint8_t*)malloc(argb_size); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
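// [Editor's note, not part of the patch] From here ConvertToARGB redirects
// dst_argb at the malloc'd scratch buffer, converts into it (any vertical
// flip is folded into the conversion via a negative height), and only at the
// end rotates scratch into the caller's real destination. In outline:
//
//   dst_argb = rotate_buffer;            // convert into scratch
//   dst_stride_argb = crop_width * 4;
//   ...switch (format) conversion...
//   ARGBRotate(rotate_buffer, crop_width * 4, dest_argb,
//              dest_dst_stride_argb, crop_width, abs_crop_height, rotation);
//   free(rotate_buffer);
//
// The new "else if (rotation)" tail added at the bottom of this file covers
// the FOURCC_ARGB case, which can rotate straight from the source instead.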
} - crop_argb = rotate_buffer; - argb_stride = crop_width * 4; + dst_argb = rotate_buffer; + dst_stride_argb = crop_width * 4; } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = YUY2ToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToARGB(src, aligned_src_width * 2, - crop_argb, argb_stride, + r = UYVYToARGB(src, aligned_src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGB24ToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToARGB(src, src_width * 3, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RAWToARGB(src, src_width * 3, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_ARGB: - src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + if (!need_buf && !rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBToARGB(src, src_width * 4, dst_argb, dst_stride_argb, + crop_width, inv_crop_height); + } break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = BGRAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = ABGRToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToARGB(src, src_width * 4, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = RGBAToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AR30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AR30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); + break; + case FOURCC_AB30: + src = sample + (src_width * crop_y + crop_x) * 4; + r = AB30ToARGB(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = RGB565ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = ARGB1555ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToARGB(src, src_width * 2, - crop_argb, argb_stride, + r = ARGB4444ToARGB(src, src_width * 2, dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToARGB(src, src_width, - crop_argb, 
argb_stride, - crop_width, inv_crop_height); + r = I400ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; - r = NV12ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; + r = NV12ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x; + src_uv = sample + aligned_src_width * (abs_src_height + crop_y / 2) + crop_x; // Call NV12 but with u and v parameters swapped. - r = NV21ToARGB(src, src_width, - src_uv, aligned_src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = NV21ToARGB(src, src_width, src_uv, aligned_src_width, dst_argb, + dst_stride_argb, crop_width, inv_crop_height); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToARGB(src, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = M420ToARGB(src, src_width, dst_argb, dst_stride_argb, crop_width, + inv_crop_height); break; + // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_J420: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; - r = J420ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + r = J420ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y 
= sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToARGB(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I422ToARGB(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -255,32 +256,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToARGB(src_y, src_width, - src_u, src_width, - src_v, src_width, - crop_argb, argb_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToARGB(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - crop_argb, argb_stride, - crop_width, inv_crop_height); + r = I444ToARGB(src_y, src_width, src_u, src_width, src_v, src_width, + dst_argb, dst_stride_argb, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToARGB(sample, sample_size, - crop_argb, argb_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToARGB(sample, sample_size, dst_argb, dst_stride_argb, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -289,11 +272,14 @@ int ConvertToARGB(const uint8* sample, size_t sample_size, if (need_buf) { if (!r) { - r = ARGBRotate(crop_argb, argb_stride, - dest_argb, dest_argb_stride, + r = ARGBRotate(dst_argb, dst_stride_argb, dest_argb, dest_dst_stride_argb, crop_width, abs_crop_height, rotation); } free(rotate_buffer); + } else if (rotation) { + src = sample + (src_width * crop_y + crop_x) * 4; + r = ARGBRotate(src, src_width * 4, dst_argb, dst_stride_argb, crop_width, + inv_crop_height, rotation); } return r; diff --git a/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc b/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc index 
e5f307c4464..df08309f9ba 100644 --- a/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc +++ b/libs/libvpx/third_party/libyuv/source/convert_to_i420.cc @@ -25,251 +25,216 @@ extern "C" { // sample_size is measured in bytes and is the size of the frame. // With MJPEG it is the compressed size of the frame. LIBYUV_API -int ConvertToI420(const uint8* sample, +int ConvertToI420(const uint8_t* sample, size_t sample_size, - uint8* y, int y_stride, - uint8* u, int u_stride, - uint8* v, int v_stride, - int crop_x, int crop_y, - int src_width, int src_height, - int crop_width, int crop_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int crop_x, + int crop_y, + int src_width, + int src_height, + int crop_width, + int crop_height, enum RotationMode rotation, - uint32 fourcc) { - uint32 format = CanonicalFourCC(fourcc); + uint32_t fourcc) { + uint32_t format = CanonicalFourCC(fourcc); int aligned_src_width = (src_width + 1) & ~1; - const uint8* src; - const uint8* src_uv; + const uint8_t* src; + const uint8_t* src_uv; const int abs_src_height = (src_height < 0) ? -src_height : src_height; // TODO(nisse): Why allow crop_height < 0? const int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height; int r = 0; - LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 && - format != FOURCC_NV12 && format != FOURCC_NV21 && - format != FOURCC_YV12) || y == sample; - uint8* tmp_y = y; - uint8* tmp_u = u; - uint8* tmp_v = v; - int tmp_y_stride = y_stride; - int tmp_u_stride = u_stride; - int tmp_v_stride = v_stride; - uint8* rotate_buffer = NULL; + LIBYUV_BOOL need_buf = + (rotation && format != FOURCC_I420 && format != FOURCC_NV12 && + format != FOURCC_NV21 && format != FOURCC_YV12) || + dst_y == sample; + uint8_t* tmp_y = dst_y; + uint8_t* tmp_u = dst_u; + uint8_t* tmp_v = dst_v; + int tmp_y_stride = dst_stride_y; + int tmp_u_stride = dst_stride_u; + int tmp_v_stride = dst_stride_v; + uint8_t* rotate_buffer = NULL; const int inv_crop_height = (src_height < 0) ? -abs_crop_height : abs_crop_height; - if (!y || !u || !v || !sample || - src_width <= 0 || crop_width <= 0 || - src_height == 0 || crop_height == 0) { + if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || + crop_width <= 0 || src_height == 0 || crop_height == 0) { return -1; } // One pass rotation is available for some formats. For the rest, convert // to I420 (with optional vertical flipping) into a temporary I420 buffer, // and then rotate the I420 to the final destination buffer. - // For in-place conversion, if destination y is same as source sample, + // For in-place conversion, if destination dst_y is same as source sample, // also enable temporary buffer. if (need_buf) { int y_size = crop_width * abs_crop_height; int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2); - rotate_buffer = (uint8*)malloc(y_size + uv_size * 2); + rotate_buffer = (uint8_t*)malloc(y_size + uv_size * 2); /* NOLINT */ if (!rotate_buffer) { return 1; // Out of memory runtime error. 
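Note on the ConvertToI420 hunk above: the parameters move to the dst_ naming convention and, when the requested rotation has no one-pass path (anything other than I420/NV12/NV21/YV12) or the conversion is in place, the function converts into a malloc'd temporary and then I420Rotate()s into the real destination. A minimal caller sketch, assuming the vendored libyuv headers are on the include path; the argument order is taken from the signature above, and a 90-degree rotation means the destination planes are sized post-rotation:

#include <cstdint>
#include <vector>
#include "libyuv/convert.h"       // ConvertToI420
#include "libyuv/rotate.h"        // kRotate90
#include "libyuv/video_common.h"  // FOURCC_YUY2

int main() {
  const int w = 640, h = 480;
  std::vector<uint8_t> yuy2(static_cast<size_t>(w) * h * 2);  // packed 4:2:2 input
  // Destination is rotated 90 degrees, so the I420 planes are h wide, w tall.
  std::vector<uint8_t> y(static_cast<size_t>(h) * w);
  std::vector<uint8_t> u(((h + 1) / 2) * ((w + 1) / 2));
  std::vector<uint8_t> v(u.size());
  // YUY2 has no one-pass rotate, so this exercises the temporary-buffer path.
  return libyuv::ConvertToI420(yuy2.data(), yuy2.size(),
                               y.data(), h,              // stride of rotated Y
                               u.data(), (h + 1) / 2,
                               v.data(), (h + 1) / 2,
                               0, 0,                     // crop_x, crop_y
                               w, h,                     // source size
                               w, h,                     // crop size, pre-rotation
                               libyuv::kRotate90, libyuv::FOURCC_YUY2);
}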
} - y = rotate_buffer; - u = y + y_size; - v = u + uv_size; - y_stride = crop_width; - u_stride = v_stride = ((crop_width + 1) / 2); + dst_y = rotate_buffer; + dst_u = dst_y + y_size; + dst_v = dst_u + uv_size; + dst_stride_y = crop_width; + dst_stride_u = dst_stride_v = ((crop_width + 1) / 2); } switch (format) { // Single plane formats case FOURCC_YUY2: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_UYVY: src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; - r = RGB565ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBO: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB1555ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB1555ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_R444: src = sample + (src_width * crop_y + crop_x) * 2; - r = ARGB4444ToI420(src, src_width * 2, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGB4444ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_24BG: src = sample + (src_width * crop_y + crop_x) * 3; - r = RGB24ToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGB24ToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RAW: src = sample + (src_width * crop_y + crop_x) * 3; - r = RAWToI420(src, src_width * 3, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RAWToI420(src, src_width * 3, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ARGB: src = sample + (src_width * crop_y + crop_x) * 4; - r = ARGBToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ARGBToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_BGRA: src = sample + (src_width * crop_y + crop_x) * 4; - r = BGRAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = BGRAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_ABGR: src = sample + (src_width * crop_y + crop_x) * 4; - r = ABGRToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = ABGRToI420(src, src_width * 4, dst_y, dst_stride_y, 
dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; case FOURCC_RGBA: src = sample + (src_width * crop_y + crop_x) * 4; - r = RGBAToI420(src, src_width * 4, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = RGBAToI420(src, src_width * 4, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, crop_width, + inv_crop_height); break; + // TODO(fbarchard): Add AR30 and AB30 case FOURCC_I400: src = sample + src_width * crop_y + crop_x; - r = I400ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I400ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Biplanar formats case FOURCC_NV12: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; case FOURCC_NV21: src = sample + (src_width * crop_y + crop_x); - src_uv = sample + (src_width * src_height) + - ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); - // Call NV12 but with u and v parameters swapped. - r = NV12ToI420Rotate(src, src_width, - src_uv, aligned_src_width, - y, y_stride, - v, v_stride, - u, u_stride, - crop_width, inv_crop_height, rotation); + src_uv = sample + (src_width * abs_src_height) + + ((crop_y / 2) * aligned_src_width) + ((crop_x / 2) * 2); + // Call NV12 but with dst_u and dst_v parameters swapped. 
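The NV21 case above gets no dedicated kernel: it calls NV12ToI420Rotate with the destination U and V swapped, because NV21 is NV12 with the chroma bytes interleaved V-first. The same trick works from the caller side; a sketch (signature taken from the hunk above; the wrapper name is made up for illustration):

#include <cstdint>
#include "libyuv/rotate.h"  // NV12ToI420Rotate, kRotate0

// Hypothetical helper: decode NV21 via the NV12 path by exchanging the
// destination chroma planes. All buffers are assumed valid and sized by
// the caller.
int Nv21ToI420Sketch(const uint8_t* src_y, int src_stride_y,
                     const uint8_t* src_vu, int src_stride_vu,
                     uint8_t* dst_y, int dst_stride_y,
                     uint8_t* dst_u, int dst_stride_u,
                     uint8_t* dst_v, int dst_stride_v,
                     int width, int height) {
  return libyuv::NV12ToI420Rotate(src_y, src_stride_y, src_vu, src_stride_vu,
                                  dst_y, dst_stride_y,
                                  dst_v, dst_stride_v,  // swapped on purpose
                                  dst_u, dst_stride_u,
                                  width, height, libyuv::kRotate0);
}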
+ r = NV12ToI420Rotate(src, src_width, src_uv, aligned_src_width, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, + dst_stride_u, crop_width, inv_crop_height, rotation); break; case FOURCC_M420: src = sample + (src_width * crop_y) * 12 / 8 + crop_x; - r = M420ToI420(src, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = M420ToI420(src, src_width, dst_y, dst_stride_y, dst_u, dst_stride_u, + dst_v, dst_stride_v, crop_width, inv_crop_height); break; // Triplanar formats case FOURCC_I420: case FOURCC_YV12: { - const uint8* src_y = sample + (src_width * crop_y + crop_x); - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + (src_width * crop_y + crop_x); + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; int halfheight = (abs_src_height + 1) / 2; if (format == FOURCC_YV12) { src_v = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } else { src_u = sample + src_width * abs_src_height + - (halfwidth * crop_y + crop_x) / 2; + (halfwidth * crop_y + crop_x) / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (halfheight + crop_y / 2) + crop_x / 2; + halfwidth * (halfheight + crop_y / 2) + crop_x / 2; } - r = I420Rotate(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height, rotation); + r = I420Rotate(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height, rotation); break; } case FOURCC_I422: case FOURCC_YV16: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; int halfwidth = (src_width + 1) / 2; if (format == FOURCC_YV16) { - src_v = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_v = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_u = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } else { - src_u = sample + src_width * abs_src_height + - halfwidth * crop_y + crop_x / 2; + src_u = sample + src_width * abs_src_height + halfwidth * crop_y + + crop_x / 2; src_v = sample + src_width * abs_src_height + - halfwidth * (abs_src_height + crop_y) + crop_x / 2; + halfwidth * (abs_src_height + crop_y) + crop_x / 2; } - r = I422ToI420(src_y, src_width, - src_u, halfwidth, - src_v, halfwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I422ToI420(src_y, src_width, src_u, halfwidth, src_v, halfwidth, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } case FOURCC_I444: case FOURCC_YV24: { - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u; - const uint8* src_v; + const uint8_t* src_y = sample + src_width * crop_y + crop_x; + const uint8_t* src_u; + const uint8_t* src_v; if (format == FOURCC_YV24) { src_v = sample + src_width * (abs_src_height + crop_y) + crop_x; src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; @@ -277,38 +242,16 @@ int 
ConvertToI420(const uint8* sample, src_u = sample + src_width * (abs_src_height + crop_y) + crop_x; src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x; } - r = I444ToI420(src_y, src_width, - src_u, src_width, - src_v, src_width, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); - break; - } - case FOURCC_I411: { - int quarterwidth = (src_width + 3) / 4; - const uint8* src_y = sample + src_width * crop_y + crop_x; - const uint8* src_u = sample + src_width * abs_src_height + - quarterwidth * crop_y + crop_x / 4; - const uint8* src_v = sample + src_width * abs_src_height + - quarterwidth * (abs_src_height + crop_y) + crop_x / 4; - r = I411ToI420(src_y, src_width, - src_u, quarterwidth, - src_v, quarterwidth, - y, y_stride, - u, u_stride, - v, v_stride, - crop_width, inv_crop_height); + r = I444ToI420(src_y, src_width, src_u, src_width, src_v, src_width, + dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, crop_width, inv_crop_height); break; } #ifdef HAVE_JPEG case FOURCC_MJPG: - r = MJPGToI420(sample, sample_size, - y, y_stride, - u, u_stride, - v, v_stride, - src_width, abs_src_height, crop_width, inv_crop_height); + r = MJPGToI420(sample, sample_size, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, src_width, + abs_src_height, crop_width, inv_crop_height); break; #endif default: @@ -317,13 +260,10 @@ int ConvertToI420(const uint8* sample, if (need_buf) { if (!r) { - r = I420Rotate(y, y_stride, - u, u_stride, - v, v_stride, - tmp_y, tmp_y_stride, - tmp_u, tmp_u_stride, - tmp_v, tmp_v_stride, - crop_width, abs_crop_height, rotation); + r = I420Rotate(dst_y, dst_stride_y, dst_u, dst_stride_u, dst_v, + dst_stride_v, tmp_y, tmp_y_stride, tmp_u, tmp_u_stride, + tmp_v, tmp_v_stride, crop_width, abs_crop_height, + rotation); } free(rotate_buffer); } diff --git a/libs/libvpx/third_party/libyuv/source/cpu_id.cc b/libs/libvpx/third_party/libyuv/source/cpu_id.cc index 84927ebc3e2..31e24b6739b 100644 --- a/libs/libvpx/third_party/libyuv/source/cpu_id.cc +++ b/libs/libvpx/third_party/libyuv/source/cpu_id.cc @@ -13,22 +13,16 @@ #if defined(_MSC_VER) #include // For __cpuidex() #endif -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) #include // For _xgetbv() #endif -#if !defined(__native_client__) -#include // For getenv() -#endif - // For ArmCpuCaps() but unittested on all platforms #include #include -#include "libyuv/basic_types.h" // For CPU_X86 - #ifdef __cplusplus namespace libyuv { extern "C" { @@ -43,16 +37,20 @@ extern "C" { #define SAFEBUFFERS #endif +// cpu_info_ variable for SIMD instruction sets detected. +LIBYUV_API int cpu_info_ = 0; + +// TODO(fbarchard): Consider using int for cpuid so casting is not needed. // Low level cpuid for X86. -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API -void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { +void CpuId(int info_eax, int info_ecx, int* cpu_info) { #if defined(_MSC_VER) // Visual C version uses intrinsic or inline x86 assembly. 
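With CpuId now taking plain int (the casts around __cpuid/__cpuidex go away), a caller passes an int[4] directly; cpu_info[0..3] hold eax, ebx, ecx, edx. A sketch reading leaf 0, assuming an x86 host (on other targets the function zeroes the array):

#include <cstdio>
#include <cstring>
#include "libyuv/cpu_id.h"  // CpuId

int main() {
  int info[4] = {0, 0, 0, 0};
  libyuv::CpuId(0, 0, info);  // leaf 0: max leaf in eax, vendor id in ebx/edx/ecx
  char vendor[13] = {0};
  // Per the x86 CPUID convention the vendor string is ebx, edx, ecx.
  memcpy(vendor + 0, &info[1], 4);
  memcpy(vendor + 4, &info[3], 4);
  memcpy(vendor + 8, &info[2], 4);
  printf("max leaf %d, vendor %s\n", info[0], vendor);
  return 0;
}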
#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - __cpuidex((int*)(cpu_info), info_eax, info_ecx); + __cpuidex(cpu_info, info_eax, info_ecx); #elif defined(_M_IX86) __asm { mov eax, info_eax @@ -66,26 +64,26 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // Visual C but not x86 if (info_ecx == 0) { - __cpuid((int*)(cpu_info), info_eax); + __cpuid(cpu_info, info_eax); } else { - cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; + cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0u; } #endif // GCC version uses inline x86 assembly. #else // defined(_MSC_VER) - uint32 info_ebx, info_edx; - asm volatile ( -#if defined( __i386__) && defined(__PIC__) - // Preserve ebx for fpic 32 bit. - "mov %%ebx, %%edi \n" - "cpuid \n" - "xchg %%edi, %%ebx \n" - : "=D" (info_ebx), + int info_ebx, info_edx; + asm volatile( +#if defined(__i386__) && defined(__PIC__) + // Preserve ebx for fpic 32 bit. + "mov %%ebx, %%edi \n" + "cpuid \n" + "xchg %%edi, %%ebx \n" + : "=D"(info_ebx), #else - "cpuid \n" - : "=b" (info_ebx), + "cpuid \n" + : "=b"(info_ebx), #endif // defined( __i386__) && defined(__PIC__) - "+a" (info_eax), "+c" (info_ecx), "=d" (info_edx)); + "+a"(info_eax), "+c"(info_ecx), "=d"(info_edx)); cpu_info[0] = info_eax; cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; @@ -94,7 +92,9 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { } #else // (defined(_M_IX86) || defined(_M_X64) ... LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { +void CpuId(int eax, int ecx, int* cpu_info) { + (void)eax; + (void)ecx; cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } #endif @@ -111,20 +111,22 @@ void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { #if defined(_M_IX86) && (_MSC_VER < 1900) #pragma optimize("g", off) #endif -#if (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) && \ +#if (defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__)) && \ !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) -#define HAS_XGETBV // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int GetXCR0() { - uint32 xcr0 = 0u; + int xcr0 = 0; #if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 160040219) - xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. + xcr0 = (int)_xgetbv(0); // VS2010 SP1 required. NOLINT #elif defined(__i386__) || defined(__x86_64__) - asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); + asm(".byte 0x0f, 0x01, 0xd0" : "=a"(xcr0) : "c"(0) : "%edx"); #endif // defined(__i386__) || defined(__x86_64__) return xcr0; } +#else +// xgetbv unavailable to query for OSSave support. Return 0. +#define GetXCR0() 0 #endif // defined(_M_IX86) || defined(_M_X64) .. // Return optimization to previous setting. #if defined(_M_IX86) && (_MSC_VER < 1900) @@ -133,8 +135,7 @@ int GetXCR0() { // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU -LIBYUV_API SAFEBUFFERS -int ArmCpuCaps(const char* cpuinfo_name) { +LIBYUV_API SAFEBUFFERS int ArmCpuCaps(const char* cpuinfo_name) { char cpuinfo_line[512]; FILE* f = fopen(cpuinfo_name, "r"); if (!f) { @@ -151,7 +152,7 @@ int ArmCpuCaps(const char* cpuinfo_name) { } // aarch64 uses asimd for Neon. p = strstr(cpuinfo_line, " asimd"); - if (p && (p[6] == ' ' || p[6] == '\n')) { + if (p) { fclose(f); return kCpuHasNEON; } @@ -161,103 +162,78 @@ int ArmCpuCaps(const char* cpuinfo_name) { return 0; } -// CPU detect function for SIMD instruction sets. 
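ArmCpuCaps is kept public precisely "to allow testing on any CPU", and after this hunk the aarch64 check accepts any line containing " asimd". A test sketch along those lines, assuming the 32-bit path still greps the Features line for " neon" as in upstream libyuv (the fixture path is made up):

#include <cstdio>
#include "libyuv/cpu_id.h"  // ArmCpuCaps, kCpuHasNEON

int main() {
  // Hypothetical cpuinfo fixture exercising the Neon detection path.
  const char* path = "cpuinfo_test.txt";
  FILE* f = fopen(path, "w");
  if (!f) return 1;
  fputs("Features : half thumb fastmult vfp edsp neon vfpv3\n", f);
  fclose(f);
  int caps = libyuv::ArmCpuCaps(path);
  return (caps & libyuv::kCpuHasNEON) ? 0 : 1;
}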
-LIBYUV_API -int cpu_info_ = 0; // cpu_info is not initialized yet. - -// Test environment variable for disabling CPU features. Any non-zero value -// to disable. Zero ignored to make it easy to set the variable on/off. -#if !defined(__native_client__) && !defined(_M_ARM) - -static LIBYUV_BOOL TestEnv(const char* name) { - const char* var = getenv(name); - if (var) { - if (var[0] != '0') { - return LIBYUV_TRUE; +// TODO(fbarchard): Consider read_msa_ir(). +// TODO(fbarchard): Add unittest. +LIBYUV_API SAFEBUFFERS int MipsCpuCaps(const char* cpuinfo_name, + const char ase[]) { + char cpuinfo_line[512]; + FILE* f = fopen(cpuinfo_name, "r"); + if (!f) { + // ase enabled if /proc/cpuinfo is unavailable. + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; } + return 0; } - return LIBYUV_FALSE; -} -#else // nacl does not support getenv(). -static LIBYUV_BOOL TestEnv(const char*) { - return LIBYUV_FALSE; + while (fgets(cpuinfo_line, sizeof(cpuinfo_line) - 1, f)) { + if (memcmp(cpuinfo_line, "ASEs implemented", 16) == 0) { + char* p = strstr(cpuinfo_line, ase); + if (p) { + fclose(f); + if (strcmp(ase, " msa") == 0) { + return kCpuHasMSA; + } + return 0; + } + } + } + fclose(f); + return 0; } -#endif -LIBYUV_API SAFEBUFFERS -int InitCpuFlags(void) { - // TODO(fbarchard): swap kCpuInit logic so 0 means uninitialized. +static SAFEBUFFERS int GetCpuFlags(void) { int cpu_info = 0; -#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86) - uint32 cpu_info0[4] = { 0, 0, 0, 0 }; - uint32 cpu_info1[4] = { 0, 0, 0, 0 }; - uint32 cpu_info7[4] = { 0, 0, 0, 0 }; +#if !defined(__pnacl__) && !defined(__CLR_VER) && \ + (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86)) + int cpu_info0[4] = {0, 0, 0, 0}; + int cpu_info1[4] = {0, 0, 0, 0}; + int cpu_info7[4] = {0, 0, 0, 0}; CpuId(0, 0, cpu_info0); CpuId(1, 0, cpu_info1); if (cpu_info0[0] >= 7) { CpuId(7, 0, cpu_info7); } - cpu_info = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | + cpu_info = kCpuHasX86 | ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) | ((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) | ((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) | ((cpu_info1[2] & 0x00100000) ? kCpuHasSSE42 : 0) | - ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) | - ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | - kCpuHasX86; + ((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0); -#ifdef HAS_XGETBV - // AVX requires CPU has AVX, XSAVE and OSXSave for xgetbv + // AVX requires OS saves YMM registers. if (((cpu_info1[2] & 0x1c000000) == 0x1c000000) && // AVX and OSXSave ((GetXCR0() & 6) == 6)) { // Test OS saves YMM registers - cpu_info |= ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | kCpuHasAVX; + cpu_info |= kCpuHasAVX | ((cpu_info7[1] & 0x00000020) ? kCpuHasAVX2 : 0) | + ((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) | + ((cpu_info1[2] & 0x20000000) ? kCpuHasF16C : 0); // Detect AVX512bw if ((GetXCR0() & 0xe0) == 0xe0) { - cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX3 : 0; + cpu_info |= (cpu_info7[1] & 0x40000000) ? kCpuHasAVX512BW : 0; + cpu_info |= (cpu_info7[1] & 0x80000000) ? kCpuHasAVX512VL : 0; + cpu_info |= (cpu_info7[2] & 0x00000002) ? kCpuHasAVX512VBMI : 0; + cpu_info |= (cpu_info7[2] & 0x00000040) ? kCpuHasAVX512VBMI2 : 0; + cpu_info |= (cpu_info7[2] & 0x00001000) ? kCpuHasAVX512VBITALG : 0; + cpu_info |= (cpu_info7[2] & 0x00004000) ? kCpuHasAVX512VPOPCNTDQ : 0; + cpu_info |= (cpu_info7[2] & 0x00000100) ? kCpuHasGFNI : 0; } } #endif - - // Environment variable overrides for testing. 
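The block of LIBYUV_DISABLE_* getenv checks being deleted here loses its job to an explicit API: later in this hunk MaskCpuFlags is reworked to return GetCpuFlags() & enable_flags, and InitCpuFlags becomes MaskCpuFlags(-1). Disabling one SIMD path programmatically, which is what the environment variables used to do, now looks like this sketch (not thread safe, per the comment on MaskCpuFlags):

#include "libyuv/cpu_id.h"  // MaskCpuFlags, TestCpuFlag, kCpuHasAVX2

int main() {
  // Equivalent of the removed LIBYUV_DISABLE_AVX2=1: enable everything
  // detected except AVX2.
  libyuv::MaskCpuFlags(~libyuv::kCpuHasAVX2);
  return libyuv::TestCpuFlag(libyuv::kCpuHasAVX2) ? 1 : 0;  // expect 0
}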
- if (TestEnv("LIBYUV_DISABLE_X86")) { - cpu_info &= ~kCpuHasX86; - } - if (TestEnv("LIBYUV_DISABLE_SSE2")) { - cpu_info &= ~kCpuHasSSE2; - } - if (TestEnv("LIBYUV_DISABLE_SSSE3")) { - cpu_info &= ~kCpuHasSSSE3; - } - if (TestEnv("LIBYUV_DISABLE_SSE41")) { - cpu_info &= ~kCpuHasSSE41; - } - if (TestEnv("LIBYUV_DISABLE_SSE42")) { - cpu_info &= ~kCpuHasSSE42; - } - if (TestEnv("LIBYUV_DISABLE_AVX")) { - cpu_info &= ~kCpuHasAVX; - } - if (TestEnv("LIBYUV_DISABLE_AVX2")) { - cpu_info &= ~kCpuHasAVX2; - } - if (TestEnv("LIBYUV_DISABLE_ERMS")) { - cpu_info &= ~kCpuHasERMS; - } - if (TestEnv("LIBYUV_DISABLE_FMA3")) { - cpu_info &= ~kCpuHasFMA3; - } - if (TestEnv("LIBYUV_DISABLE_AVX3")) { - cpu_info &= ~kCpuHasAVX3; - } -#endif #if defined(__mips__) && defined(__linux__) -#if defined(__mips_dspr2) - cpu_info |= kCpuHasDSPR2; +#if defined(__mips_msa) + cpu_info = MipsCpuCaps("/proc/cpuinfo", " msa"); #endif cpu_info |= kCpuHasMIPS; - if (getenv("LIBYUV_DISABLE_DSPR2")) { - cpu_info &= ~kCpuHasDSPR2; - } #endif #if defined(__arm__) || defined(__aarch64__) // gcc -mfpu=neon defines __ARM_NEON__ @@ -276,22 +252,22 @@ int InitCpuFlags(void) { cpu_info = ArmCpuCaps("/proc/cpuinfo"); #endif cpu_info |= kCpuHasARM; - if (TestEnv("LIBYUV_DISABLE_NEON")) { - cpu_info &= ~kCpuHasNEON; - } #endif // __arm__ - if (TestEnv("LIBYUV_DISABLE_ASM")) { - cpu_info = 0; - } - cpu_info |= kCpuInitialized; - cpu_info_ = cpu_info; + cpu_info |= kCpuInitialized; return cpu_info; } // Note that use of this function is not thread safe. LIBYUV_API -void MaskCpuFlags(int enable_flags) { - cpu_info_ = InitCpuFlags() & enable_flags; +int MaskCpuFlags(int enable_flags) { + int cpu_info = GetCpuFlags() & enable_flags; + SetCpuFlags(cpu_info); + return cpu_info; +} + +LIBYUV_API +int InitCpuFlags(void) { + return MaskCpuFlags(-1); } #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc b/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc index 22025ad04ae..eaf2530130b 100644 --- a/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc +++ b/libs/libvpx/third_party/libyuv/source/mjpeg_decoder.cc @@ -21,7 +21,7 @@ #if defined(_MSC_VER) // disable warning 4324: structure was padded due to __declspec(align()) -#pragma warning(disable:4324) +#pragma warning(disable : 4324) #endif #endif @@ -102,7 +102,7 @@ MJpegDecoder::~MJpegDecoder() { DestroyOutputBuffers(); } -LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { +LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8_t* src, size_t src_len) { if (!ValidateJpeg(src, src_len)) { return LIBYUV_FALSE; } @@ -129,7 +129,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (scanlines_[i]) { delete scanlines_[i]; } - scanlines_[i] = new uint8* [scanlines_size]; + scanlines_[i] = new uint8_t*[scanlines_size]; scanlines_sizes_[i] = scanlines_size; } @@ -145,7 +145,7 @@ LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) { if (databuf_[i]) { delete databuf_[i]; } - databuf_[i] = new uint8[databuf_size]; + databuf_[i] = new uint8_t[databuf_size]; databuf_strides_[i] = databuf_stride; } @@ -195,13 +195,11 @@ int MJpegDecoder::GetVertSampFactor(int component) { } int MJpegDecoder::GetHorizSubSampFactor(int component) { - return decompress_struct_->max_h_samp_factor / - GetHorizSampFactor(component); + return decompress_struct_->max_h_samp_factor / GetHorizSampFactor(component); } int MJpegDecoder::GetVertSubSampFactor(int component) { - return decompress_struct_->max_v_samp_factor / - 
GetVertSampFactor(component); + return decompress_struct_->max_v_samp_factor / GetVertSampFactor(component); } int MJpegDecoder::GetImageScanlinesPerImcuRow() { @@ -245,10 +243,10 @@ LIBYUV_BOOL MJpegDecoder::UnloadFrame() { } // TODO(fbarchard): Allow rectangle to be specified: x, y, width, height. -LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( - uint8** planes, int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(uint8_t** planes, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -289,14 +287,13 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { // TODO(fbarchard): Compute skip to avoid this assert(skip % GetVertSubSampFactor(i) == 0); - int rows_to_skip = - DivideAndRoundDown(skip, GetVertSubSampFactor(i)); - int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) - - rows_to_skip; + int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i)); + int scanlines_to_copy = + GetComponentScanlinesPerImcuRow(i) - rows_to_skip; int data_to_skip = rows_to_skip * GetComponentStride(i); - CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), + scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } lines_left -= (GetImageScanlinesPerImcuRow() - skip); @@ -305,16 +302,15 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( // Read full MCUs but cropped horizontally for (; lines_left > GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; } for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } @@ -328,19 +324,19 @@ LIBYUV_BOOL MJpegDecoder::DecodeToBuffers( for (int i = 0; i < num_outbufs_; ++i) { int scanlines_to_copy = DivideAndRoundUp(lines_left, GetVertSubSampFactor(i)); - CopyPlane(databuf_[i], GetComponentStride(i), - planes[i], GetComponentWidth(i), - GetComponentWidth(i), scanlines_to_copy); + CopyPlane(databuf_[i], GetComponentStride(i), planes[i], + GetComponentWidth(i), GetComponentWidth(i), scanlines_to_copy); planes[i] += scanlines_to_copy * GetComponentWidth(i); } } return FinishDecode(); } -LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, - int dst_width, int dst_height) { - if (dst_width != GetWidth() || - dst_height > GetHeight()) { +LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, + void* opaque, + int dst_width, + int dst_height) { + if (dst_width != GetWidth() || dst_height > GetHeight()) { // ERROR: Bad dimensions return LIBYUV_FALSE; } @@ -395,7 +391,7 @@ LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque, } // Read full MCUs until we get to the crop point. 
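DecodeToBuffers above writes each JPEG component into caller-provided planes and rejects dst_width != GetWidth(). A decode sketch, assuming a HAVE_JPEG build and 4:2:0 input; a real caller would check the sampling factors (e.g. via JpegSubsamplingTypeHelper) before sizing the chroma planes:

#include <cstdint>
#include <vector>
#include "libyuv/mjpeg_decoder.h"

bool DecodeFrameSketch(const uint8_t* jpg, size_t jpg_size) {
  libyuv::MJpegDecoder decoder;
  if (!decoder.LoadFrame(jpg, jpg_size)) return false;  // parses the headers
  const int w = decoder.GetWidth();
  const int h = decoder.GetHeight();
  std::vector<uint8_t> y(static_cast<size_t>(w) * h);
  std::vector<uint8_t> u((static_cast<size_t>(w + 1) / 2) * ((h + 1) / 2));
  std::vector<uint8_t> v(u.size());
  uint8_t* planes[3] = {y.data(), u.data(), v.data()};
  return decoder.DecodeToBuffers(planes, w, h) != 0;
}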
for (; lines_left >= GetImageScanlinesPerImcuRow(); - lines_left -= GetImageScanlinesPerImcuRow()) { + lines_left -= GetImageScanlinesPerImcuRow()) { if (!DecodeImcuRow()) { FinishDecode(); return LIBYUV_FALSE; @@ -435,22 +431,22 @@ void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT } void term_source(j_decompress_ptr cinfo) { - // Nothing to do. + (void)cinfo; // Nothing to do. } #ifdef HAVE_SETJMP void ErrorHandler(j_common_ptr cinfo) { - // This is called when a jpeglib command experiences an error. Unfortunately - // jpeglib's error handling model is not very flexible, because it expects the - // error handler to not return--i.e., it wants the program to terminate. To - // recover from errors we use setjmp() as shown in their example. setjmp() is - // C's implementation for the "call with current continuation" functionality - // seen in some functional programming languages. - // A formatted message can be output, but is unsafe for release. +// This is called when a jpeglib command experiences an error. Unfortunately +// jpeglib's error handling model is not very flexible, because it expects the +// error handler to not return--i.e., it wants the program to terminate. To +// recover from errors we use setjmp() as shown in their example. setjmp() is +// C's implementation for the "call with current continuation" functionality +// seen in some functional programming languages. +// A formatted message can be output, but is unsafe for release. #ifdef DEBUG char buf[JMSG_LENGTH_MAX]; (*cinfo->err->format_message)(cinfo, buf); - // ERROR: Error in jpeglib: buf +// ERROR: Error in jpeglib: buf #endif SetJmpErrorMgr* mgr = reinterpret_cast(cinfo->err); @@ -459,8 +455,9 @@ void ErrorHandler(j_common_ptr cinfo) { longjmp(mgr->setjmp_buffer, 1); } +// Suppress fprintf warnings. void OutputHandler(j_common_ptr cinfo) { - // Suppress fprintf warnings. + (void)cinfo; } #endif // HAVE_SETJMP @@ -472,9 +469,9 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { // it. 
DestroyOutputBuffers(); - scanlines_ = new uint8** [num_outbufs]; + scanlines_ = new uint8_t**[num_outbufs]; scanlines_sizes_ = new int[num_outbufs]; - databuf_ = new uint8* [num_outbufs]; + databuf_ = new uint8_t*[num_outbufs]; databuf_strides_ = new int[num_outbufs]; for (int i = 0; i < num_outbufs; ++i) { @@ -490,13 +487,13 @@ void MJpegDecoder::AllocOutputBuffers(int num_outbufs) { void MJpegDecoder::DestroyOutputBuffers() { for (int i = 0; i < num_outbufs_; ++i) { - delete [] scanlines_[i]; - delete [] databuf_[i]; + delete[] scanlines_[i]; + delete[] databuf_[i]; } - delete [] scanlines_; - delete [] databuf_; - delete [] scanlines_sizes_; - delete [] databuf_strides_; + delete[] scanlines_; + delete[] databuf_; + delete[] scanlines_sizes_; + delete[] databuf_strides_; scanlines_ = NULL; databuf_ = NULL; scanlines_sizes_ = NULL; @@ -530,9 +527,9 @@ LIBYUV_BOOL MJpegDecoder::FinishDecode() { return LIBYUV_TRUE; } -void MJpegDecoder::SetScanlinePointers(uint8** data) { +void MJpegDecoder::SetScanlinePointers(uint8_t** data) { for (int i = 0; i < num_outbufs_; ++i) { - uint8* data_i = data[i]; + uint8_t* data_i = data[i]; for (int j = 0; j < scanlines_sizes_[i]; ++j) { scanlines_[i][j] = data_i; data_i += GetComponentStride(i); @@ -542,26 +539,26 @@ void MJpegDecoder::SetScanlinePointers(uint8** data) { inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() { return (unsigned int)(GetImageScanlinesPerImcuRow()) == - jpeg_read_raw_data(decompress_struct_, - scanlines_, - GetImageScanlinesPerImcuRow()); + jpeg_read_raw_data(decompress_struct_, scanlines_, + GetImageScanlinesPerImcuRow()); } // The helper function which recognizes the jpeg sub-sampling type. JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( - int* subsample_x, int* subsample_y, int number_of_components) { + int* subsample_x, + int* subsample_y, + int number_of_components) { if (number_of_components == 3) { // Color images. - if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 2 && - subsample_x[2] == 2 && subsample_y[2] == 2) { + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 2 && subsample_x[2] == 2 && subsample_y[2] == 2) { return kJpegYuv420; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 2 && subsample_y[1] == 1 && - subsample_x[2] == 2 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 2 && + subsample_y[1] == 1 && subsample_x[2] == 2 && subsample_y[2] == 1) { return kJpegYuv422; - } else if (subsample_x[0] == 1 && subsample_y[0] == 1 && - subsample_x[1] == 1 && subsample_y[1] == 1 && - subsample_x[2] == 1 && subsample_y[2] == 1) { + } + if (subsample_x[0] == 1 && subsample_y[0] == 1 && subsample_x[1] == 1 && + subsample_y[1] == 1 && subsample_x[2] == 1 && subsample_y[2] == 1) { return kJpegYuv444; } } else if (number_of_components == 1) { // Grey-scale images. @@ -574,4 +571,3 @@ JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper( } // namespace libyuv #endif // HAVE_JPEG - diff --git a/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc b/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc index 9c488320451..80c2cc0cb9b 100644 --- a/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc +++ b/libs/libvpx/third_party/libyuv/source/mjpeg_validate.cc @@ -18,13 +18,13 @@ extern "C" { #endif // Helper function to scan for EOI marker (0xff 0xd9). 
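JpegSubsamplingTypeHelper above classifies a frame from its per-component subsampling factors; {1,2,2} in both axes is 4:2:0. A tiny sketch, assuming the helper is callable as the public static member it is invoked as here:

#include "libyuv/mjpeg_decoder.h"

int main() {
  // Subsample factors for Y, U, V: chroma halved in both directions -> 4:2:0.
  int sx[3] = {1, 2, 2};
  int sy[3] = {1, 2, 2};
  libyuv::JpegSubsamplingType t =
      libyuv::MJpegDecoder::JpegSubsamplingTypeHelper(sx, sy, 3);
  return t == libyuv::kJpegYuv420 ? 0 : 1;
}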
-static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { +static LIBYUV_BOOL ScanEOI(const uint8_t* sample, size_t sample_size) { if (sample_size >= 2) { - const uint8* end = sample + sample_size - 1; - const uint8* it = sample; + const uint8_t* end = sample + sample_size - 1; + const uint8_t* it = sample; while (it < end) { // TODO(fbarchard): scan for 0xd9 instead. - it = static_cast(memchr(it, 0xff, end - it)); + it = (const uint8_t*)(memchr(it, 0xff, end - it)); if (it == NULL) { break; } @@ -39,7 +39,7 @@ static LIBYUV_BOOL ScanEOI(const uint8* sample, size_t sample_size) { } // Helper function to validate the jpeg appears intact. -LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size) { // Maximum size that ValidateJpeg will consider valid. const size_t kMaxJpegSize = 0x7fffffffull; const size_t kBackSearchSize = 1024; @@ -68,4 +68,3 @@ LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) { } // extern "C" } // namespace libyuv #endif - diff --git a/libs/libvpx/third_party/libyuv/source/planar_functions.cc b/libs/libvpx/third_party/libyuv/source/planar_functions.cc index a764f8da472..5eae3f763a7 100644 --- a/libs/libvpx/third_party/libyuv/source/planar_functions.cc +++ b/libs/libvpx/third_party/libyuv/source/planar_functions.cc @@ -26,11 +26,14 @@ extern "C" { // Copy a plane of data LIBYUV_API -void CopyPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -38,8 +41,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, dst_stride_y = -dst_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -48,6 +50,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y, if (src_y == dst_y && src_stride_y == dst_stride_y) { return; } + #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -68,11 +71,6 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -83,15 +81,18 @@ void CopyPlane(const uint8* src_y, int src_stride_y, } // TODO(fbarchard): Consider support for negative height. +// TODO(fbarchard): Consider stride measured in bytes. LIBYUV_API -void CopyPlane_16(const uint16* src_y, int src_stride_y, - uint16* dst_y, int dst_stride_y, - int width, int height) { +void CopyPlane_16(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*CopyRow)(const uint16* src, uint16* dst, int width) = CopyRow_16_C; + void (*CopyRow)(const uint16_t* src, uint16_t* dst, int width) = CopyRow_16_C; // Coalesce rows. 
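The "coalesce rows" step at the top of CopyPlane (and repeated in nearly every function below) folds a contiguous image into a single width*height row so the row kernel is dispatched once. A standalone illustration of the transform, outside libyuv, with memcpy standing in for the SIMD CopyRow:

#include <cstdint>
#include <cstring>

void CopyPlaneCoalescedSketch(const uint8_t* src, int src_stride,
                              uint8_t* dst, int dst_stride,
                              int width, int height) {
  if (src_stride == width && dst_stride == width) {
    width *= height;  // contiguous: treat the plane as one long row
    height = 1;
    src_stride = dst_stride = 0;
  }
  for (int y = 0; y < height; ++y) {
    memcpy(dst, src, static_cast<size_t>(width));
    src += src_stride;
    dst += dst_stride;
  }
}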
- if (src_stride_y == width && - dst_stride_y == width) { + if (src_stride_y == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y = dst_stride_y = 0; @@ -111,11 +112,6 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, CopyRow = CopyRow_16_NEON; } #endif -#if defined(HAS_COPYROW_16_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_16_MIPS; - } -#endif // Copy plane for (y = 0; y < height; ++y) { @@ -125,19 +121,124 @@ void CopyPlane_16(const uint16* src_y, int src_stride_y, } } +// Convert a plane of 16 bit data to 8 bit +LIBYUV_API +void Convert16To8Plane(const uint16_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert16To8Row)(const uint16_t* src_y, uint8_t* dst_y, int scale, + int width) = Convert16To8Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT16TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert16To8Row = Convert16To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + Convert16To8Row = Convert16To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT16TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert16To8Row = Convert16To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert16To8Row = Convert16To8Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert16To8Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + +// Convert a plane of 8 bit data to 16 bit +LIBYUV_API +void Convert8To16Plane(const uint8_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + int scale, // 16384 for 10 bits + int width, + int height) { + int y; + void (*Convert8To16Row)(const uint8_t* src_y, uint16_t* dst_y, int scale, + int width) = Convert8To16Row_C; + + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO16ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + Convert8To16Row = Convert8To16Row_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + Convert8To16Row = Convert8To16Row_SSE2; + } + } +#endif +#if defined(HAS_CONVERT8TO16ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To16Row = Convert8To16Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To16Row = Convert8To16Row_AVX2; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To16Row(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Copy I422. 
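Convert16To8Plane and Convert8To16Plane are new here and scale through a fixed-point factor; the parameter comment's "16384 for 10 bits" implies the row kernels compute roughly (value * scale) >> 16, so a 10-bit 1023 lands on 255. A down-conversion sketch under that assumption:

#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"  // Convert16To8Plane (assumed declared here)

int main() {
  const int w = 320, h = 240;
  std::vector<uint16_t> y10(static_cast<size_t>(w) * h, 1023);  // 10-bit white
  std::vector<uint8_t> y8(static_cast<size_t>(w) * h);
  // Strides are in elements, matching the coalescing check above.
  libyuv::Convert16To8Plane(y10.data(), w, y8.data(), w, 16384, w, h);
  return y8[0] == 255 ? 0 : 1;  // expected under the (v * scale) >> 16 assumption
}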
LIBYUV_API -int I422Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I422Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -161,16 +262,21 @@ int I422Copy(const uint8* src_y, int src_stride_y, // Copy I444. LIBYUV_API -int I444Copy(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { - if (!src_u || !src_v || - !dst_u || !dst_v || - width <= 0 || height == 0) { +int I444Copy(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + if (!src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -194,9 +300,12 @@ int I444Copy(const uint8* src_y, int src_stride_y, // Copy I400. LIBYUV_API -int I400ToI400(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I400ToI400(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -212,11 +321,20 @@ int I400ToI400(const uint8* src_y, int src_stride_y, // Convert I420 to I400. LIBYUV_API -int I420ToI400(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int I420ToI400(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + (void)src_u; + (void)src_stride_u; + (void)src_v; + (void)src_stride_v; if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } @@ -234,12 +352,16 @@ int I420ToI400(const uint8* src_y, int src_stride_y, // Support function for NV12 etc UV channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API -void SplitUVPlane(const uint8* src_uv, int src_stride_uv, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +void SplitUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; // Negative height means invert the image. 
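I420ToI400 above now (void)-casts its chroma arguments to make explicit that they are ignored: the function just forwards the Y plane to CopyPlane, so it doubles as a cheap grayscale extraction. Sketch (assuming the declaration lives in planar_functions.h beside the implementation):

#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"  // I420ToI400 (assumed declared here)

int main() {
  const int w = 64, h = 48;
  std::vector<uint8_t> y(static_cast<size_t>(w) * h, 128);
  std::vector<uint8_t> u((w / 2) * (h / 2)), v((w / 2) * (h / 2));
  std::vector<uint8_t> gray(y.size());
  // Chroma pointers are accepted but unused; only Y is copied.
  return libyuv::I420ToI400(y.data(), w, u.data(), w / 2, v.data(), w / 2,
                            gray.data(), w, w, h);
}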
if (height < 0) { @@ -250,8 +372,7 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, dst_stride_v = -dst_stride_v; } // Coalesce rows. - if (src_stride_uv == width * 2 && - dst_stride_u == width && + if (src_stride_uv == width * 2 && dst_stride_u == width && dst_stride_v == width) { width *= height; height = 1; @@ -281,13 +402,11 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, } } #endif -#if defined(HAS_SPLITUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) && - IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) { - SplitUVRow = SplitUVRow_Any_DSPR2; - if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_DSPR2; +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; } } #endif @@ -302,13 +421,17 @@ void SplitUVPlane(const uint8* src_uv, int src_stride_uv, } LIBYUV_API -void MergeUVPlane(const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +void MergeUVPlane(const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; - void (*MergeUVRow)(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) = MergeUVRow_C; + void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; // Coalesce rows. // Negative height means invert the image. if (height < 0) { @@ -317,8 +440,7 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, dst_stride_uv = -dst_stride_uv; } // Coalesce rows. - if (src_stride_u == width && - src_stride_v == width && + if (src_stride_u == width && src_stride_v == width && dst_stride_uv == width * 2) { width *= height; height = 1; @@ -348,6 +470,14 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, } } #endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow = MergeUVRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + MergeUVRow = MergeUVRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. @@ -358,12 +488,131 @@ void MergeUVPlane(const uint8* src_u, int src_stride_u, } } +// Support function for NV12 etc RGB channels. +// Width and height are plane sizes (typically half pixel width). +LIBYUV_API +void SplitRGBPlane(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_r, + int dst_stride_r, + uint8_t* dst_g, + int dst_stride_g, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { + int y; + void (*SplitRGBRow)(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g, + uint8_t* dst_b, int width) = SplitRGBRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_r = dst_r + (height - 1) * dst_stride_r; + dst_g = dst_g + (height - 1) * dst_stride_g; + dst_b = dst_b + (height - 1) * dst_stride_b; + dst_stride_r = -dst_stride_r; + dst_stride_g = -dst_stride_g; + dst_stride_b = -dst_stride_b; + } + // Coalesce rows. 
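SplitUVPlane and MergeUVPlane above are exact inverses over an interleaved NV12-style chroma plane, and both now pick up MSA kernels. A round-trip sketch on the half-resolution UV plane of a 320x240 frame (width/height are plane sizes, per the comment above SplitUVPlane):

#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"  // SplitUVPlane, MergeUVPlane

int main() {
  const int half_w = 160, half_h = 120;
  std::vector<uint8_t> uv(static_cast<size_t>(half_w) * 2 * half_h, 0x80);
  std::vector<uint8_t> u(static_cast<size_t>(half_w) * half_h);
  std::vector<uint8_t> v(u.size());
  std::vector<uint8_t> uv2(uv.size());
  libyuv::SplitUVPlane(uv.data(), half_w * 2, u.data(), half_w,
                       v.data(), half_w, half_w, half_h);
  libyuv::MergeUVPlane(u.data(), half_w, v.data(), half_w,
                       uv2.data(), half_w * 2, half_w, half_h);
  return uv == uv2 ? 0 : 1;  // lossless round trip
}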
+ if (src_stride_rgb == width * 3 && dst_stride_r == width && + dst_stride_g == width && dst_stride_b == width) { + width *= height; + height = 1; + src_stride_rgb = dst_stride_r = dst_stride_g = dst_stride_b = 0; + } +#if defined(HAS_SPLITRGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SplitRGBRow = SplitRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_SPLITRGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitRGBRow = SplitRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitRGBRow = SplitRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Copy a row of RGB. + SplitRGBRow(src_rgb, dst_r, dst_g, dst_b, width); + dst_r += dst_stride_r; + dst_g += dst_stride_g; + dst_b += dst_stride_b; + src_rgb += src_stride_rgb; + } +} + +LIBYUV_API +void MergeRGBPlane(const uint8_t* src_r, + int src_stride_r, + const uint8_t* src_g, + int src_stride_g, + const uint8_t* src_b, + int src_stride_b, + uint8_t* dst_rgb, + int dst_stride_rgb, + int width, + int height) { + int y; + void (*MergeRGBRow)(const uint8_t* src_r, const uint8_t* src_g, + const uint8_t* src_b, uint8_t* dst_rgb, int width) = + MergeRGBRow_C; + // Coalesce rows. + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; + dst_stride_rgb = -dst_stride_rgb; + } + // Coalesce rows. + if (src_stride_r == width && src_stride_g == width && src_stride_b == width && + dst_stride_rgb == width * 3) { + width *= height; + height = 1; + src_stride_r = src_stride_g = src_stride_b = dst_stride_rgb = 0; + } +#if defined(HAS_MERGERGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + MergeRGBRow = MergeRGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_SSSE3; + } + } +#endif +#if defined(HAS_MERGERGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeRGBRow = MergeRGBRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + MergeRGBRow = MergeRGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + // Merge a row of U and V into a row of RGB. + MergeRGBRow(src_r, src_g, src_b, dst_rgb, width); + src_r += src_stride_r; + src_g += src_stride_g; + src_b += src_stride_b; + dst_rgb += dst_stride_rgb; + } +} + // Mirror a plane of data. -void MirrorPlane(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; // Negative height means invert the image. if (height < 0) { height = -height; @@ -394,12 +643,12 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { - MirrorRow = MirrorRow_DSPR2; +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } } #endif @@ -413,17 +662,24 @@ void MirrorPlane(const uint8* src_y, int src_stride_y, // Convert YUY2 to I422. 
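The new SplitRGBPlane/MergeRGBPlane pair does the same planar/interleaved conversion for packed 24-bit RGB. Round-trip sketch (signatures from the hunk above; declarations assumed to sit in planar_functions.h):

#include <cstdint>
#include <vector>
#include "libyuv/planar_functions.h"  // SplitRGBPlane, MergeRGBPlane

int main() {
  const int w = 320, h = 240;
  std::vector<uint8_t> rgb(static_cast<size_t>(w) * 3 * h, 0x7F);
  std::vector<uint8_t> r(static_cast<size_t>(w) * h), g(r.size()), b(r.size());
  std::vector<uint8_t> rgb2(rgb.size());
  libyuv::SplitRGBPlane(rgb.data(), w * 3, r.data(), w, g.data(), w,
                        b.data(), w, w, h);
  libyuv::MergeRGBPlane(r.data(), w, g.data(), w, b.data(), w,
                        rgb2.data(), w * 3, w, h);
  return rgb == rgb2 ? 0 : 1;
}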
LIBYUV_API -int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int YUY2ToI422(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*YUY2ToUV422Row)(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) = - YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int width) = + void (*YUY2ToUV422Row)(const uint8_t* src_yuy2, uint8_t* dst_u, + uint8_t* dst_v, int width) = YUY2ToUV422Row_C; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -431,10 +687,9 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce rows. - if (src_stride_yuy2 == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_yuy2 == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_yuy2 = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -462,15 +717,23 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, #if defined(HAS_YUY2TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width >= 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - } + YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; YUY2ToUV422Row = YUY2ToUV422Row_NEON; } } #endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + YUY2ToUV422Row = YUY2ToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width); @@ -485,17 +748,24 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, // Convert UYVY to I422. LIBYUV_API -int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int UYVYToI422(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; - void (*UYVYToUV422Row)(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) = - UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int width) = UYVYToYRow_C; + void (*UYVYToUV422Row)(const uint8_t* src_uyvy, uint8_t* dst_u, + uint8_t* dst_v, int width) = UYVYToUV422Row_C; + void (*UYVYToYRow)(const uint8_t* src_uyvy, uint8_t* dst_y, int width) = + UYVYToYRow_C; + if (!src_uyvy || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } // Negative height means invert the image. if (height < 0) { height = -height; @@ -503,10 +773,9 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce rows. 
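YUY2ToI422 above gains argument null checks and limits row coalescing to width * height <= 32768, keeping the coalesced row short enough for the UV422 kernels. Caller sketch with 4:2:2 plane sizing (full chroma height, halved width); the declaration is assumed to be in convert.h:

#include <cstdint>
#include <vector>
#include "libyuv/convert.h"  // YUY2ToI422 (assumed declared here)

int main() {
  const int w = 640, h = 480;
  std::vector<uint8_t> yuy2(static_cast<size_t>(w) * 2 * h);
  std::vector<uint8_t> y(static_cast<size_t>(w) * h);
  std::vector<uint8_t> u(static_cast<size_t>((w + 1) / 2) * h);
  std::vector<uint8_t> v(u.size());
  return libyuv::YUY2ToI422(yuy2.data(), w * 2, y.data(), w,
                            u.data(), (w + 1) / 2, v.data(), (w + 1) / 2,
                            w, h);
}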
- if (src_stride_uyvy == width * 2 && - dst_stride_y == width && - dst_stride_u * 2 == width && - dst_stride_v * 2 == width) { + if (src_stride_uyvy == width * 2 && dst_stride_y == width && + dst_stride_u * 2 == width && dst_stride_v * 2 == width && + width * height <= 32768) { width *= height; height = 1; src_stride_uyvy = dst_stride_y = dst_stride_u = dst_stride_v = 0; @@ -534,15 +803,23 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, #if defined(HAS_UYVYTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { UYVYToYRow = UYVYToYRow_Any_NEON; - if (width >= 16) { - UYVYToUV422Row = UYVYToUV422Row_Any_NEON; - } + UYVYToUV422Row = UYVYToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; UYVYToUV422Row = UYVYToUV422Row_NEON; } } #endif +#if defined(HAS_UYVYTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + UYVYToYRow = UYVYToYRow_Any_MSA; + UYVYToUV422Row = UYVYToUV422Row_Any_MSA; + if (IS_ALIGNED(width, 32)) { + UYVYToYRow = UYVYToYRow_MSA; + UYVYToUV422Row = UYVYToUV422Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { UYVYToUV422Row(src_uyvy, dst_u, dst_v, width); @@ -555,13 +832,82 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, return 0; } +// Convert YUY2 to Y. +LIBYUV_API +int YUY2ToY(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + int y; + void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) = + YUY2ToYRow_C; + if (!src_yuy2 || !dst_y || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } + // Coalesce rows. + if (src_stride_yuy2 == width * 2 && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_yuy2 = dst_stride_y = 0; + } +#if defined(HAS_YUY2TOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + YUY2ToYRow = YUY2ToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_SSE2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + YUY2ToYRow = YUY2ToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_AVX2; + } + } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + YUY2ToYRow = YUY2ToYRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } +#endif +#if defined(HAS_YUY2TOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + YUY2ToYRow = YUY2ToYRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + YUY2ToYRow = YUY2ToYRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + YUY2ToYRow(src_yuy2, dst_y, width); + src_yuy2 += src_stride_yuy2; + dst_y += dst_stride_y; + } + return 0; +} + // Mirror I400 with optional flipping LIBYUV_API -int I400Mirror(const uint8* src_y, int src_stride_y, - uint8* dst_y, int dst_stride_y, - int width, int height) { - if (!src_y || !dst_y || - width <= 0 || height == 0) { +int I400Mirror(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + if (!src_y || !dst_y || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -577,17 +923,24 @@ int I400Mirror(const uint8* src_y, int src_stride_y, // Mirror I420 with optional flipping LIBYUV_API -int I420Mirror(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Mirror(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 || + height == 0) { return -1; } // Negative height means invert the image. @@ -612,11 +965,14 @@ int I420Mirror(const uint8* src_y, int src_stride_y, // ARGB mirror. LIBYUV_API -int ARGBMirror(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMirror(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src, uint8_t* dst, int width) = ARGBMirrorRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -651,6 +1007,14 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif // Mirror plane for (y = 0; y < height; ++y) { @@ -666,8 +1030,8 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, // the same blend function for all pixels if possible. LIBYUV_API ARGBBlendRow GetARGBBlend() { - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = ARGBBlendRow_C; + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = ARGBBlendRow_C; #if defined(HAS_ARGBBLENDROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBBlendRow = ARGBBlendRow_SSSE3; @@ -678,19 +1042,28 @@ ARGBBlendRow GetARGBBlend() { if (TestCpuFlag(kCpuHasNEON)) { ARGBBlendRow = ARGBBlendRow_NEON; } +#endif +#if defined(HAS_ARGBBLENDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBBlendRow = ARGBBlendRow_MSA; + } #endif return ARGBBlendRow; } // Alpha Blend 2 ARGB images and store to destination. 
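A usage sketch for the blend entry point that follows. That the foreground must already be attenuated (premultiplied by its alpha) is libyuv's documented convention for this function; whether attenuating in place is always safe is an assumption here, so use a scratch buffer if unsure:

// Composite fg over bg into dst; all buffers are ARGB with the same geometry.
static int BlendOver(uint8_t* fg, const uint8_t* bg, uint8_t* dst,
                     int stride, int width, int height) {
  // Premultiply the foreground first; ARGBBlend expects attenuated input.
  ARGBAttenuate(fg, stride, fg, stride, width, height);
  return ARGBBlend(fg, stride, bg, stride, dst, stride, width, height);
}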
LIBYUV_API -int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBBlend(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1, - uint8* dst_argb, int width) = GetARGBBlend(); + void (*ARGBBlendRow)(const uint8_t* src_argb, const uint8_t* src_argb1, + uint8_t* dst_argb, int width) = GetARGBBlend(); if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -701,8 +1074,7 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -720,14 +1092,20 @@ int ARGBBlend(const uint8* src_argb0, int src_stride_argb0, // Alpha Blend plane and store to destination. LIBYUV_API -int BlendPlane(const uint8* src_y0, int src_stride_y0, - const uint8* src_y1, int src_stride_y1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - int width, int height) { +int BlendPlane(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; if (!src_y0 || !src_y1 || !alpha || !dst_y || width <= 0 || height == 0) { return -1; } @@ -739,10 +1117,8 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, } // Coalesce rows for Y plane. - if (src_stride_y0 == width && - src_stride_y1 == width && - alpha_stride == width && - dst_stride_y == width) { + if (src_stride_y0 == width && src_stride_y1 == width && + alpha_stride == width && dst_stride_y == width) { width *= height; height = 1; src_stride_y0 = src_stride_y1 = alpha_stride = dst_stride_y = 0; @@ -750,7 +1126,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - BlendPlaneRow = BlendPlaneRow_Any_SSSE3; + BlendPlaneRow = BlendPlaneRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { BlendPlaneRow = BlendPlaneRow_SSSE3; } @@ -758,7 +1134,7 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #endif #if defined(HAS_BLENDPLANEROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - BlendPlaneRow = BlendPlaneRow_Any_AVX2; + BlendPlaneRow = BlendPlaneRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { BlendPlaneRow = BlendPlaneRow_AVX2; } @@ -778,24 +1154,36 @@ int BlendPlane(const uint8* src_y0, int src_stride_y0, #define MAXTWIDTH 2048 // Alpha Blend YUV images and store to destination. 
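In the I420Blend hunk that follows, Y is blended at full resolution while U and V are blended against a 2x2 box-filtered copy of the alpha plane (that is what the ScaleRowDown2Box selection is for). A scalar model of that reduction, assuming even dimensions for brevity:

// Average each 2x2 block of alpha down to one byte, with round-to-nearest.
static void HalveAlphaRowPair(const uint8_t* a0, const uint8_t* a1,
                              uint8_t* dst, int halfwidth) {
  int x;
  for (x = 0; x < halfwidth; ++x) {
    dst[x] = (uint8_t)((a0[2 * x] + a0[2 * x + 1] +
                        a1[2 * x] + a1[2 * x + 1] + 2) >> 2);
  }
}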
LIBYUV_API -int I420Blend(const uint8* src_y0, int src_stride_y0, - const uint8* src_u0, int src_stride_u0, - const uint8* src_v0, int src_stride_v0, - const uint8* src_y1, int src_stride_y1, - const uint8* src_u1, int src_stride_u1, - const uint8* src_v1, int src_stride_v1, - const uint8* alpha, int alpha_stride, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height) { +int I420Blend(const uint8_t* src_y0, + int src_stride_y0, + const uint8_t* src_u0, + int src_stride_u0, + const uint8_t* src_v0, + int src_stride_v0, + const uint8_t* src_y1, + int src_stride_y1, + const uint8_t* src_u1, + int src_stride_u1, + const uint8_t* src_v1, + int src_stride_v1, + const uint8_t* alpha, + int alpha_stride, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; // Half width/height for UV. int halfwidth = (width + 1) >> 1; - void (*BlendPlaneRow)(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) = BlendPlaneRow_C; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; + void (*BlendPlaneRow)(const uint8_t* src0, const uint8_t* src1, + const uint8_t* alpha, uint8_t* dst, int width) = + BlendPlaneRow_C; + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = ScaleRowDown2Box_C; if (!src_y0 || !src_u0 || !src_v0 || !src_y1 || !src_u1 || !src_v1 || !alpha || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; @@ -809,11 +1197,8 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, } // Blend Y plane. - BlendPlane(src_y0, src_stride_y0, - src_y1, src_stride_y1, - alpha, alpha_stride, - dst_y, dst_stride_y, - width, height); + BlendPlane(src_y0, src_stride_y0, src_y1, src_stride_y1, alpha, alpha_stride, + dst_y, dst_stride_y, width, height); #if defined(HAS_BLENDPLANEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -893,13 +1278,17 @@ int I420Blend(const uint8* src_y0, int src_stride_y0, // Multiply 2 ARGB images and store to destination. LIBYUV_API -int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBMultiply(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBMultiplyRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBMultiplyRow_C; + void (*ARGBMultiplyRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBMultiplyRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -910,8 +1299,7 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -941,6 +1329,14 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBMULTIPLYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMultiplyRow = ARGBMultiplyRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBMultiplyRow = ARGBMultiplyRow_MSA; + } + } +#endif // Multiply plane for (y = 0; y < height; ++y) { @@ -954,12 +1350,16 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0, // Add 2 ARGB images and store to destination. LIBYUV_API -int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAdd(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst, + void (*ARGBAddRow)(const uint8_t* src0, const uint8_t* src1, uint8_t* dst, int width) = ARGBAddRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; @@ -971,8 +1371,7 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. - if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -1007,6 +1406,14 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAddRow = ARGBAddRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAddRow = ARGBAddRow_MSA; + } + } +#endif // Add plane for (y = 0; y < height; ++y) { @@ -1020,13 +1427,17 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, // Subtract 2 ARGB images and store to destination. LIBYUV_API -int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBSubtract(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBSubtractRow)(const uint8* src0, const uint8* src1, uint8* dst, - int width) = ARGBSubtractRow_C; + void (*ARGBSubtractRow)(const uint8_t* src0, const uint8_t* src1, + uint8_t* dst, int width) = ARGBSubtractRow_C; if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1037,8 +1448,7 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, dst_stride_argb = -dst_stride_argb; } // Coalesce rows. 
- if (src_stride_argb0 == width * 4 && - src_stride_argb1 == width * 4 && + if (src_stride_argb0 == width * 4 && src_stride_argb1 == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; @@ -1068,6 +1478,14 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, } } #endif +#if defined(HAS_ARGBSUBTRACTROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSubtractRow = ARGBSubtractRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBSubtractRow = ARGBSubtractRow_MSA; + } + } +#endif // Subtract plane for (y = 0; y < height; ++y) { @@ -1079,21 +1497,23 @@ int ARGBSubtract(const uint8* src_argb0, int src_stride_argb0, return 0; } // Convert I422 to RGBA with matrix -static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, +static int I422ToRGBAMatrix(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, const struct YuvConstants* yuvconstants, - int width, int height) { + int width, + int height) { int y; - void (*I422ToRGBARow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = I422ToRGBARow_C; - if (!src_y || !src_u || !src_v || !dst_rgba || - width <= 0 || height == 0) { + void (*I422ToRGBARow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I422ToRGBARow_C; + if (!src_y || !src_u || !src_v || !dst_rgba || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1126,13 +1546,12 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, } } #endif -#if defined(HAS_I422TORGBAROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_rgba, 4) && IS_ALIGNED(dst_stride_rgba, 4)) { - I422ToRGBARow = I422ToRGBARow_DSPR2; +#if defined(HAS_I422TORGBAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToRGBARow = I422ToRGBARow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + I422ToRGBARow = I422ToRGBARow_MSA; + } } #endif @@ -1148,48 +1567,55 @@ static int I422ToRGBAMatrix(const uint8* src_y, int src_stride_y, // Convert I422 to RGBA. LIBYUV_API -int I422ToRGBA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_rgba, int dst_stride_rgba, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_rgba, dst_stride_rgba, - &kYuvI601Constants, - width, height); +int I422ToRGBA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_rgba, + int dst_stride_rgba, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_rgba, dst_stride_rgba, + &kYuvI601Constants, width, height); } // Convert I422 to BGRA. 
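The BGRA variant that follows needs no row code of its own: handing the V plane in where U is expected and using the mirrored kYvuI601Constants swaps the red and blue contributions, so the RGBA writer ends up emitting BGRA. Spelled out, its body reduces to:

// I422ToBGRA(src_y, ..., dst_bgra, ...) forwards to the RGBA path:
//   I422ToRGBAMatrix(src_y, src_stride_y,
//                    src_v, src_stride_v,  // V supplied where U is expected
//                    src_u, src_stride_u,  // U supplied where V is expected
//                    dst_bgra, dst_stride_bgra,
//                    &kYvuI601Constants,   // mirrored coefficients
//                    width, height);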
LIBYUV_API -int I422ToBGRA(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_bgra, int dst_stride_bgra, - int width, int height) { - return I422ToRGBAMatrix(src_y, src_stride_y, - src_v, src_stride_v, // Swap U and V - src_u, src_stride_u, - dst_bgra, dst_stride_bgra, +int I422ToBGRA(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_bgra, + int dst_stride_bgra, + int width, + int height) { + return I422ToRGBAMatrix(src_y, src_stride_y, src_v, + src_stride_v, // Swap U and V + src_u, src_stride_u, dst_bgra, dst_stride_bgra, &kYvuI601Constants, // Use Yvu matrix width, height); } // Convert NV12 to RGB565. LIBYUV_API -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb565, int dst_stride_rgb565, - int width, int height) { +int NV12ToRGB565(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_rgb565, + int dst_stride_rgb565, + int width, + int height) { int y; - void (*NV12ToRGB565Row)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) = NV12ToRGB565Row_C; - if (!src_y || !src_uv || !dst_rgb565 || - width <= 0 || height == 0) { + void (*NV12ToRGB565Row)( + const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = NV12ToRGB565Row_C; + if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1222,6 +1648,14 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_NV12TORGB565ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_MSA; + if (IS_ALIGNED(width, 8)) { + NV12ToRGB565Row = NV12ToRGB565Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { NV12ToRGB565Row(src_y, src_uv, dst_rgb565, &kYuvI601Constants, width); @@ -1236,14 +1670,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // Convert RAW to RGB24. LIBYUV_API -int RAWToRGB24(const uint8* src_raw, int src_stride_raw, - uint8* dst_rgb24, int dst_stride_rgb24, - int width, int height) { +int RAWToRGB24(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; - void (*RAWToRGB24Row)(const uint8* src_rgb, uint8* dst_rgb24, int width) = + void (*RAWToRGB24Row)(const uint8_t* src_rgb, uint8_t* dst_rgb24, int width) = RAWToRGB24Row_C; - if (!src_raw || !dst_rgb24 || - width <= 0 || height == 0) { + if (!src_raw || !dst_rgb24 || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -1253,8 +1689,7 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } // Coalesce rows. 
- if (src_stride_raw == width * 3 && - dst_stride_rgb24 == width * 3) { + if (src_stride_raw == width * 3 && dst_stride_rgb24 == width * 3) { width *= height; height = 1; src_stride_raw = dst_stride_rgb24 = 0; @@ -1275,6 +1710,14 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } } #endif +#if defined(HAS_RAWTORGB24ROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RAWToRGB24Row = RAWToRGB24Row_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RAWToRGB24Row = RAWToRGB24Row_MSA; + } + } +#endif for (y = 0; y < height; ++y) { RAWToRGB24Row(src_raw, dst_rgb24, width); @@ -1285,11 +1728,13 @@ int RAWToRGB24(const uint8* src_raw, int src_stride_raw, } LIBYUV_API -void SetPlane(uint8* dst_y, int dst_stride_y, - int width, int height, - uint32 value) { +void SetPlane(uint8_t* dst_y, + int dst_stride_y, + int width, + int height, + uint32_t value) { int y; - void (*SetRow)(uint8* dst, uint8 value, int width) = SetRow_C; + void (*SetRow)(uint8_t * dst, uint8_t value, int width) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1322,6 +1767,11 @@ void SetPlane(uint8* dst_y, int dst_stride_y, SetRow = SetRow_ERMS; } #endif +#if defined(HAS_SETROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 16)) { + SetRow = SetRow_MSA; + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1332,22 +1782,26 @@ void SetPlane(uint8* dst_y, int dst_stride_y, // Draw a rectangle into I420 LIBYUV_API -int I420Rect(uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int x, int y, - int width, int height, - int value_y, int value_u, int value_v) { +int I420Rect(uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int x, + int y, + int width, + int height, + int value_y, + int value_u, + int value_v) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - uint8* start_y = dst_y + y * dst_stride_y + x; - uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); - uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); - if (!dst_y || !dst_u || !dst_v || - width <= 0 || height == 0 || - x < 0 || y < 0 || - value_y < 0 || value_y > 255 || - value_u < 0 || value_u > 255 || + uint8_t* start_y = dst_y + y * dst_stride_y + x; + uint8_t* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); + uint8_t* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); + if (!dst_y || !dst_u || !dst_v || width <= 0 || height == 0 || x < 0 || + y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || value_v < 0 || value_v > 255) { return -1; } @@ -1360,15 +1814,17 @@ int I420Rect(uint8* dst_y, int dst_stride_y, // Draw a rectangle into ARGB LIBYUV_API -int ARGBRect(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height, - uint32 value) { +int ARGBRect(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height, + uint32_t value) { int y; - void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int width) = ARGBSetRow_C; - if (!dst_argb || - width <= 0 || height == 0 || - dst_x < 0 || dst_y < 0) { + void (*ARGBSetRow)(uint8_t * dst_argb, uint32_t value, int width) = + ARGBSetRow_C; + if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { return -1; } if (height < 0) { @@ -1397,6 +1853,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, ARGBSetRow = ARGBSetRow_X86; } #endif +#if defined(HAS_ARGBSETROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBSetRow = 
ARGBSetRow_Any_MSA; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_MSA; + } + } +#endif // Set plane for (y = 0; y < height; ++y) { @@ -1420,11 +1884,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, // f is foreground pixel premultiplied by alpha LIBYUV_API -int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBAttenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBAttenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1435,8 +1902,7 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1465,6 +1931,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBAttenuateRow(src_argb, dst_argb, width); @@ -1476,11 +1950,14 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb, // Convert preattentuated ARGB to unattenuated ARGB. LIBYUV_API -int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBUnattenuate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBUnattenuateRow)(const uint8* src_argb, uint8* dst_argb, + void (*ARGBUnattenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBUnattenuateRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1491,8 +1968,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1513,7 +1989,7 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, } } #endif -// TODO(fbarchard): Neon version. + // TODO(fbarchard): Neon version. for (y = 0; y < height; ++y) { ARGBUnattenuateRow(src_argb, dst_argb, width); @@ -1525,12 +2001,15 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, // Convert ARGB to Grayed ARGB. 
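The grayscale path that follows replaces B, G and R with one luma value while leaving alpha alone. As a scalar model: libyuv's ARGB buffers are byte-ordered B, G, R, A, and its full-range luma uses small fixed-point BT.601 weights, so treat the exact constants and rounding below as illustrative:

// Gray one ARGB pixel in place: y ~= 0.299 R + 0.587 G + 0.114 B.
static void GrayPixel(uint8_t* argb) {  // argb[0..3] = B, G, R, A
  int y = (15 * argb[0] + 75 * argb[1] + 38 * argb[2] + 64) >> 7;
  argb[0] = argb[1] = argb[2] = (uint8_t)y;  // alpha argb[3] is preserved
}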
LIBYUV_API -int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBGrayTo(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -1540,8 +2019,7 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1556,6 +2034,11 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBGrayRow(src_argb, dst_argb, width); @@ -1567,13 +2050,16 @@ int ARGBGrayTo(const uint8* src_argb, int src_stride_argb, // Make a rectangle of ARGB gray scale. LIBYUV_API -int ARGBGray(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, - int width, int height) { +int ARGBGray(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBGrayRow)(const uint8* src_argb, uint8* dst_argb, - int width) = ARGBGrayRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBGrayRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + ARGBGrayRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1593,6 +2079,12 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, ARGBGrayRow = ARGBGrayRow_NEON; } #endif +#if defined(HAS_ARGBGRAYROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBGrayRow = ARGBGrayRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBGrayRow(dst, dst, width); dst += dst_stride_argb; @@ -1602,11 +2094,15 @@ int ARGBGray(uint8* dst_argb, int dst_stride_argb, // Make a rectangle of ARGB Sepia tone. LIBYUV_API -int ARGBSepia(uint8* dst_argb, int dst_stride_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBSepia(uint8_t* dst_argb, + int dst_stride_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBSepiaRow)(uint8* dst_argb, int width) = ARGBSepiaRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + void (*ARGBSepiaRow)(uint8_t * dst_argb, int width) = ARGBSepiaRow_C; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0) { return -1; } @@ -1626,6 +2122,12 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, ARGBSepiaRow = ARGBSepiaRow_NEON; } #endif +#if defined(HAS_ARGBSEPIAROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_MSA; + } +#endif + for (y = 0; y < height; ++y) { ARGBSepiaRow(dst, width); dst += dst_stride_argb; @@ -1636,13 +2138,17 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, // Apply a 4x4 matrix to each ARGB pixel. // Note: Normally for shading, but can be used to swizzle or invert. 
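Coefficients for the matrix function that follows are signed fixed point with 64 representing 1.0 (see the matrix_argb[15] = 64; // 1.0 line in the RGBColorMatrix hunk further down); assuming each group of four coefficients produces one output channel from the B, G, R, A inputs in that order, two example matrices look like this:

// Identity: every channel passes through unchanged (64 == 1.0).
static const int8_t kIdentityMatrix[16] = {
    64, 0,  0,  0,   // B_out = B
    0,  64, 0,  0,   // G_out = G
    0,  0,  64, 0,   // R_out = R
    0,  0,  0,  64,  // A_out = A
};
// Swizzle: swap blue and red, as the note above suggests is possible.
static const int8_t kSwapBRMatrix[16] = {
    0,  0,  64, 0,   // B_out = R
    0,  64, 0,  0,   // G_out = G
    64, 0,  0,  0,   // R_out = B
    0,  0,  0,  64,  // A_out = A
};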
LIBYUV_API -int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const int8* matrix_argb, - int width, int height) { +int ARGBColorMatrix(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_argb, + int width, + int height) { int y; - void (*ARGBColorMatrixRow)(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) = ARGBColorMatrixRow_C; + void (*ARGBColorMatrixRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const int8_t* matrix_argb, int width) = + ARGBColorMatrixRow_C; if (!src_argb || !dst_argb || !matrix_argb || width <= 0 || height == 0) { return -1; } @@ -1652,8 +2158,7 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1667,6 +2172,11 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBColorMatrixRow = ARGBColorMatrixRow_NEON; } +#endif +#if defined(HAS_ARGBCOLORMATRIXROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBColorMatrixRow = ARGBColorMatrixRow_MSA; + } #endif for (y = 0; y < height; ++y) { ARGBColorMatrixRow(src_argb, dst_argb, matrix_argb, width); @@ -1679,13 +2189,17 @@ int ARGBColorMatrix(const uint8* src_argb, int src_stride_argb, // Apply a 4x3 matrix to each ARGB pixel. // Deprecated. LIBYUV_API -int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, - const int8* matrix_rgb, - int dst_x, int dst_y, int width, int height) { - SIMD_ALIGNED(int8 matrix_argb[16]); - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { +int RGBColorMatrix(uint8_t* dst_argb, + int dst_stride_argb, + const int8_t* matrix_rgb, + int dst_x, + int dst_y, + int width, + int height) { + SIMD_ALIGNED(int8_t matrix_argb[16]); + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !matrix_rgb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } @@ -1705,23 +2219,26 @@ int RGBColorMatrix(uint8* dst_argb, int dst_stride_argb, matrix_argb[14] = matrix_argb[13] = matrix_argb[12] = 0; matrix_argb[15] = 64; // 1.0 - return ARGBColorMatrix((const uint8*)(dst), dst_stride_argb, - dst, dst_stride_argb, - &matrix_argb[0], width, height); + return ARGBColorMatrix((const uint8_t*)(dst), dst_stride_argb, dst, + dst_stride_argb, &matrix_argb[0], width, height); } // Apply a color table each ARGB pixel. // Table contains 256 ARGB values. 
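For the table-lookup functions that follow, the table holds 256 entries of 4 bytes each, per the comment above; assuming entry i carries the per-channel replacement for input value i (the per-channel indexing is an assumption for illustration), a gamma-correction table can be built like this:

#include <math.h>
#include <stdint.h>

// Fill a 256 x 4 byte table applying gamma to B, G, R and identity to A.
static void BuildGammaTable(uint8_t table[256 * 4], double gamma) {
  int i;
  for (i = 0; i < 256; ++i) {
    uint8_t g = (uint8_t)(255.0 * pow(i / 255.0, gamma) + 0.5);
    table[i * 4 + 0] = g;           // B
    table[i * 4 + 1] = g;           // G
    table[i * 4 + 2] = g;           // R
    table[i * 4 + 3] = (uint8_t)i;  // A passes through
  }
}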
LIBYUV_API -int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { +int ARGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*ARGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = ARGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1745,15 +2262,19 @@ int ARGBColorTable(uint8* dst_argb, int dst_stride_argb, // Apply a color table each ARGB pixel but preserve destination alpha. // Table contains 256 ARGB values. LIBYUV_API -int RGBColorTable(uint8* dst_argb, int dst_stride_argb, - const uint8* table_argb, - int dst_x, int dst_y, int width, int height) { +int RGBColorTable(uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* table_argb, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*RGBColorTableRow)(uint8* dst_argb, const uint8* table_argb, + void (*RGBColorTableRow)(uint8_t * dst_argb, const uint8_t* table_argb, int width) = RGBColorTableRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; - if (!dst_argb || !table_argb || width <= 0 || height <= 0 || - dst_x < 0 || dst_y < 0) { + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + if (!dst_argb || !table_argb || width <= 0 || height <= 0 || dst_x < 0 || + dst_y < 0) { return -1; } // Coalesce rows. @@ -1784,13 +2305,19 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb, // Caveat - although SSE2 saturates, the C function does not and should be used // with care if doing anything but quantization. LIBYUV_API -int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, - int scale, int interval_size, int interval_offset, - int dst_x, int dst_y, int width, int height) { +int ARGBQuantize(uint8_t* dst_argb, + int dst_stride_argb, + int scale, + int interval_size, + int interval_offset, + int dst_x, + int dst_y, + int width, + int height) { int y; - void (*ARGBQuantizeRow)(uint8* dst_argb, int scale, int interval_size, + void (*ARGBQuantizeRow)(uint8_t * dst_argb, int scale, int interval_size, int interval_offset, int width) = ARGBQuantizeRow_C; - uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; + uint8_t* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; if (!dst_argb || width <= 0 || height <= 0 || dst_x < 0 || dst_y < 0 || interval_size < 1 || interval_size > 255) { return -1; @@ -1810,6 +2337,11 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { ARGBQuantizeRow = ARGBQuantizeRow_NEON; } +#endif +#if defined(HAS_ARGBQUANTIZEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 8)) { + ARGBQuantizeRow = ARGBQuantizeRow_MSA; + } #endif for (y = 0; y < height; ++y) { ARGBQuantizeRow(dst, scale, interval_size, interval_offset, width); @@ -1821,13 +2353,17 @@ int ARGBQuantize(uint8* dst_argb, int dst_stride_argb, // Computes table of cumulative sum for image where the value is the sum // of all values above and to the left of the entry. Used by ARGBBlur. 
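Given such a table C, the sum over any half-open box [x0,x1) x [y0,y1) collapses to four lookups, C(x1,y1) - C(x1,y0) - C(x0,y1) + C(x0,y0), regardless of box size, which is why the blur below costs the same at any radius. A one-channel sketch, assuming the table carries a leading row and column of zeros:

// Box average from a cumulative-sum table; stride is in table elements.
static int32_t BoxAverage(const int32_t* c, int stride,
                          int x0, int y0, int x1, int y1) {
  int32_t sum = c[y1 * stride + x1] - c[y0 * stride + x1] -
                c[y1 * stride + x0] + c[y0 * stride + x0];
  return sum / ((x1 - x0) * (y1 - y0));  // area of the half-open box
}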
LIBYUV_API -int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height) { +int ARGBComputeCumulativeSum(const uint8_t* src_argb, + int src_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height) { int y; - void (*ComputeCumulativeSumRow)(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - int32* previous_cumsum = dst_cumsum; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + int32_t* previous_cumsum = dst_cumsum; if (!dst_cumsum || !src_argb || width <= 0 || height <= 0) { return -1; } @@ -1851,18 +2387,25 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // aligned to 16 byte boundary. height can be radius * 2 + 2 to save memory // as the buffer is treated as circular. LIBYUV_API -int ARGBBlur(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int32* dst_cumsum, int dst_stride32_cumsum, - int width, int height, int radius) { +int ARGBBlur(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int32_t* dst_cumsum, + int dst_stride32_cumsum, + int width, + int height, + int radius) { int y; - void (*ComputeCumulativeSumRow)(const uint8 *row, int32 *cumsum, - const int32* previous_cumsum, int width) = ComputeCumulativeSumRow_C; - void (*CumulativeSumToAverageRow)(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, int count) = CumulativeSumToAverageRow_C; - int32* cumsum_bot_row; - int32* max_cumsum_bot_row; - int32* cumsum_top_row; + void (*ComputeCumulativeSumRow)(const uint8_t* row, int32_t* cumsum, + const int32_t* previous_cumsum, int width) = + ComputeCumulativeSumRow_C; + void (*CumulativeSumToAverageRow)( + const int32_t* topleft, const int32_t* botleft, int width, int area, + uint8_t* dst, int count) = CumulativeSumToAverageRow_C; + int32_t* cumsum_bot_row; + int32_t* max_cumsum_bot_row; + int32_t* cumsum_top_row; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1889,9 +2432,8 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, #endif // Compute enough CumulativeSum for first row to be blurred. After this // one row of CumulativeSum is updated at a time. - ARGBComputeCumulativeSum(src_argb, src_stride_argb, - dst_cumsum, dst_stride32_cumsum, - width, radius); + ARGBComputeCumulativeSum(src_argb, src_stride_argb, dst_cumsum, + dst_stride32_cumsum, width, radius); src_argb = src_argb + radius * src_stride_argb; cumsum_bot_row = &dst_cumsum[(radius - 1) * dst_stride32_cumsum]; @@ -1917,7 +2459,7 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Increment cumsum_bot_row pointer with circular buffer wrap around and // then fill in a row of CumulativeSum. if ((y + radius) < height) { - const int32* prev_cumsum_bot_row = cumsum_bot_row; + const int32_t* prev_cumsum_bot_row = cumsum_bot_row; cumsum_bot_row += dst_stride32_cumsum; if (cumsum_bot_row >= max_cumsum_bot_row) { cumsum_bot_row = dst_cumsum; @@ -1929,24 +2471,24 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Left clipped. 
for (x = 0; x < radius + 1; ++x) { - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], 1); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], 1); area += (bot_y - top_y); boxwidth += 4; } // Middle unclipped. n = (width - 1) - radius - x + 1; - CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, - boxwidth, area, &dst_argb[x * 4], n); + CumulativeSumToAverageRow(cumsum_top_row, cumsum_bot_row, boxwidth, area, + &dst_argb[x * 4], n); // Right clipped. for (x += n; x <= width - 1; ++x) { area -= (bot_y - top_y); boxwidth -= 4; CumulativeSumToAverageRow(cumsum_top_row + (x - radius - 1) * 4, - cumsum_bot_row + (x - radius - 1) * 4, - boxwidth, area, &dst_argb[x * 4], 1); + cumsum_bot_row + (x - radius - 1) * 4, boxwidth, + area, &dst_argb[x * 4], 1); } dst_argb += dst_stride_argb; } @@ -1955,12 +2497,16 @@ int ARGBBlur(const uint8* src_argb, int src_stride_argb, // Multiply ARGB image by a specified ARGB value. LIBYUV_API -int ARGBShade(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, uint32 value) { +int ARGBShade(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + uint32_t value) { int y; - void (*ARGBShadeRow)(const uint8* src_argb, uint8* dst_argb, - int width, uint32 value) = ARGBShadeRow_C; + void (*ARGBShadeRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width, + uint32_t value) = ARGBShadeRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0 || value == 0u) { return -1; } @@ -1970,8 +2516,7 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -1986,6 +2531,11 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, ARGBShadeRow = ARGBShadeRow_NEON; } #endif +#if defined(HAS_ARGBSHADEROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 4)) { + ARGBShadeRow = ARGBShadeRow_MSA; + } +#endif for (y = 0; y < height; ++y) { ARGBShadeRow(src_argb, dst_argb, width, value); @@ -1997,12 +2547,17 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb, // Interpolate 2 planes by specified amount (0 to 255). LIBYUV_API -int InterpolatePlane(const uint8* src0, int src_stride0, - const uint8* src1, int src_stride1, - uint8* dst, int dst_stride, - int width, int height, int interpolation) { +int InterpolatePlane(const uint8_t* src0, + int src_stride0, + const uint8_t* src1, + int src_stride1, + uint8_t* dst, + int dst_stride, + int width, + int height, + int interpolation) { int y; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; if (!src0 || !src1 || !dst || width <= 0 || height == 0) { @@ -2015,9 +2570,7 @@ int InterpolatePlane(const uint8* src0, int src_stride0, dst_stride = -dst_stride; } // Coalesce rows. 
- if (src_stride0 == width && - src_stride1 == width && - dst_stride == width) { + if (src_stride0 == width && src_stride1 == width && dst_stride == width) { width *= height; height = 1; src_stride0 = src_stride1 = dst_stride = 0; @@ -2046,13 +2599,12 @@ int InterpolatePlane(const uint8* src0, int src_stride0, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src0, 4) && IS_ALIGNED(src_stride0, 4) && - IS_ALIGNED(src1, 4) && IS_ALIGNED(src_stride1, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4) && - IS_ALIGNED(width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif @@ -2067,61 +2619,71 @@ int InterpolatePlane(const uint8* src0, int src_stride0, // Interpolate 2 ARGB images by specified amount (0 to 255). LIBYUV_API -int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, - const uint8* src_argb1, int src_stride_argb1, - uint8* dst_argb, int dst_stride_argb, - int width, int height, int interpolation) { - return InterpolatePlane(src_argb0, src_stride_argb0, - src_argb1, src_stride_argb1, - dst_argb, dst_stride_argb, +int ARGBInterpolate(const uint8_t* src_argb0, + int src_stride_argb0, + const uint8_t* src_argb1, + int src_stride_argb1, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + int interpolation) { + return InterpolatePlane(src_argb0, src_stride_argb0, src_argb1, + src_stride_argb1, dst_argb, dst_stride_argb, width * 4, height, interpolation); } // Interpolate 2 YUV images by specified amount (0 to 255). LIBYUV_API -int I420Interpolate(const uint8* src0_y, int src0_stride_y, - const uint8* src0_u, int src0_stride_u, - const uint8* src0_v, int src0_stride_v, - const uint8* src1_y, int src1_stride_y, - const uint8* src1_u, int src1_stride_u, - const uint8* src1_v, int src1_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, int interpolation) { +int I420Interpolate(const uint8_t* src0_y, + int src0_stride_y, + const uint8_t* src0_u, + int src0_stride_u, + const uint8_t* src0_v, + int src0_stride_v, + const uint8_t* src1_y, + int src1_stride_y, + const uint8_t* src1_u, + int src1_stride_u, + const uint8_t* src1_v, + int src1_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int interpolation) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src0_y || !src0_u || !src0_v || - !src1_y || !src1_u || !src1_v || - !dst_y || !dst_u || !dst_v || - width <= 0 || height == 0) { + if (!src0_y || !src0_u || !src0_v || !src1_y || !src1_u || !src1_v || + !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { return -1; } - InterpolatePlane(src0_y, src0_stride_y, - src1_y, src1_stride_y, - dst_y, dst_stride_y, - width, height, interpolation); - InterpolatePlane(src0_u, src0_stride_u, - src1_u, src1_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight, interpolation); - InterpolatePlane(src0_v, src0_stride_v, - src1_v, src1_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight, interpolation); + InterpolatePlane(src0_y, src0_stride_y, src1_y, src1_stride_y, dst_y, + dst_stride_y, width, height, interpolation); + InterpolatePlane(src0_u, src0_stride_u, src1_u, src1_stride_u, dst_u, + dst_stride_u, 
halfwidth, halfheight, interpolation); + InterpolatePlane(src0_v, src0_stride_v, src1_v, src1_stride_v, dst_v, + dst_stride_v, halfwidth, halfheight, interpolation); return 0; } // Shuffle ARGB channel order. e.g. BGRA to ARGB. LIBYUV_API -int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_argb, int dst_stride_argb, - const uint8* shuffler, int width, int height) { +int ARGBShuffle(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* shuffler, + int width, + int height) { int y; - void (*ARGBShuffleRow)(const uint8* src_bgra, uint8* dst_argb, - const uint8* shuffler, int width) = ARGBShuffleRow_C; - if (!src_bgra || !dst_argb || - width <= 0 || height == 0) { + void (*ARGBShuffleRow)(const uint8_t* src_bgra, uint8_t* dst_argb, + const uint8_t* shuffler, int width) = ARGBShuffleRow_C; + if (!src_bgra || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2131,20 +2693,11 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, src_stride_bgra = -src_stride_bgra; } // Coalesce rows. - if (src_stride_bgra == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_bgra == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_bgra = dst_stride_argb = 0; } -#if defined(HAS_ARGBSHUFFLEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBShuffleRow = ARGBShuffleRow_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBShuffleRow = ARGBShuffleRow_SSE2; - } - } -#endif #if defined(HAS_ARGBSHUFFLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBShuffleRow = ARGBShuffleRow_Any_SSSE3; @@ -2169,6 +2722,14 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } } #endif +#if defined(HAS_ARGBSHUFFLEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBShuffleRow = ARGBShuffleRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBShuffleRow = ARGBShuffleRow_MSA; + } + } +#endif for (y = 0; y < height; ++y) { ARGBShuffleRow(src_bgra, dst_argb, shuffler, width); @@ -2179,28 +2740,32 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. -static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, - void (*SobelRow)(const uint8* src_sobelx, - const uint8* src_sobely, - uint8* dst, int width)) { +static int ARGBSobelize(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, + void (*SobelRow)(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst, + int width)) { int y; - void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int width) = + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) = ARGBToYJRow_C; - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) = SobelYRow_C; - void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobely, int width) = + void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, + uint8_t* dst_sobely, int width) = SobelYRow_C; + void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, + const uint8_t* src_y2, uint8_t* dst_sobely, int width) = SobelXRow_C; const int kEdge = 16; // Extra pixels at start of row for extrude/align. - if (!src_argb || !dst_argb || width <= 0 || height == 0) { + if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } @@ -2228,6 +2793,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBToYJRow = ARGBToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_MSA; + } + } +#endif #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -2239,6 +2812,11 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, SobelYRow = SobelYRow_NEON; } #endif +#if defined(HAS_SOBELYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelYRow = SobelYRow_MSA; + } +#endif #if defined(HAS_SOBELXROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXRow = SobelXRow_SSE2; @@ -2248,19 +2826,24 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, if (TestCpuFlag(kCpuHasNEON)) { SobelXRow = SobelXRow_NEON; } +#endif +#if defined(HAS_SOBELXROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXRow = SobelXRow_MSA; + } #endif { // 3 rows with edges before/after. const int kRowSize = (width + kEdge + 31) & ~31; align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); - uint8* row_sobelx = rows; - uint8* row_sobely = rows + kRowSize; - uint8* row_y = rows + kRowSize * 2; + uint8_t* row_sobelx = rows; + uint8_t* row_sobely = rows + kRowSize; + uint8_t* row_y = rows + kRowSize * 2; // Convert first row. - uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kRowSize; - uint8* row_y2 = row_y1 + kRowSize; + uint8_t* row_y0 = row_y + kEdge; + uint8_t* row_y1 = row_y0 + kRowSize; + uint8_t* row_y2 = row_y1 + kRowSize; ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. @@ -2284,7 +2867,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Cycle thru circular queue of 3 row_y buffers. { - uint8* row_yt = row_y0; + uint8_t* row_yt = row_y0; row_y0 = row_y1; row_y1 = row_y2; row_y2 = row_yt; @@ -2299,11 +2882,14 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect. LIBYUV_API -int ARGBSobel(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelRow_C; +int ARGBSobel(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelRow = SobelRow_Any_SSE2; @@ -2319,6 +2905,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, SobelRow = SobelRow_NEON; } } +#endif +#if defined(HAS_SOBELROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelRow = SobelRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_MSA; + } + } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelRow); @@ -2326,11 +2920,14 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, // Sobel ARGB effect with planar output. 
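The row functions selected in the hunks above implement the classic 3x3 Sobel pair, and the output byte is the saturated sum of the two gradient magnitudes. As a scalar reference (the kernel layout is the textbook one, not a quote of the row code):

#include <stdlib.h>  // abs

// Sobel response at column x from three rows of luma (y0 top, y2 bottom).
static uint8_t SobelPixel(const uint8_t* y0, const uint8_t* y1,
                          const uint8_t* y2, int x) {
  int gx = -y0[x - 1] + y0[x + 1]               // [-1  0  1]
           - 2 * y1[x - 1] + 2 * y1[x + 1]      // [-2  0  2]
           - y2[x - 1] + y2[x + 1];             // [-1  0  1]
  int gy = -y0[x - 1] - 2 * y0[x] - y0[x + 1]   // [-1 -2 -1]
           + y2[x - 1] + 2 * y2[x] + y2[x + 1]; // [ 1  2  1]
  int g = abs(gx) + abs(gy);
  return (uint8_t)(g > 255 ? 255 : g);  // saturate to a byte
}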
LIBYUV_API -int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, - uint8* dst_y, int dst_stride_y, - int width, int height) { - void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_, int width) = SobelToPlaneRow_C; +int ARGBSobelToPlane(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { + void (*SobelToPlaneRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; @@ -2347,18 +2944,29 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, } } #endif - return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, - width, height, SobelToPlaneRow); +#if defined(HAS_SOBELTOPLANEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelToPlaneRow = SobelToPlaneRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SobelToPlaneRow = SobelToPlaneRow_MSA; + } + } +#endif + return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, width, + height, SobelToPlaneRow); } // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. LIBYUV_API -int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { - void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) = SobelXYRow_C; +int ARGBSobelXY(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + void (*SobelXYRow)(const uint8_t* src_sobelx, const uint8_t* src_sobely, + uint8_t* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelXYRow = SobelXYRow_Any_SSE2; @@ -2374,6 +2982,14 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, SobelXYRow = SobelXYRow_NEON; } } +#endif +#if defined(HAS_SOBELXYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SobelXYRow = SobelXYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_MSA; + } + } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height, SobelXYRow); @@ -2381,26 +2997,27 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, // Apply a 4x4 polynomial to each ARGB pixel. LIBYUV_API -int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, +int ARGBPolynomial(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, const float* poly, - int width, int height) { + int width, + int height) { int y; - void (*ARGBPolynomialRow)(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) = ARGBPolynomialRow_C; + void (*ARGBPolynomialRow)(const uint8_t* src_argb, uint8_t* dst_argb, + const float* poly, int width) = ARGBPolynomialRow_C; if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. 
- if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2425,28 +3042,132 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, return 0; } +// Convert plane of 16 bit shorts to half floats. +// Source values are multiplied by scale before storing as half float. +LIBYUV_API +int HalfFloatPlane(const uint16_t* src_y, + int src_stride_y, + uint16_t* dst_y, + int dst_stride_y, + float scale, + int width, + int height) { + int y; + void (*HalfFloatRow)(const uint16_t* src, uint16_t* dst, float scale, + int width) = HalfFloatRow_C; + if (!src_y || !dst_y || width <= 0 || height == 0) { + return -1; + } + src_stride_y >>= 1; + dst_stride_y >>= 1; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_stride_y = -src_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_HALFFLOATROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + HalfFloatRow = HalfFloatRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = HalfFloatRow_SSE2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + HalfFloatRow = HalfFloatRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = HalfFloatRow_AVX2; + } + } +#endif +#if defined(HAS_HALFFLOATROW_F16C) + if (TestCpuFlag(kCpuHasAVX2) && TestCpuFlag(kCpuHasF16C)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_F16C : HalfFloatRow_Any_F16C; + if (IS_ALIGNED(width, 16)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_F16C : HalfFloatRow_F16C; + } + } +#endif +#if defined(HAS_HALFFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + HalfFloatRow = + (scale == 1.0f) ? HalfFloat1Row_Any_NEON : HalfFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + HalfFloatRow = (scale == 1.0f) ? HalfFloat1Row_NEON : HalfFloatRow_NEON; + } + } +#endif +#if defined(HAS_HALFFLOATROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + HalfFloatRow = HalfFloatRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + HalfFloatRow = HalfFloatRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + HalfFloatRow(src_y, dst_y, scale, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } + return 0; +} + +// Convert a buffer of bytes to floats, scale the values and store as floats. +LIBYUV_API +int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) { + void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale, + int width) = ByteToFloatRow_C; + if (!src_y || !dst_y || width <= 0) { + return -1; + } +#if defined(HAS_BYTETOFLOATROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ByteToFloatRow = ByteToFloatRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ByteToFloatRow = ByteToFloatRow_NEON; + } + } +#endif + + ByteToFloatRow(src_y, dst_y, scale, width); + return 0; +} + // Apply a lumacolortable to each ARGB pixel. 
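Before ARGBLumaColorTable, a note on HalfFloatPlane, just added above: it takes byte strides (halving them internally to step in uint16_t elements) and multiplies each source sample by scale before packing to half floats. A hedged usage sketch follows; the 1.0f / 1023.0f scale (normalizing 10-bit samples to [0, 1]) and the header location are illustrative assumptions, not something this patch prescribes:

    #include <stdint.h>
    #include "libyuv/planar_functions.h" /* assumed home of HalfFloatPlane */

    enum { kW = 64, kH = 4 };

    static int normalize_10bit_plane(const uint16_t* src, uint16_t* dst) {
      /* kW * 2: strides are in bytes. Tightly packed rows let the function
         coalesce the whole plane into a single internal row pass. */
      return HalfFloatPlane(src, kW * 2, dst, kW * 2, 1.0f / 1023.0f, kW, kH);
    }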
LIBYUV_API -int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - const uint8* luma, - int width, int height) { +int ARGBLumaColorTable(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + const uint8_t* luma, + int width, + int height) { int y; - void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, - int width, const uint8* luma, const uint32 lumacoeff) = - ARGBLumaColorTableRow_C; + void (*ARGBLumaColorTableRow)( + const uint8_t* src_argb, uint8_t* dst_argb, int width, + const uint8_t* luma, const uint32_t lumacoeff) = ARGBLumaColorTableRow_C; if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb = src_argb + (height - 1) * src_stride_argb; + src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2467,12 +3188,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, // Copy Alpha from one ARGB image to another. LIBYUV_API -int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBCopyAlphaRow)(const uint8* src_argb, uint8* dst_argb, int width) = - ARGBCopyAlphaRow_C; + void (*ARGBCopyAlphaRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBCopyAlphaRow_C; if (!src_argb || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2483,8 +3207,7 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride_argb == width * 4 && - dst_stride_argb == width * 4) { + if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_argb = dst_stride_argb = 0; @@ -2516,55 +3239,73 @@ int ARGBCopyAlpha(const uint8* src_argb, int src_stride_argb, // Extract just the alpha channel from ARGB. LIBYUV_API -int ARGBExtractAlpha(const uint8* src_argb, int src_stride, - uint8* dst_a, int dst_stride, - int width, int height) { +int ARGBExtractAlpha(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_a, + int dst_stride_a, + int width, + int height) { if (!src_argb || !dst_a || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. if (height < 0) { height = -height; - src_argb += (height - 1) * src_stride; - src_stride = -src_stride; + src_argb += (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; } // Coalesce rows. - if (src_stride == width * 4 && dst_stride == width) { + if (src_stride_argb == width * 4 && dst_stride_a == width) { width *= height; height = 1; - src_stride = dst_stride = 0; + src_stride_argb = dst_stride_a = 0; } - void (*ARGBExtractAlphaRow)(const uint8 *src_argb, uint8 *dst_a, int width) = - ARGBExtractAlphaRow_C; + void (*ARGBExtractAlphaRow)(const uint8_t* src_argb, uint8_t* dst_a, + int width) = ARGBExtractAlphaRow_C; #if defined(HAS_ARGBEXTRACTALPHAROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? 
ARGBExtractAlphaRow_SSE2 : ARGBExtractAlphaRow_Any_SSE2; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 32) ? ARGBExtractAlphaRow_AVX2 + : ARGBExtractAlphaRow_Any_AVX2; + } +#endif #if defined(HAS_ARGBEXTRACTALPHAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON : ARGBExtractAlphaRow_Any_NEON; } #endif +#if defined(HAS_ARGBEXTRACTALPHAROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_MSA + : ARGBExtractAlphaRow_Any_MSA; + } +#endif for (int y = 0; y < height; ++y) { ARGBExtractAlphaRow(src_argb, dst_a, width); - src_argb += src_stride; - dst_a += dst_stride; + src_argb += src_stride_argb; + dst_a += dst_stride_a; } return 0; } // Copy a planar Y channel to the alpha channel of a destination ARGB image. LIBYUV_API -int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int ARGBCopyYToAlpha(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int y; - void (*ARGBCopyYToAlphaRow)(const uint8* src_y, uint8* dst_argb, int width) = - ARGBCopyYToAlphaRow_C; + void (*ARGBCopyYToAlphaRow)(const uint8_t* src_y, uint8_t* dst_argb, + int width) = ARGBCopyYToAlphaRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; } @@ -2575,8 +3316,7 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, src_stride_y = -src_stride_y; } // Coalesce rows. - if (src_stride_y == width && - dst_stride_argb == width * 4) { + if (src_stride_y == width && dst_stride_argb == width * 4) { width *= height; height = 1; src_stride_y = dst_stride_argb = 0; @@ -2610,20 +3350,22 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, // directly. A SplitUVRow_Odd function could copy the remaining chroma. LIBYUV_API -int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int YUY2ToNV12(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_yuy2 || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
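Nearly every hunk in this file repeats one dispatch shape, and the MSA additions below for SplitUVRow and InterpolateRow follow it too: start from the portable C row function, switch to the _Any_ SIMD variant when the CPU flag is present (SIMD body plus a C pass for the ragged tail, as the TANY/TUVANY macros later in this patch show), and switch to the exact SIMD variant when the width divides the vector step. Distilled below with placeholder names; SomeRow_C and friends are stand-ins, not real libyuv symbols:

    #include <stdint.h>

    /* Stand-ins for the real row functions this patch dispatches between. */
    void SomeRow_C(const uint8_t* src, uint8_t* dst, int width);
    void SomeRow_Any_MSA(const uint8_t* src, uint8_t* dst, int width);
    void SomeRow_MSA(const uint8_t* src, uint8_t* dst, int width);

    void Dispatch(const uint8_t* src, uint8_t* dst, int width) {
      void (*SomeRow)(const uint8_t* src, uint8_t* dst, int width) = SomeRow_C;
    #if defined(HAS_SOMEROW_MSA)
      if (TestCpuFlag(kCpuHasMSA)) {
        SomeRow = SomeRow_Any_MSA;   /* SIMD body, C code for the leftover tail */
        if (IS_ALIGNED(width, 32)) {
          SomeRow = SomeRow_MSA;     /* pure SIMD when width divides evenly */
        }
      }
    #endif
      SomeRow(src, dst, width);
    }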
@@ -2656,6 +3398,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2680,6 +3430,14 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; @@ -2708,20 +3466,22 @@ int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, } LIBYUV_API -int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_y, int dst_stride_y, - uint8* dst_uv, int dst_stride_uv, - int width, int height) { +int UYVYToNV12(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { int y; int halfwidth = (width + 1) >> 1; - void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) = SplitUVRow_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; - if (!src_uyvy || - !dst_y || !dst_uv || - width <= 0 || height == 0) { + if (!src_uyvy || !dst_y || !dst_uv || width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -2754,6 +3514,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_SPLITUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + SplitUVRow = SplitUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_MSA; + } + } +#endif #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -2778,6 +3546,14 @@ int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, } } #endif +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_MSA; + } + } +#endif { int awidth = halfwidth * 2; diff --git a/libs/libvpx/third_party/libyuv/source/rotate.cc b/libs/libvpx/third_party/libyuv/source/rotate.cc index 01ea5c40744..f2bed85b755 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate.cc @@ -10,8 +10,8 @@ #include "libyuv/rotate.h" -#include "libyuv/cpu_id.h" #include "libyuv/convert.h" +#include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate_row.h" #include "libyuv/row.h" @@ -22,12 +22,20 @@ extern "C" { #endif LIBYUV_API -void TransposePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { int i = height; - void (*TransposeWx8)(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX16_MSA) + void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx16_C; +#else + void (*TransposeWx8)(const 
uint8_t* src, int src_stride, uint8_t* dst, + int dst_stride, int width) = TransposeWx8_C; +#endif #if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; @@ -49,24 +57,32 @@ void TransposePlane(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeWx8 = TransposeWx8_Fast_DSPR2; - } else { - TransposeWx8 = TransposeWx8_DSPR2; +#if defined(HAS_TRANSPOSEWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeWx16 = TransposeWx16_Any_MSA; + if (IS_ALIGNED(width, 16)) { + TransposeWx16 = TransposeWx16_MSA; } } #endif +#if defined(HAS_TRANSPOSEWX16_MSA) + // Work across the source in 16x16 tiles + while (i >= 16) { + TransposeWx16(src, src_stride, dst, dst_stride, width); + src += 16 * src_stride; // Go down 16 rows. + dst += 16; // Move over 16 columns. + i -= 16; + } +#else // Work across the source in 8x8 tiles while (i >= 8) { TransposeWx8(src, src_stride, dst, dst_stride, width); - src += 8 * src_stride; // Go down 8 rows. - dst += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); @@ -74,9 +90,12 @@ void TransposePlane(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane90(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Rotate by 90 is a transpose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. @@ -86,9 +105,12 @@ void RotatePlane90(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane270(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Rotate by 270 is a transpose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. @@ -98,17 +120,20 @@ void RotatePlane270(const uint8* src, int src_stride, } LIBYUV_API -void RotatePlane180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void RotatePlane180(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. 
align_buffer_64(row, width); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src + src_stride * (height - 1); + uint8_t* dst_bot = dst + dst_stride * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; + void (*CopyRow)(const uint8_t* src, uint8_t* dst, int width) = CopyRow_C; #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; @@ -133,12 +158,12 @@ void RotatePlane180(const uint8* src, int src_stride, } } #endif -// TODO(fbarchard): Mirror on mips handle unaligned memory. -#if defined(HAS_MIRRORROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { - MirrorRow = MirrorRow_DSPR2; +#if defined(HAS_MIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MirrorRow = MirrorRow_Any_MSA; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_MSA; + } } #endif #if defined(HAS_COPYROW_SSE2) @@ -161,11 +186,6 @@ void RotatePlane180(const uint8* src, int src_stride, CopyRow = IS_ALIGNED(width, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { @@ -181,15 +201,24 @@ } LIBYUV_API -void TransposeUV(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUV(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i = height; - void (*TransposeUVWx8)(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +#if defined(HAS_TRANSPOSEUVWX16_MSA) + void (*TransposeUVWx16)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, + int width) = TransposeUVWx16_C; +#else + void (*TransposeUVWx8)(const uint8_t* src, int src_stride, uint8_t* dst_a, + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; +#endif #if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; @@ -203,72 +232,90 @@ void TransposeUV(const uint8* src, int src_stride, } } #endif -#if defined(HAS_TRANSPOSEUVWX8_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(width, 2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeUVWx8 = TransposeUVWx8_DSPR2; +#if defined(HAS_TRANSPOSEUVWX16_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + TransposeUVWx16 = TransposeUVWx16_Any_MSA; + if (IS_ALIGNED(width, 8)) { + TransposeUVWx16 = TransposeUVWx16_MSA; + } } #endif +#if defined(HAS_TRANSPOSEUVWX16_MSA) + // Work through the source in 16x16 tiles. + while (i >= 16) { + TransposeUVWx16(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + src += 16 * src_stride;  // Go down 16 rows. + dst_a += 16;             // Move over 16 columns. + dst_b += 16;             // Move over 16 columns. + i -= 16; + } +#else // Work through the source in 8x8 tiles. 
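  // (Each pass below transposes a strip 8 rows tall and `width` UV pairs
  // wide; any remainder of fewer than 8 rows is finished by the portable
  // TransposeUVWxH_C call that follows the loop.)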
while (i >= 8) { - TransposeUVWx8(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWx8(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width); - src += 8 * src_stride; // Go down 8 rows. - dst_a += 8; // Move over 8 columns. - dst_b += 8; // Move over 8 columns. + src += 8 * src_stride; // Go down 8 rows. + dst_a += 8; // Move over 8 columns. + dst_b += 8; // Move over 8 columns. i -= 8; } +#endif if (i > 0) { - TransposeUVWxH_C(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, + TransposeUVWxH_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, i); } } LIBYUV_API -void RotateUV90(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV90(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { src += src_stride * (height - 1); src_stride = -src_stride; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } LIBYUV_API -void RotateUV270(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV270(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { dst_a += dst_stride_a * (width - 1); dst_b += dst_stride_b * (width - 1); dst_stride_a = -dst_stride_a; dst_stride_b = -dst_stride_b; - TransposeUV(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, height); + TransposeUV(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, width, + height); } // Rotate 180 is a horizontal and vertical flip. 
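RotateUV90 and RotateUV270 above move no pixels themselves: each is TransposeUV over a flipped view of one side, exactly as their comments describe, with the 90-degree case flipping the source vertically and the 270-degree case flipping the destination. The flip is pure pointer-and-stride arithmetic; a small illustrative sketch of the source-side flip, restating what RotateUV90 does:

    #include <stdint.h>

    /* A 90-degree rotation = transpose of the vertically flipped source:
       point at the last row and return a negated stride to walk upward. */
    static const uint8_t* flip_src(const uint8_t* src, int src_stride,
                                   int height, int* flipped_stride) {
      *flipped_stride = -src_stride;
      return src + src_stride * (height - 1);
    }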
LIBYUV_API -void RotateUV180(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void RotateUV180(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; - void (*MirrorUVRow)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) = - MirrorUVRow_C; + void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, + int width) = MirrorUVRow_C; #if defined(HAS_MIRRORUVROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_NEON; @@ -279,10 +326,9 @@ void RotateUV180(const uint8* src, int src_stride, MirrorUVRow = MirrorUVRow_SSSE3; } #endif -#if defined(HAS_MIRRORUVROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - MirrorUVRow = MirrorUVRow_DSPR2; +#if defined(HAS_MIRRORUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA) && IS_ALIGNED(width, 32)) { + MirrorUVRow = MirrorUVRow_MSA; } #endif @@ -298,9 +344,12 @@ void RotateUV180(const uint8* src, int src_stride, } LIBYUV_API -int RotatePlane(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height, +int RotatePlane(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height, enum RotationMode mode) { if (!src || width <= 0 || height == 0 || !dst) { return -1; @@ -316,24 +365,16 @@ int RotatePlane(const uint8* src, int src_stride, switch (mode) { case kRotate0: // copy frame - CopyPlane(src, src_stride, - dst, dst_stride, - width, height); + CopyPlane(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate90: - RotatePlane90(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane90(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate270: - RotatePlane270(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane270(src, src_stride, dst, dst_stride, width, height); return 0; case kRotate180: - RotatePlane180(src, src_stride, - dst, dst_stride, - width, height); + RotatePlane180(src, src_stride, dst, dst_stride, width, height); return 0; default: break; @@ -342,18 +383,25 @@ int RotatePlane(const uint8* src, int src_stride, } LIBYUV_API -int I420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int I420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { return -1; } @@ -372,45 +420,29 @@ int I420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return I420Copy(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - width, height); + return I420Copy(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, 
dst_stride_u, + dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane90(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane90(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane270(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane270(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotatePlane180(src_u, src_stride_u, - dst_u, dst_stride_u, - halfwidth, halfheight); - RotatePlane180(src_v, src_stride_v, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, + halfheight); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, + halfheight); return 0; default: break; @@ -419,17 +451,23 @@ int I420Rotate(const uint8* src_y, int src_stride_y, } LIBYUV_API -int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int width, int height, +int NV12ToI420Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, enum RotationMode mode) { int halfwidth = (width + 1) >> 1; int halfheight = (height + 1) >> 1; - if (!src_y || !src_uv || width <= 0 || height == 0 || - !dst_y || !dst_u || !dst_v) { + if (!src_y || !src_uv || width <= 0 || height == 0 || !dst_y || !dst_u || + !dst_v) { return -1; } @@ -446,38 +484,23 @@ int NV12ToI420Rotate(const uint8* src_y, int src_stride_y, switch (mode) { case kRotate0: // copy frame - return NV12ToI420(src_y, src_stride_y, - src_uv, src_stride_uv, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, + return NV12ToI420(src_y, src_stride_y, src_uv, src_stride_uv, dst_y, + dst_stride_y, dst_u, dst_stride_u, dst_v, dst_stride_v, width, height); case kRotate90: - RotatePlane90(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV90(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV90(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate270: - RotatePlane270(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV270(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, 
width, height); + RotateUV270(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; case kRotate180: - RotatePlane180(src_y, src_stride_y, - dst_y, dst_stride_y, - width, height); - RotateUV180(src_uv, src_stride_uv, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - halfwidth, halfheight); + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotateUV180(src_uv, src_stride_uv, dst_u, dst_stride_u, dst_v, + dst_stride_v, halfwidth, halfheight); return 0; default: break; diff --git a/libs/libvpx/third_party/libyuv/source/rotate_any.cc b/libs/libvpx/third_party/libyuv/source/rotate_any.cc index 31a74c31555..c2752e6222c 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_any.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_any.cc @@ -18,16 +18,16 @@ namespace libyuv { extern "C" { #endif -#define TANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst, int dst_stride, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ - } \ - TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r);\ - } +#define TANY(NAMEANY, TPOS_SIMD, MASK) \ + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst, \ + int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TransposeWx8_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } #ifdef HAS_TRANSPOSEWX8_NEON TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, 7) @@ -38,25 +38,23 @@ TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, 7) #ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, 15) #endif -#ifdef HAS_TRANSPOSEWX8_DSPR2 -TANY(TransposeWx8_Any_DSPR2, TransposeWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEWX16_MSA +TANY(TransposeWx16_Any_MSA, TransposeWx16_MSA, 15) #endif #undef TANY #define TUVANY(NAMEANY, TPOS_SIMD, MASK) \ - void NAMEANY(const uint8* src, int src_stride, \ - uint8* dst_a, int dst_stride_a, \ - uint8* dst_b, int dst_stride_b, int width) { \ - int r = width & MASK; \ - int n = width - r; \ - if (n > 0) { \ - TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, \ - n); \ - } \ - TransposeUVWx8_C(src + n * 2, src_stride, \ - dst_a + n * dst_stride_a, dst_stride_a, \ - dst_b + n * dst_stride_b, dst_stride_b, r); \ - } + void NAMEANY(const uint8_t* src, int src_stride, uint8_t* dst_a, \ + int dst_stride_a, uint8_t* dst_b, int dst_stride_b, \ + int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, n); \ + } \ + TransposeUVWx8_C(src + n * 2, src_stride, dst_a + n * dst_stride_a, \ + dst_stride_a, dst_b + n * dst_stride_b, dst_stride_b, r); \ + } #ifdef HAS_TRANSPOSEUVWX8_NEON TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) @@ -64,8 +62,8 @@ TUVANY(TransposeUVWx8_Any_NEON, TransposeUVWx8_NEON, 7) #ifdef HAS_TRANSPOSEUVWX8_SSE2 TUVANY(TransposeUVWx8_Any_SSE2, TransposeUVWx8_SSE2, 7) #endif -#ifdef HAS_TRANSPOSEUVWX8_DSPR2 -TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) +#ifdef HAS_TRANSPOSEUVWX16_MSA +TUVANY(TransposeUVWx16_Any_MSA, TransposeUVWx16_MSA, 7) #endif #undef TUVANY @@ -73,8 +71,3 @@ TUVANY(TransposeUVWx8_Any_DSPR2, TransposeUVWx8_DSPR2, 7) } // extern "C" } // namespace libyuv #endif - - - - - diff --git 
a/libs/libvpx/third_party/libyuv/source/rotate_argb.cc b/libs/libvpx/third_party/libyuv/source/rotate_argb.cc index 787c0ad1be9..5a6e05376f1 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_argb.cc @@ -10,90 +10,106 @@ #include "libyuv/rotate.h" -#include "libyuv/cpu_id.h" #include "libyuv/convert.h" +#include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" #include "libyuv/row.h" +#include "libyuv/scale_row.h" /* for ScaleARGBRowDownEven_ */ #ifdef __cplusplus namespace libyuv { extern "C" { #endif -// ARGBScale has a function to copy pixels to a row, striding each source -// pixel by a constant. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || \ - (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); -#endif -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_SCALEARGBROWDOWNEVEN_NEON -void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, - int src_stepx, uint8* dst_ptr, int dst_width); -#endif - -void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, - int src_stepx, uint8* dst_ptr, int dst_width); - -static void ARGBTranspose(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +static void ARGBTranspose(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { int i; - int src_pixel_step = src_stride >> 2; - void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, - int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C; + int src_pixel_step = src_stride_argb >> 2; + void (*ScaleARGBRowDownEven)( + const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step, + uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2; + } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4)) { // Width of dest. - ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_MSA; + if (IS_ALIGNED(height, 4)) { // Width of dest. + ScaleARGBRowDownEven = ScaleARGBRowDownEven_MSA; + } } #endif for (i = 0; i < width; ++i) { // column of source to row of dest. 
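    // (ARGBTranspose reads a *column* per iteration by reusing the scaler's
    // even-pixel row reader: with src_stepx set to the pixel stride
    // (src_stride_argb >> 2), ScaleARGBRowDownEven walks down one source
    // column and emits it as one `height`-pixel row of the destination.)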
- ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height); - dst += dst_stride; - src += 4; + ScaleARGBRowDownEven(src_argb, 0, src_pixel_step, dst_argb, height); + dst_argb += dst_stride_argb; + src_argb += 4; } } -void ARGBRotate90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate90(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. - src += src_stride * (height - 1); - src_stride = -src_stride; - ARGBTranspose(src, src_stride, dst, dst_stride, width, height); + src_argb += src_stride_argb * (height - 1); + src_stride_argb = -src_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); } -void ARGBRotate270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate270(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. - dst += dst_stride * (width - 1); - dst_stride = -dst_stride; - ARGBTranspose(src, src_stride, dst, dst_stride, width, height); + dst_argb += dst_stride_argb * (width - 1); + dst_stride_argb = -dst_stride_argb; + ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); } -void ARGBRotate180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width, int height) { +void ARGBRotate180(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { // Swap first and last row and mirror the content. Uses a temporary row. align_buffer_64(row, width * 4); - const uint8* src_bot = src + src_stride * (height - 1); - uint8* dst_bot = dst + dst_stride * (height - 1); + const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1); + uint8_t* dst_bot = dst_argb + dst_stride_argb * (height - 1); int half_height = (height + 1) >> 1; int y; - void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) = + void (*ARGBMirrorRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = ARGBMirrorRow_C; - void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; + void (*CopyRow)(const uint8_t* src_argb, uint8_t* dst_argb, int width) = + CopyRow_C; #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; @@ -118,6 +134,14 @@ void ARGBRotate180(const uint8* src, int src_stride, } } #endif +#if defined(HAS_ARGBMIRRORROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBMirrorRow = ARGBMirrorRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ARGBMirrorRow = ARGBMirrorRow_MSA; + } + } +#endif #if defined(HAS_COPYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_SSE2 : CopyRow_Any_SSE2; @@ -138,28 +162,27 @@ void ARGBRotate180(const uint8* src, int src_stride, CopyRow = IS_ALIGNED(width * 4, 32) ? CopyRow_NEON : CopyRow_Any_NEON; } #endif -#if defined(HAS_COPYROW_MIPS) - if (TestCpuFlag(kCpuHasMIPS)) { - CopyRow = CopyRow_MIPS; - } -#endif // Odd height will harmlessly mirror the middle row twice. 
for (y = 0; y < half_height; ++y) { - ARGBMirrorRow(src, row, width); // Mirror first row into a buffer - ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row + ARGBMirrorRow(src_argb, row, width); // Mirror first row into a buffer + ARGBMirrorRow(src_bot, dst_argb, width); // Mirror last row into first row CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last - src += src_stride; - dst += dst_stride; - src_bot -= src_stride; - dst_bot -= dst_stride; + src_argb += src_stride_argb; + dst_argb += dst_stride_argb; + src_bot -= src_stride_argb; + dst_bot -= dst_stride_argb; } free_aligned_buffer_64(row); } LIBYUV_API -int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, int width, int height, +int ARGBRotate(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height, enum RotationMode mode) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) { return -1; @@ -175,23 +198,19 @@ int ARGBRotate(const uint8* src_argb, int src_stride_argb, switch (mode) { case kRotate0: // copy frame - return ARGBCopy(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, + return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, height); case kRotate90: - ARGBRotate90(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate270: - ARGBRotate270(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; case kRotate180: - ARGBRotate180(src_argb, src_stride_argb, - dst_argb, dst_stride_argb, - width, height); + ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width, + height); return 0; default: break; diff --git a/libs/libvpx/third_party/libyuv/source/rotate_common.cc b/libs/libvpx/third_party/libyuv/source/rotate_common.cc index b33a9a0c6ed..ff212adebc4 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_common.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_common.cc @@ -8,16 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif -void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +void TransposeWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { int i; for (i = 0; i < width; ++i) { dst[0] = src[0 * src_stride]; @@ -33,9 +36,13 @@ void TransposeWx8_C(const uint8* src, int src_stride, } } -void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { +void TransposeUVWx8_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { int i; for (i = 0; i < width; ++i) { dst_a[0] = src[0 * src_stride + 0]; @@ -60,9 +67,12 @@ void TransposeUVWx8_C(const uint8* src, int src_stride, } } -void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { +void TransposeWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width, + int height) { int i; for (i = 0; i < width; ++i) { int j; @@ -72,10 +82,14 @@ void TransposeWxH_C(const uint8* src, int src_stride, } } -void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { +void TransposeUVWxH_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width, + int height) { int i; for (i = 0; i < width * 2; i += 2) { int j; diff --git a/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc b/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc index cbe870caa74..04e19e29eef 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_gcc.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -22,342 +22,348 @@ extern "C" { // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit. #if defined(HAS_TRANSPOSEWX8_SSSE3) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. 
- "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // defined(HAS_TRANSPOSEWX8_SSSE3) // Transpose 16x8. 64 bit #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) -void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" - ); +void TransposeWx8_Fast_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. 
+ "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. + "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm15"); } #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3) // Transpose UV 8x8. 64 bit. #if defined(HAS_TRANSPOSEUVWX8_SSE2) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. 
- LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. - "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9" - ); +void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + asm volatile( + // Read in the data from the source pointer. + // First round of bit swap. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. + "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9"); } #endif // defined(HAS_TRANSPOSEUVWX8_SSE2) #endif // defined(__x86_64__) || defined(__i386__) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_mips.cc b/libs/libvpx/third_party/libyuv/source/rotate_mips.cc deleted file mode 100644 index 1e8ce25197a..00000000000 --- a/libs/libvpx/third_party/libyuv/source/rotate_mips.cc +++ /dev/null @@ -1,484 +0,0 @@ -/* - * Copyright 2011 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" -#include "libyuv/rotate_row.h" - -#include "libyuv/basic_types.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void TransposeWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "sw $s0, 0(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "sw $s1, 4(%[dst]) \n" - "bnez %[width], 1b \n" - " addu %[dst], %[dst], %[dst_stride] \n" - "b 2f \n" -//dst + dst_stride unaligned - "11: \n" - "lbu $t0, 0(%[src]) \n" - "lbux $t1, %[src_stride](%[src]) \n" - "lbux $t8, $t2(%[src]) \n" - "lbux $t9, $t3(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s0, $t8, $t0 \n" - "lbux $t0, $t4(%[src]) \n" - "lbux $t1, $t5(%[src]) \n" - "lbux $t8, $t6(%[src]) \n" - "lbux $t9, $t7(%[src]) \n" - "sll $t1, $t1, 16 \n" - "sll $t9, $t9, 16 \n" - "or $t0, $t0, $t1 \n" - "or $t8, $t8, $t9 \n" - "precr.qb.ph $s1, $t8, $t0 \n" - "swr $s0, 0(%[dst]) \n" - "swl $s0, 3(%[dst]) \n" - "addiu %[width], -1 \n" - "addiu %[src], 1 \n" - "swr $s1, 4(%[dst]) \n" - "swl $s1, 7(%[dst]) \n" - "bnez %[width], 11b \n" - "addu %[dst], %[dst], %[dst_stride] \n" - "2: \n" - ".set pop \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1" - ); -} - -void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm__ __volatile__ ( - ".set noat \n" - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - - "srl $AT, %[width], 0x2 \n" - "andi $t0, %[dst], 0x3 \n" - "andi $t1, %[dst_stride], 0x3 \n" - "or $t0, $t0, $t1 \n" - "bnez $t0, 11f \n" - " subu $t7, $t9, %[src_stride] \n" -//dst + dst_stride word aligned - "1: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 
11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "sw $s4, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $s6, 0($s0) \n" - "sw $t8, 4($s0) \n" - "sw $s5, 0($s1) \n" - "sw $t1, 4($s1) \n" - "sw $s7, 0($s2) \n" - "sw $t9, 4($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 1b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "b 2f \n" -//dst + dst_stride unaligned - "11: \n" - "lw $t0, 0(%[src]) \n" - "lwx $t1, %[src_stride](%[src]) \n" - "lwx $t8, $t2(%[src]) \n" - "lwx $t9, $t3(%[src]) \n" - -// t0 = | 30 | 20 | 10 | 00 | -// t1 = | 31 | 21 | 11 | 01 | -// t8 = | 32 | 22 | 12 | 02 | -// t9 = | 33 | 23 | 13 | 03 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 21 | 01 | 20 | 00 | - // s1 = | 23 | 03 | 22 | 02 | - // s2 = | 31 | 11 | 30 | 10 | - // s3 = | 33 | 13 | 32 | 12 | - - "precr.qb.ph $s4, $s1, $s0 \n" - "precrq.qb.ph $s5, $s1, $s0 \n" - "precr.qb.ph $s6, $s3, $s2 \n" - "precrq.qb.ph $s7, $s3, $s2 \n" - - // s4 = | 03 | 02 | 01 | 00 | - // s5 = | 23 | 22 | 21 | 20 | - // s6 = | 13 | 12 | 11 | 10 | - // s7 = | 33 | 32 | 31 | 30 | - - "lwx $t0, $t4(%[src]) \n" - "lwx $t1, $t5(%[src]) \n" - "lwx $t8, $t6(%[src]) \n" - "lwx $t9, $t7(%[src]) \n" - -// t0 = | 34 | 24 | 14 | 04 | -// t1 = | 35 | 25 | 15 | 05 | -// t8 = | 36 | 26 | 16 | 06 | -// t9 = | 37 | 27 | 17 | 07 | - - "precr.qb.ph $s0, $t1, $t0 \n" - "precr.qb.ph $s1, $t9, $t8 \n" - "precrq.qb.ph $s2, $t1, $t0 \n" - "precrq.qb.ph $s3, $t9, $t8 \n" - - // s0 = | 25 | 05 | 24 | 04 | - // s1 = | 27 | 07 | 26 | 06 | - // s2 = | 35 | 15 | 34 | 14 | - // s3 = | 37 | 17 | 36 | 16 | - - "precr.qb.ph $t0, $s1, $s0 \n" - "precrq.qb.ph $t1, $s1, $s0 \n" - "precr.qb.ph $t8, $s3, $s2 \n" - "precrq.qb.ph $t9, $s3, $s2 \n" - - // t0 = | 07 | 06 | 05 | 04 | - // t1 = | 27 | 26 | 25 | 24 | - // t8 = | 17 | 16 | 15 | 14 | - // t9 = | 37 | 36 | 35 | 34 | - - "addu $s0, %[dst], %[dst_stride] \n" - "addu $s1, $s0, %[dst_stride] \n" - "addu $s2, $s1, %[dst_stride] \n" - - "swr $s4, 0(%[dst]) \n" - "swl $s4, 3(%[dst]) \n" - 
"swr $t0, 4(%[dst]) \n" - "swl $t0, 7(%[dst]) \n" - "swr $s6, 0($s0) \n" - "swl $s6, 3($s0) \n" - "swr $t8, 4($s0) \n" - "swl $t8, 7($s0) \n" - "swr $s5, 0($s1) \n" - "swl $s5, 3($s1) \n" - "swr $t1, 4($s1) \n" - "swl $t1, 7($s1) \n" - "swr $s7, 0($s2) \n" - "swl $s7, 3($s2) \n" - "swr $t9, 4($s2) \n" - "swl $t9, 7($s2) \n" - - "addiu $AT, -1 \n" - "addiu %[src], 4 \n" - - "bnez $AT, 11b \n" - " addu %[dst], $s2, %[dst_stride] \n" - "2: \n" - ".set pop \n" - ".set at \n" - :[src] "+r" (src), - [dst] "+r" (dst), - [width] "+r" (width) - :[src_stride] "r" (src_stride), - [dst_stride] "r" (dst_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7" - ); -} - -void TransposeUVWx8_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 - "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 - "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 - "addu $t3, $t2, %[src_stride] \n" - "addu $t5, $t4, %[src_stride] \n" - "addu $t6, $t2, $t4 \n" - "subu $t7, $t9, %[src_stride] \n" - "srl $t1, %[width], 1 \n" - -// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b - "andi $t0, %[dst_a], 0x3 \n" - "andi $t8, %[dst_b], 0x3 \n" - "or $t0, $t0, $t8 \n" - "andi $t8, %[dst_stride_a], 0x3 \n" - "andi $s5, %[dst_stride_b], 0x3 \n" - "or $t8, $t8, $s5 \n" - "or $t0, $t0, $t8 \n" - "bnez $t0, 11f \n" - " nop \n" -// dst + dst_stride word aligned (both, a & b dst addresses) - "1: \n" - "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "sw $s3, 0($s5) \n" - "sw $s4, 0($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "sw $s3, 0(%[dst_a]) \n" - "sw $s4, 0(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - "sw $s3, 4($s5) \n" - "sw $s4, 4($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "sw $s3, 4(%[dst_a]) \n" - "sw $s4, 4(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 1b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - "b 2f \n" - " nop \n" - -// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned - "11: \n" - "lw $t0, 
0(%[src]) \n" // |B0|A0|b0|a0| - "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| - "addu $s5, %[dst_a], %[dst_stride_a] \n" - "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| - "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| - "addu $s6, %[dst_b], %[dst_stride_b] \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| - "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| - "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| - - "swr $s3, 0($s5) \n" - "swl $s3, 3($s5) \n" - "swr $s4, 0($s6) \n" - "swl $s4, 3($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| - - "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| - "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| - "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| - "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| - "swr $s3, 0(%[dst_a]) \n" - "swl $s3, 3(%[dst_a]) \n" - "swr $s4, 0(%[dst_b]) \n" - "swl $s4, 3(%[dst_b]) \n" - - "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| - "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| - "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| - - "sll $t0, $t0, 16 \n" - "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| - "sll $t9, $t9, 16 \n" - "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| - - "swr $s3, 4($s5) \n" - "swl $s3, 7($s5) \n" - "swr $s4, 4($s6) \n" - "swl $s4, 7($s6) \n" - - "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| - "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| - - "addiu %[src], 4 \n" - "addiu $t1, -1 \n" - "sll $t0, %[dst_stride_a], 1 \n" - "sll $t8, %[dst_stride_b], 1 \n" - "swr $s3, 4(%[dst_a]) \n" - "swl $s3, 7(%[dst_a]) \n" - "swr $s4, 4(%[dst_b]) \n" - "swl $s4, 7(%[dst_b]) \n" - "addu %[dst_a], %[dst_a], $t0 \n" - "bnez $t1, 11b \n" - " addu %[dst_b], %[dst_b], $t8 \n" - - "2: \n" - ".set pop \n" - : [src] "+r" (src), - [dst_a] "+r" (dst_a), - [dst_b] "+r" (dst_b), - [width] "+r" (width), - [src_stride] "+r" (src_stride) - : [dst_stride_a] "r" (dst_stride_a), - [dst_stride_b] "r" (dst_stride_b) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libs/libvpx/third_party/libyuv/source/rotate_msa.cc b/libs/libvpx/third_party/libyuv/source/rotate_msa.cc new file mode 100644 index 00000000000..99bdca65b32 --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/rotate_msa.cc @@ -0,0 +1,250 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ILVRL_B(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_b((v16i8)in1, (v16i8)in0); \ + out1 = (v16u8)__msa_ilvl_b((v16i8)in1, (v16i8)in0); \ + out2 = (v16u8)__msa_ilvr_b((v16i8)in3, (v16i8)in2); \ + out3 = (v16u8)__msa_ilvl_b((v16i8)in3, (v16i8)in2); \ + } + +#define ILVRL_H(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_h((v8i16)in1, (v8i16)in0); \ + out1 = (v16u8)__msa_ilvl_h((v8i16)in1, (v8i16)in0); \ + out2 = (v16u8)__msa_ilvr_h((v8i16)in3, (v8i16)in2); \ + out3 = (v16u8)__msa_ilvl_h((v8i16)in3, (v8i16)in2); \ + } + +#define ILVRL_W(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_w((v4i32)in1, (v4i32)in0); \ + out1 = (v16u8)__msa_ilvl_w((v4i32)in1, (v4i32)in0); \ + out2 = (v16u8)__msa_ilvr_w((v4i32)in3, (v4i32)in2); \ + out3 = (v16u8)__msa_ilvl_w((v4i32)in3, (v4i32)in2); \ + } + +#define ILVRL_D(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + out0 = (v16u8)__msa_ilvr_d((v2i64)in1, (v2i64)in0); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)in1, (v2i64)in0); \ + out2 = (v16u8)__msa_ilvr_d((v2i64)in3, (v2i64)in2); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)in3, (v2i64)in2); \ + } + +void TransposeWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + TransposeWx8_C(src, src_stride, dst, dst_stride, width); + TransposeWx8_C((src + 8 * src_stride), src_stride, (dst + 8), dst_stride, + width); +} + +void TransposeUVWx16_C(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + TransposeUVWx8_C(src, src_stride, dst_a, dst_stride_a, dst_b, dst_stride_b, + width); + TransposeUVWx8_C((src + 8 * src_stride), src_stride, (dst_a + 8), + dst_stride_a, (dst_b + 8), dst_stride_b, width); +} + +void TransposeWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 16) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, 
src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + dst += dst_stride * 4; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride); + src += 16; + dst += dst_stride * 4; + } +} + +void TransposeUVWx16_MSA(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int width) { + int x; + const uint8_t* s; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 res0, res1, res2, res3, res4, res5, res6, res7, res8, res9; + + for (x = 0; x < width; x += 8) { + s = src; + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + ILVRL_W(reg0, reg4, reg1, reg5, res0, res1, res2, res3); + ILVRL_W(reg2, reg6, reg3, reg7, res4, res5, res6, res7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg0, reg1, reg2, reg3); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src1 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src2 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + src3 = (v16u8)__msa_ld_b((v16i8*)s, 0); + s += src_stride; + ILVRL_B(src0, src1, src2, src3, vec0, vec1, vec2, vec3); + ILVRL_H(vec0, vec2, vec1, vec3, reg4, reg5, reg6, reg7); + res8 = (v16u8)__msa_ilvr_w((v4i32)reg4, (v4i32)reg0); + res9 = 
(v16u8)__msa_ilvl_w((v4i32)reg4, (v4i32)reg0); + ILVRL_D(res0, res8, res1, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg5, (v4i32)reg1); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg5, (v4i32)reg1); + ILVRL_D(res2, res8, res3, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg6, (v4i32)reg2); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg6, (v4i32)reg2); + ILVRL_D(res4, res8, res5, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + res8 = (v16u8)__msa_ilvr_w((v4i32)reg7, (v4i32)reg3); + res9 = (v16u8)__msa_ilvl_w((v4i32)reg7, (v4i32)reg3); + ILVRL_D(res6, res8, res7, res9, dst0, dst1, dst2, dst3); + ST_UB2(dst0, dst2, dst_a, dst_stride_a); + ST_UB2(dst1, dst3, dst_b, dst_stride_b); + src += 16; + dst_a += dst_stride_a * 2; + dst_b += dst_stride_b * 2; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_neon.cc b/libs/libvpx/third_party/libyuv/source/rotate_neon.cc index 1c22b472bc1..fdc0dd476c6 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_neon.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #include "libyuv/basic_types.h" @@ -21,38 +21,32 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ !defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %5, #8 \n" - - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %5, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld1.8 {d0}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d1}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d2}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d3}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d4}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d5}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d6}, [%0], %2 \n" - MEMACCESS(0) "vld1.8 {d7}, [%0] \n" "vtrn.8 d1, d0 \n" @@ -77,21 +71,13 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0] \n" "add %1, #8 \n" // src += 8 @@ -99,180 +85,138 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "subs %5, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %5, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %5, #2 \n" - "blt 3f \n" - - "cmp %5, #4 \n" - "blt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.32 {d3[1]}, [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(6) - "vld1.8 {q3}, [%6] \n" - - "vtbl.8 d4, {d0, d1}, d6 \n" - "vtbl.8 d5, {d0, d1}, d7 \n" - "vtbl.8 d0, {d2, d3}, d6 \n" - "vtbl.8 d1, {d2, d3}, d7 \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- MEMACCESS(0) - "vst1.32 {d4[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d4[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d5[1]}, [%0] \n" - - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d0[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d0[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d1[1]}, [%0] \n" - - "add %1, #4 \n" // src += 4 - "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride - "subs %5, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %5, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.16 {d0[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d0[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld1.16 {d1[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d1}, [%0] \n" - - "add %1, #2 \n" // src += 2 - "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride - "subs %5, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld1.8 {d0[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld1.8 {d0[7]}, [%1] \n" - - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst), // %3 - "+r"(dst_stride), // %4 - "+r"(width) // %5 - : "r"(&kVTbl4x4Transpose) // %6 - : "memory", "cc", "q0", "q1", "q2", "q3" - ); + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %5, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %5, #2 \n" + "blt 3f \n" + + "cmp %5, #4 \n" + "blt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "vld1.32 {d0[0]}, [%0], %2 \n" + "vld1.32 {d0[1]}, [%0], %2 \n" + "vld1.32 {d1[0]}, [%0], %2 \n" + "vld1.32 {d1[1]}, [%0], %2 \n" + "vld1.32 {d2[0]}, [%0], %2 \n" + "vld1.32 {d2[1]}, [%0], %2 \n" + "vld1.32 {d3[0]}, [%0], %2 \n" + "vld1.32 {d3[1]}, [%0] \n" + + "mov %0, %3 \n" + + "vld1.8 {q3}, [%6] \n" + + "vtbl.8 d4, {d0, d1}, d6 \n" + "vtbl.8 d5, {d0, d1}, d7 \n" + "vtbl.8 d0, {d2, d3}, d6 \n" + "vtbl.8 d1, {d2, d3}, d7 \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
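+ // The vtbl shuffle above is the transpose of the 4x8 tile:
+ // kVTbl4x4Transpose picks bytes 0,4,8,12,... so d4/d5 gather each
+ // column of source rows 0-3 (the left 4 bytes of every output row)
+ // and d0/d1 do the same for rows 4-7, which is why the second
+ // group of stores below starts at dst + 4.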
+ "vst1.32 {d4[0]}, [%0], %4 \n" + "vst1.32 {d4[1]}, [%0], %4 \n" + "vst1.32 {d5[0]}, [%0], %4 \n" + "vst1.32 {d5[1]}, [%0] \n" + + "add %0, %3, #4 \n" + "vst1.32 {d0[0]}, [%0], %4 \n" + "vst1.32 {d0[1]}, [%0], %4 \n" + "vst1.32 {d1[0]}, [%0], %4 \n" + "vst1.32 {d1[1]}, [%0] \n" + + "add %1, #4 \n" // src += 4 + "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride + "subs %5, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %5, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld1.16 {d0[0]}, [%0], %2 \n" + "vld1.16 {d1[0]}, [%0], %2 \n" + "vld1.16 {d0[1]}, [%0], %2 \n" + "vld1.16 {d1[1]}, [%0], %2 \n" + "vld1.16 {d0[2]}, [%0], %2 \n" + "vld1.16 {d1[2]}, [%0], %2 \n" + "vld1.16 {d0[3]}, [%0], %2 \n" + "vld1.16 {d1[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d1}, [%0] \n" + + "add %1, #2 \n" // src += 2 + "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride + "subs %5, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld1.8 {d0[0]}, [%1], %2 \n" + "vld1.8 {d0[1]}, [%1], %2 \n" + "vld1.8 {d0[2]}, [%1], %2 \n" + "vld1.8 {d0[3]}, [%1], %2 \n" + "vld1.8 {d0[4]}, [%1], %2 \n" + "vld1.8 {d0[5]}, [%1], %2 \n" + "vld1.8 {d0[6]}, [%1], %2 \n" + "vld1.8 {d0[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst), // %3 + "+r"(dst_stride), // %4 + "+r"(width) // %5 + : "r"(&kVTbl4x4Transpose) // %6 + : "memory", "cc", "q0", "q1", "q2", "q3"); } -static uvec8 kVTbl4x4TransposeDi = - { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; +static const uvec8 kVTbl4x4TransposeDi = {0, 8, 1, 9, 2, 10, 3, 11, + 4, 12, 5, 13, 6, 14, 7, 15}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, int width) { - const uint8* src_temp; - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %7, #8 \n" - - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %7, #8 \n" + + // handle 8x8 blocks. 
this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "vld2.8 {d0, d1}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d2, d3}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d4, d5}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d6, d7}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d16, d17}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d18, d19}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d20, d21}, [%0], %2 \n" - MEMACCESS(0) "vld2.8 {d22, d23}, [%0] \n" "vtrn.8 q1, q0 \n" @@ -301,40 +245,24 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "mov %0, %3 \n" - MEMACCESS(0) "vst1.8 {d2}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d0}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d4}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d18}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d16}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d22}, [%0], %4 \n" - MEMACCESS(0) "vst1.8 {d20}, [%0] \n" "mov %0, %5 \n" - MEMACCESS(0) "vst1.8 {d3}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d1}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d7}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d5}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d19}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d17}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d23}, [%0], %6 \n" - MEMACCESS(0) "vst1.8 {d21}, [%0] \n" "add %1, #8*2 \n" // src += 8*2 @@ -343,187 +271,142 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, "subs %7, #8 \n" // w -= 8 "bge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %7, #8 \n" - "beq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %7, #2 \n" - "blt 3f \n" - - "cmp %7, #4 \n" - "blt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "vld1.64 {d0}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d1}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d2}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d3}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d4}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d5}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d6}, [%0], %2 \n" - MEMACCESS(0) - "vld1.64 {d7}, [%0] \n" - - MEMACCESS(8) - "vld1.8 {q15}, [%8] \n" - - "vtrn.8 q0, q1 \n" - "vtrn.8 q2, q3 \n" - - "vtbl.8 d16, {d0, d1}, d30 \n" - "vtbl.8 d17, {d0, d1}, d31 \n" - "vtbl.8 d18, {d2, d3}, d30 \n" - "vtbl.8 d19, {d2, d3}, d31 \n" - "vtbl.8 d20, {d4, d5}, d30 \n" - "vtbl.8 d21, {d4, d5}, d31 \n" - "vtbl.8 d22, {d6, d7}, d30 \n" - "vtbl.8 d23, {d6, d7}, d31 \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "vst1.32 {d16[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d16[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d17[1]}, [%0], %4 \n" - - "add %0, %3, #4 \n" - MEMACCESS(0) - "vst1.32 {d20[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d20[1]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[0]}, [%0], %4 \n" - MEMACCESS(0) - "vst1.32 {d21[1]}, [%0] \n" - - "mov %0, %5 \n" - - MEMACCESS(0) - "vst1.32 {d18[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d18[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d19[1]}, [%0], %6 \n" - - "add %0, %5, #4 \n" - MEMACCESS(0) - "vst1.32 {d22[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d22[1]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[0]}, [%0], %6 \n" - MEMACCESS(0) - "vst1.32 {d23[1]}, [%0] \n" - - "add %1, #4*2 \n" // src += 4 * 2 - "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %7, #4 \n" // w -= 4 - "beq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or 
less - "cmp %7, #2 \n" - "blt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" - MEMACCESS(0) - "vld2.16 {d1[3], d3[3]}, [%0] \n" - - "vtrn.8 d0, d1 \n" - "vtrn.8 d2, d3 \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "vst1.64 {d0}, [%0], %4 \n" - MEMACCESS(0) - "vst1.64 {d2}, [%0] \n" - - "mov %0, %5 \n" - - MEMACCESS(0) - "vst1.64 {d1}, [%0], %6 \n" - MEMACCESS(0) - "vst1.64 {d3}, [%0] \n" - - "add %1, #2*2 \n" // src += 2 * 2 - "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %7, #2 \n" // w -= 2 - "beq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" - MEMACCESS(1) - "vld2.8 {d0[7], d1[7]}, [%1] \n" - - MEMACCESS(3) - "vst1.64 {d0}, [%3] \n" - MEMACCESS(5) - "vst1.64 {d1}, [%5] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(src_stride), // %2 - "+r"(dst_a), // %3 - "+r"(dst_stride_a), // %4 - "+r"(dst_b), // %5 - "+r"(dst_stride_b), // %6 - "+r"(width) // %7 - : "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); + // add 8 back to counter. if the result is 0 there are + // no residuals. 
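+ // Residual handling mirrors the plane version: 1 to 7 remaining
+ // UV columns fall through a 4x8 block, then a 2x8 block, then a
+ // 1x8 block, all driven by the same counter in %7.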
+ "adds %7, #8 \n" + "beq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %7, #2 \n" + "blt 3f \n" + + "cmp %7, #4 \n" + "blt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "vld1.64 {d0}, [%0], %2 \n" + "vld1.64 {d1}, [%0], %2 \n" + "vld1.64 {d2}, [%0], %2 \n" + "vld1.64 {d3}, [%0], %2 \n" + "vld1.64 {d4}, [%0], %2 \n" + "vld1.64 {d5}, [%0], %2 \n" + "vld1.64 {d6}, [%0], %2 \n" + "vld1.64 {d7}, [%0] \n" + + "vld1.8 {q15}, [%8] \n" + + "vtrn.8 q0, q1 \n" + "vtrn.8 q2, q3 \n" + + "vtbl.8 d16, {d0, d1}, d30 \n" + "vtbl.8 d17, {d0, d1}, d31 \n" + "vtbl.8 d18, {d2, d3}, d30 \n" + "vtbl.8 d19, {d2, d3}, d31 \n" + "vtbl.8 d20, {d4, d5}, d30 \n" + "vtbl.8 d21, {d4, d5}, d31 \n" + "vtbl.8 d22, {d6, d7}, d30 \n" + "vtbl.8 d23, {d6, d7}, d31 \n" + + "mov %0, %3 \n" + + "vst1.32 {d16[0]}, [%0], %4 \n" + "vst1.32 {d16[1]}, [%0], %4 \n" + "vst1.32 {d17[0]}, [%0], %4 \n" + "vst1.32 {d17[1]}, [%0], %4 \n" + + "add %0, %3, #4 \n" + "vst1.32 {d20[0]}, [%0], %4 \n" + "vst1.32 {d20[1]}, [%0], %4 \n" + "vst1.32 {d21[0]}, [%0], %4 \n" + "vst1.32 {d21[1]}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.32 {d18[0]}, [%0], %6 \n" + "vst1.32 {d18[1]}, [%0], %6 \n" + "vst1.32 {d19[0]}, [%0], %6 \n" + "vst1.32 {d19[1]}, [%0], %6 \n" + + "add %0, %5, #4 \n" + "vst1.32 {d22[0]}, [%0], %6 \n" + "vst1.32 {d22[1]}, [%0], %6 \n" + "vst1.32 {d23[0]}, [%0], %6 \n" + "vst1.32 {d23[1]}, [%0] \n" + + "add %1, #4*2 \n" // src += 4 * 2 + "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %7, #4 \n" // w -= 4 + "beq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %7, #2 \n" + "blt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "vld2.16 {d0[0], d2[0]}, [%0], %2 \n" + "vld2.16 {d1[0], d3[0]}, [%0], %2 \n" + "vld2.16 {d0[1], d2[1]}, [%0], %2 \n" + "vld2.16 {d1[1], d3[1]}, [%0], %2 \n" + "vld2.16 {d0[2], d2[2]}, [%0], %2 \n" + "vld2.16 {d1[2], d3[2]}, [%0], %2 \n" + "vld2.16 {d0[3], d2[3]}, [%0], %2 \n" + "vld2.16 {d1[3], d3[3]}, [%0] \n" + + "vtrn.8 d0, d1 \n" + "vtrn.8 d2, d3 \n" + + "mov %0, %3 \n" + + "vst1.64 {d0}, [%0], %4 \n" + "vst1.64 {d2}, [%0] \n" + + "mov %0, %5 \n" + + "vst1.64 {d1}, [%0], %6 \n" + "vst1.64 {d3}, [%0] \n" + + "add %1, #2*2 \n" // src += 2 * 2 + "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %7, #2 \n" // w -= 2 + "beq 4f \n" + + // 1x8 block + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%1], %2 \n" + "vld2.8 {d0[1], d1[1]}, [%1], %2 \n" + "vld2.8 {d0[2], d1[2]}, [%1], %2 \n" + "vld2.8 {d0[3], d1[3]}, [%1], %2 \n" + "vld2.8 {d0[4], d1[4]}, [%1], %2 \n" + "vld2.8 {d0[5], d1[5]}, [%1], %2 \n" + "vld2.8 {d0[6], d1[6]}, [%1], %2 \n" + "vld2.8 {d0[7], d1[7]}, [%1] \n" + + "vst1.64 {d0}, [%3] \n" + "vst1.64 {d1}, [%5] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(src_stride), // %2 + "+r"(dst_a), // %3 + "+r"(dst_stride_a), // %4 + "+r"(dst_b), // %5 + "+r"(dst_stride_b), // %6 + "+r"(width) // %7 + : "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } #endif // defined(__ARM_NEON__) && !defined(__aarch64__) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc b/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc index 1ab448f3ab4..f469baacf68 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc +++ 
b/libs/libvpx/third_party/libyuv/source/rotate_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #include "libyuv/basic_types.h" @@ -21,38 +21,32 @@ extern "C" { // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) -static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; - -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %3, %3, #8 \n" - - // handle 8x8 blocks. this should be the majority of the plane - "1: \n" +static const uvec8 kVTbl4x4Transpose = {0, 4, 8, 12, 1, 5, 9, 13, + 2, 6, 10, 14, 3, 7, 11, 15}; + +void TransposeWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w3, %w3, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" "mov %0, %1 \n" - MEMACCESS(0) "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) "ld1 {v7.8b}, [%0] \n" "trn2 v16.8b, v0.8b, v1.8b \n" @@ -84,456 +78,345 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, "mov %0, %2 \n" - MEMACCESS(0) "st1 {v17.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v16.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v19.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v18.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v21.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v20.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v23.8b}, [%0], %6 \n" - MEMACCESS(0) "st1 {v22.8b}, [%0] \n" "add %1, %1, #8 \n" // src += 8 "add %2, %2, %6, lsl #3 \n" // dst += 8 * dst_stride - "subs %3, %3, #8 \n" // w -= 8 + "subs %w3, %w3, #8 \n" // w -= 8 "b.ge 1b \n" - // add 8 back to counter. if the result is 0 there are - // no residuals. - "adds %3, %3, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %3, #2 \n" - "b.lt 3f \n" - - "cmp %3, #4 \n" - "b.lt 2f \n" - - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.s}[3], [%0] \n" - - "mov %0, %2 \n" - - MEMACCESS(4) - "ld1 {v2.16b}, [%4] \n" - - "tbl v3.16b, {v0.16b}, v2.16b \n" - "tbl v0.16b, {v1.16b}, v2.16b \n" - - // TODO(frkoenig): Rework shuffle above to - // write out with 4 instead of 8 writes. 
- MEMACCESS(0) - "st1 {v3.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v3.s}[3], [%0] \n" - - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v0.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v0.s}[3], [%0] \n" - - "add %1, %1, #4 \n" // src += 4 - "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride - "subs %3, %3, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %3, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v0.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.h}[3], [%0] \n" - - "trn2 v2.8b, v0.8b, v1.8b \n" - "trn1 v3.8b, v0.8b, v1.8b \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v3.8b}, [%0], %6 \n" - MEMACCESS(0) - "st1 {v2.8b}, [%0] \n" - - "add %1, %1, #2 \n" // src += 2 - "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride - "subs %3, %3, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld1 {v0.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld1 {v0.b}[7], [%1] \n" - - MEMACCESS(2) - "st1 {v0.8b}, [%2] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst), // %2 - "+r"(width64) // %3 - : "r"(&kVTbl4x4Transpose), // %4 - "r"(static_cast(src_stride)), // %5 - "r"(static_cast(dst_stride)) // %6 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23" - ); + // add 8 back to counter. if the result is 0 there are + // no residuals. + "adds %w3, %w3, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w3, #2 \n" + "b.lt 3f \n" + + "cmp %w3, #4 \n" + "b.lt 2f \n" + + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.s}[0], [%0], %5 \n" + "ld1 {v0.s}[1], [%0], %5 \n" + "ld1 {v0.s}[2], [%0], %5 \n" + "ld1 {v0.s}[3], [%0], %5 \n" + "ld1 {v1.s}[0], [%0], %5 \n" + "ld1 {v1.s}[1], [%0], %5 \n" + "ld1 {v1.s}[2], [%0], %5 \n" + "ld1 {v1.s}[3], [%0] \n" + + "mov %0, %2 \n" + + "ld1 {v2.16b}, [%4] \n" + + "tbl v3.16b, {v0.16b}, v2.16b \n" + "tbl v0.16b, {v1.16b}, v2.16b \n" + + // TODO(frkoenig): Rework shuffle above to + // write out with 4 instead of 8 writes. 
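+ // Same approach as the 32-bit NEON path, done with tbl: v2 holds
+ // kVTbl4x4Transpose, so v3 becomes the transpose of source rows 0-3
+ // (output columns 0-3) and v0 the transpose of rows 4-7, stored
+ // four bytes at a time with the second group offset to dst + 4.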
+ "st1 {v3.s}[0], [%0], %6 \n" + "st1 {v3.s}[1], [%0], %6 \n" + "st1 {v3.s}[2], [%0], %6 \n" + "st1 {v3.s}[3], [%0] \n" + + "add %0, %2, #4 \n" + "st1 {v0.s}[0], [%0], %6 \n" + "st1 {v0.s}[1], [%0], %6 \n" + "st1 {v0.s}[2], [%0], %6 \n" + "st1 {v0.s}[3], [%0] \n" + + "add %1, %1, #4 \n" // src += 4 + "add %2, %2, %6, lsl #2 \n" // dst += 4 * dst_stride + "subs %w3, %w3, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w3, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld1 {v0.h}[0], [%0], %5 \n" + "ld1 {v1.h}[0], [%0], %5 \n" + "ld1 {v0.h}[1], [%0], %5 \n" + "ld1 {v1.h}[1], [%0], %5 \n" + "ld1 {v0.h}[2], [%0], %5 \n" + "ld1 {v1.h}[2], [%0], %5 \n" + "ld1 {v0.h}[3], [%0], %5 \n" + "ld1 {v1.h}[3], [%0] \n" + + "trn2 v2.8b, v0.8b, v1.8b \n" + "trn1 v3.8b, v0.8b, v1.8b \n" + + "mov %0, %2 \n" + + "st1 {v3.8b}, [%0], %6 \n" + "st1 {v2.8b}, [%0] \n" + + "add %1, %1, #2 \n" // src += 2 + "add %2, %2, %6, lsl #1 \n" // dst += 2 * dst_stride + "subs %w3, %w3, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld1 {v0.b}[0], [%1], %5 \n" + "ld1 {v0.b}[1], [%1], %5 \n" + "ld1 {v0.b}[2], [%1], %5 \n" + "ld1 {v0.b}[3], [%1], %5 \n" + "ld1 {v0.b}[4], [%1], %5 \n" + "ld1 {v0.b}[5], [%1], %5 \n" + "ld1 {v0.b}[6], [%1], %5 \n" + "ld1 {v0.b}[7], [%1] \n" + + "st1 {v0.8b}, [%2] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst), // %2 + "+r"(width) // %3 + : "r"(&kVTbl4x4Transpose), // %4 + "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride)) // %6 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } -static uint8 kVTbl4x4TransposeDi[32] = - { 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, - 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; +static const uint8_t kVTbl4x4TransposeDi[32] = { + 0, 16, 32, 48, 2, 18, 34, 50, 4, 20, 36, 52, 6, 22, 38, 54, + 1, 17, 33, 49, 3, 19, 35, 51, 5, 21, 37, 53, 7, 23, 39, 55}; -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, +void TransposeUVWx8_NEON(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, int width) { - const uint8* src_temp; - int64 width64 = (int64) width; // Work around clang 3.4 warning. - asm volatile ( - // loops are on blocks of 8. loop will stop when - // counter gets to or below 0. starting the counter - // at w-8 allow for this - "sub %4, %4, #8 \n" - - // handle 8x8 blocks. 
this should be the majority of the plane - "1: \n" - "mov %0, %1 \n" - - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.16b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.16b}, [%0] \n" - - "trn1 v16.16b, v0.16b, v1.16b \n" - "trn2 v17.16b, v0.16b, v1.16b \n" - "trn1 v18.16b, v2.16b, v3.16b \n" - "trn2 v19.16b, v2.16b, v3.16b \n" - "trn1 v20.16b, v4.16b, v5.16b \n" - "trn2 v21.16b, v4.16b, v5.16b \n" - "trn1 v22.16b, v6.16b, v7.16b \n" - "trn2 v23.16b, v6.16b, v7.16b \n" - - "trn1 v0.8h, v16.8h, v18.8h \n" - "trn2 v1.8h, v16.8h, v18.8h \n" - "trn1 v2.8h, v20.8h, v22.8h \n" - "trn2 v3.8h, v20.8h, v22.8h \n" - "trn1 v4.8h, v17.8h, v19.8h \n" - "trn2 v5.8h, v17.8h, v19.8h \n" - "trn1 v6.8h, v21.8h, v23.8h \n" - "trn2 v7.8h, v21.8h, v23.8h \n" - - "trn1 v16.4s, v0.4s, v2.4s \n" - "trn2 v17.4s, v0.4s, v2.4s \n" - "trn1 v18.4s, v1.4s, v3.4s \n" - "trn2 v19.4s, v1.4s, v3.4s \n" - "trn1 v20.4s, v4.4s, v6.4s \n" - "trn2 v21.4s, v4.4s, v6.4s \n" - "trn1 v22.4s, v5.4s, v7.4s \n" - "trn2 v23.4s, v5.4s, v7.4s \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v16.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v17.d}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v19.d}[1], [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "st1 {v20.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v20.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v22.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v21.d}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v23.d}[1], [%0] \n" - - "add %1, %1, #16 \n" // src += 8*2 - "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * dst_stride_a - "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * dst_stride_b - "subs %4, %4, #8 \n" // w -= 8 - "b.ge 1b \n" - - // add 8 back to counter. if the result is 0 there are - // no residuals. 
- "adds %4, %4, #8 \n" - "b.eq 4f \n" - - // some residual, so between 1 and 7 lines left to transpose - "cmp %4, #2 \n" - "b.lt 3f \n" - - "cmp %4, #4 \n" - "b.lt 2f \n" - - // TODO(frkoenig): Clean this up - // 4x8 block - "mov %0, %1 \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v1.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v3.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v5.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %5 \n" - MEMACCESS(0) - "ld1 {v7.8b}, [%0] \n" - - MEMACCESS(8) - "ld1 {v30.16b}, [%8], #16 \n" - "ld1 {v31.16b}, [%8] \n" - - "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" - "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" - "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" - "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v16.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v16.s}[3], [%0], %6 \n" - - "add %0, %2, #4 \n" - MEMACCESS(0) - "st1 {v18.s}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[1], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[2], [%0], %6 \n" - MEMACCESS(0) - "st1 {v18.s}[3], [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "st1 {v17.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v17.s}[3], [%0], %7 \n" - - "add %0, %3, #4 \n" - MEMACCESS(0) - "st1 {v19.s}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[1], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[2], [%0], %7 \n" - MEMACCESS(0) - "st1 {v19.s}[3], [%0] \n" - - "add %1, %1, #8 \n" // src += 4 * 2 - "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * dst_stride_a - "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * dst_stride_b - "subs %4, %4, #4 \n" // w -= 4 - "b.eq 4f \n" - - // some residual, check to see if it includes a 2x8 block, - // or less - "cmp %4, #2 \n" - "b.lt 3f \n" - - // 2x8 block - "2: \n" - "mov %0, %1 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[0], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[1], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[2], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v0.h, v1.h}[3], [%0], %5 \n" - MEMACCESS(0) - "ld2 {v2.h, v3.h}[3], [%0] \n" - - "trn1 v4.8b, v0.8b, v2.8b \n" - "trn2 v5.8b, v0.8b, v2.8b \n" - "trn1 v6.8b, v1.8b, v3.8b \n" - "trn2 v7.8b, v1.8b, v3.8b \n" - - "mov %0, %2 \n" - - MEMACCESS(0) - "st1 {v4.d}[0], [%0], %6 \n" - MEMACCESS(0) - "st1 {v6.d}[0], [%0] \n" - - "mov %0, %3 \n" - - MEMACCESS(0) - "st1 {v5.d}[0], [%0], %7 \n" - MEMACCESS(0) - "st1 {v7.d}[0], [%0] \n" - - "add %1, %1, #4 \n" // src += 2 * 2 - "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * dst_stride_a - "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * dst_stride_b - "subs %4, %4, #2 \n" // w -= 2 - "b.eq 4f \n" - - // 1x8 block - "3: \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[0], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[1], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[2], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[3], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[4], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[5], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[6], [%1], %5 \n" - MEMACCESS(1) - "ld2 {v0.b, v1.b}[7], [%1] \n" - - MEMACCESS(2) - "st1 {v0.d}[0], 
[%2] \n" - MEMACCESS(3) - "st1 {v1.d}[0], [%3] \n" - - "4: \n" - - : "=&r"(src_temp), // %0 - "+r"(src), // %1 - "+r"(dst_a), // %2 - "+r"(dst_b), // %3 - "+r"(width64) // %4 - : "r"(static_cast<ptrdiff_t>(src_stride)), // %5 - "r"(static_cast<ptrdiff_t>(dst_stride_a)), // %6 - "r"(static_cast<ptrdiff_t>(dst_stride_b)), // %7 - "r"(&kVTbl4x4TransposeDi) // %8 - : "memory", "cc", - "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v30", "v31" - ); + const uint8_t* src_temp; + asm volatile( + // loops are on blocks of 8. loop will stop when + // counter gets to or below 0. starting the counter + // at w-8 allow for this + "sub %w4, %w4, #8 \n" + + // handle 8x8 blocks. this should be the majority of the plane + "1: \n" + "mov %0, %1 \n" + + "ld1 {v0.16b}, [%0], %5 \n" + "ld1 {v1.16b}, [%0], %5 \n" + "ld1 {v2.16b}, [%0], %5 \n" + "ld1 {v3.16b}, [%0], %5 \n" + "ld1 {v4.16b}, [%0], %5 \n" + "ld1 {v5.16b}, [%0], %5 \n" + "ld1 {v6.16b}, [%0], %5 \n" + "ld1 {v7.16b}, [%0] \n" + + "trn1 v16.16b, v0.16b, v1.16b \n" + "trn2 v17.16b, v0.16b, v1.16b \n" + "trn1 v18.16b, v2.16b, v3.16b \n" + "trn2 v19.16b, v2.16b, v3.16b \n" + "trn1 v20.16b, v4.16b, v5.16b \n" + "trn2 v21.16b, v4.16b, v5.16b \n" + "trn1 v22.16b, v6.16b, v7.16b \n" + "trn2 v23.16b, v6.16b, v7.16b \n" + + "trn1 v0.8h, v16.8h, v18.8h \n" + "trn2 v1.8h, v16.8h, v18.8h \n" + "trn1 v2.8h, v20.8h, v22.8h \n" + "trn2 v3.8h, v20.8h, v22.8h \n" + "trn1 v4.8h, v17.8h, v19.8h \n" + "trn2 v5.8h, v17.8h, v19.8h \n" + "trn1 v6.8h, v21.8h, v23.8h \n" + "trn2 v7.8h, v21.8h, v23.8h \n" + + "trn1 v16.4s, v0.4s, v2.4s \n" + "trn2 v17.4s, v0.4s, v2.4s \n" + "trn1 v18.4s, v1.4s, v3.4s \n" + "trn2 v19.4s, v1.4s, v3.4s \n" + "trn1 v20.4s, v4.4s, v6.4s \n" + "trn2 v21.4s, v4.4s, v6.4s \n" + "trn1 v22.4s, v5.4s, v7.4s \n" + "trn2 v23.4s, v5.4s, v7.4s \n" + + "mov %0, %2 \n" + + "st1 {v16.d}[0], [%0], %6 \n" + "st1 {v18.d}[0], [%0], %6 \n" + "st1 {v17.d}[0], [%0], %6 \n" + "st1 {v19.d}[0], [%0], %6 \n" + "st1 {v16.d}[1], [%0], %6 \n" + "st1 {v18.d}[1], [%0], %6 \n" + "st1 {v17.d}[1], [%0], %6 \n" + "st1 {v19.d}[1], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v20.d}[0], [%0], %7 \n" + "st1 {v22.d}[0], [%0], %7 \n" + "st1 {v21.d}[0], [%0], %7 \n" + "st1 {v23.d}[0], [%0], %7 \n" + "st1 {v20.d}[1], [%0], %7 \n" + "st1 {v22.d}[1], [%0], %7 \n" + "st1 {v21.d}[1], [%0], %7 \n" + "st1 {v23.d}[1], [%0] \n" + + "add %1, %1, #16 \n" // src += 8*2 + "add %2, %2, %6, lsl #3 \n" // dst_a += 8 * + // dst_stride_a + "add %3, %3, %7, lsl #3 \n" // dst_b += 8 * + // dst_stride_b + "subs %w4, %w4, #8 \n" // w -= 8 + "b.ge 1b \n" + + // add 8 back to counter. if the result is 0 there are + // no residuals.
+ "adds %w4, %w4, #8 \n" + "b.eq 4f \n" + + // some residual, so between 1 and 7 lines left to transpose + "cmp %w4, #2 \n" + "b.lt 3f \n" + + "cmp %w4, #4 \n" + "b.lt 2f \n" + + // TODO(frkoenig): Clean this up + // 4x8 block + "mov %0, %1 \n" + "ld1 {v0.8b}, [%0], %5 \n" + "ld1 {v1.8b}, [%0], %5 \n" + "ld1 {v2.8b}, [%0], %5 \n" + "ld1 {v3.8b}, [%0], %5 \n" + "ld1 {v4.8b}, [%0], %5 \n" + "ld1 {v5.8b}, [%0], %5 \n" + "ld1 {v6.8b}, [%0], %5 \n" + "ld1 {v7.8b}, [%0] \n" + + "ld1 {v30.16b}, [%8], #16 \n" + "ld1 {v31.16b}, [%8] \n" + + "tbl v16.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b \n" + "tbl v17.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v31.16b \n" + "tbl v18.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v30.16b \n" + "tbl v19.16b, {v4.16b, v5.16b, v6.16b, v7.16b}, v31.16b \n" + + "mov %0, %2 \n" + + "st1 {v16.s}[0], [%0], %6 \n" + "st1 {v16.s}[1], [%0], %6 \n" + "st1 {v16.s}[2], [%0], %6 \n" + "st1 {v16.s}[3], [%0], %6 \n" + + "add %0, %2, #4 \n" + "st1 {v18.s}[0], [%0], %6 \n" + "st1 {v18.s}[1], [%0], %6 \n" + "st1 {v18.s}[2], [%0], %6 \n" + "st1 {v18.s}[3], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v17.s}[0], [%0], %7 \n" + "st1 {v17.s}[1], [%0], %7 \n" + "st1 {v17.s}[2], [%0], %7 \n" + "st1 {v17.s}[3], [%0], %7 \n" + + "add %0, %3, #4 \n" + "st1 {v19.s}[0], [%0], %7 \n" + "st1 {v19.s}[1], [%0], %7 \n" + "st1 {v19.s}[2], [%0], %7 \n" + "st1 {v19.s}[3], [%0] \n" + + "add %1, %1, #8 \n" // src += 4 * 2 + "add %2, %2, %6, lsl #2 \n" // dst_a += 4 * + // dst_stride_a + "add %3, %3, %7, lsl #2 \n" // dst_b += 4 * + // dst_stride_b + "subs %w4, %w4, #4 \n" // w -= 4 + "b.eq 4f \n" + + // some residual, check to see if it includes a 2x8 block, + // or less + "cmp %w4, #2 \n" + "b.lt 3f \n" + + // 2x8 block + "2: \n" + "mov %0, %1 \n" + "ld2 {v0.h, v1.h}[0], [%0], %5 \n" + "ld2 {v2.h, v3.h}[0], [%0], %5 \n" + "ld2 {v0.h, v1.h}[1], [%0], %5 \n" + "ld2 {v2.h, v3.h}[1], [%0], %5 \n" + "ld2 {v0.h, v1.h}[2], [%0], %5 \n" + "ld2 {v2.h, v3.h}[2], [%0], %5 \n" + "ld2 {v0.h, v1.h}[3], [%0], %5 \n" + "ld2 {v2.h, v3.h}[3], [%0] \n" + + "trn1 v4.8b, v0.8b, v2.8b \n" + "trn2 v5.8b, v0.8b, v2.8b \n" + "trn1 v6.8b, v1.8b, v3.8b \n" + "trn2 v7.8b, v1.8b, v3.8b \n" + + "mov %0, %2 \n" + + "st1 {v4.d}[0], [%0], %6 \n" + "st1 {v6.d}[0], [%0] \n" + + "mov %0, %3 \n" + + "st1 {v5.d}[0], [%0], %7 \n" + "st1 {v7.d}[0], [%0] \n" + + "add %1, %1, #4 \n" // src += 2 * 2 + "add %2, %2, %6, lsl #1 \n" // dst_a += 2 * + // dst_stride_a + "add %3, %3, %7, lsl #1 \n" // dst_b += 2 * + // dst_stride_b + "subs %w4, %w4, #2 \n" // w -= 2 + "b.eq 4f \n" + + // 1x8 block + "3: \n" + "ld2 {v0.b, v1.b}[0], [%1], %5 \n" + "ld2 {v0.b, v1.b}[1], [%1], %5 \n" + "ld2 {v0.b, v1.b}[2], [%1], %5 \n" + "ld2 {v0.b, v1.b}[3], [%1], %5 \n" + "ld2 {v0.b, v1.b}[4], [%1], %5 \n" + "ld2 {v0.b, v1.b}[5], [%1], %5 \n" + "ld2 {v0.b, v1.b}[6], [%1], %5 \n" + "ld2 {v0.b, v1.b}[7], [%1] \n" + + "st1 {v0.d}[0], [%2] \n" + "st1 {v1.d}[0], [%3] \n" + + "4: \n" + + : "=&r"(src_temp), // %0 + "+r"(src), // %1 + "+r"(dst_a), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "r"(static_cast(src_stride)), // %5 + "r"(static_cast(dst_stride_a)), // %6 + "r"(static_cast(dst_stride_b)), // %7 + "r"(&kVTbl4x4TransposeDi) // %8 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31"); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/libs/libvpx/third_party/libyuv/source/rotate_win.cc b/libs/libvpx/third_party/libyuv/source/rotate_win.cc index 
1300fc0feb8..e887dd525c7 100644 --- a/libs/libvpx/third_party/libyuv/source/rotate_win.cc +++ b/libs/libvpx/third_party/libyuv/source/rotate_win.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/rotate_row.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -17,17 +17,19 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) -__declspec(naked) -void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { +__declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, + int src_stride, + uint8_t* dst, + int dst_stride, + int width) { __asm { push edi push esi push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride mov edx, [esp + 12 + 12] // dst mov esi, [esp + 12 + 16] // dst_stride mov ecx, [esp + 12 + 20] // width @@ -110,18 +112,20 @@ void TransposeWx8_SSSE3(const uint8* src, int src_stride, } } -__declspec(naked) -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w) { +__declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, + int src_stride, + uint8_t* dst_a, + int dst_stride_a, + uint8_t* dst_b, + int dst_stride_b, + int w) { __asm { push ebx push esi push edi push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride mov edx, [esp + 16 + 12] // dst_a mov esi, [esp + 16 + 16] // dst_stride_a mov ebx, [esp + 16 + 20] // dst_b @@ -133,9 +137,9 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, mov ecx, [ecx + 16 + 28] // w align 4 - convertloop: // Read in the data from the source pointer. // First round of bit swap. + convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + edi] lea eax, [eax + 2 * edi] @@ -162,13 +166,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea eax, [eax + 2 * edi] movdqu [esp], xmm5 // backup xmm5 neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. + movdqa xmm5, xmm6 // use xmm5 as temp register. punpcklbw xmm6, xmm7 punpckhbw xmm5, xmm7 movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi - // Second round of bit swap. + // Second round of bit swap. movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 @@ -183,12 +187,13 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, movdqa xmm6, xmm5 movdqu xmm5, [esp] // restore xmm5 movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. + movdqa xmm6, xmm5 // use xmm6 as temp register. punpcklwd xmm5, xmm7 punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. + + // Third round of bit swap. + // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 @@ -200,7 +205,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm4 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. + movdqa xmm0, xmm2 // use xmm0 as the temp register. 
punpckldq xmm2, xmm6 movlpd qword ptr [edx], xmm2 movhpd qword ptr [ebx], xmm2 @@ -209,7 +214,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. + movdqa xmm0, xmm1 // use xmm0 as the temp register. punpckldq xmm1, xmm5 movlpd qword ptr [edx], xmm1 movhpd qword ptr [ebx], xmm1 @@ -218,7 +223,7 @@ void TransposeUVWx8_SSE2(const uint8* src, int src_stride, lea edx, [edx + 2 * esi] movhpd qword ptr [ebx + ebp], xmm0 lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. + movdqa xmm0, xmm3 // use xmm0 as the temp register. punpckldq xmm3, xmm7 movlpd qword ptr [edx], xmm3 movhpd qword ptr [ebx], xmm3 diff --git a/libs/libvpx/third_party/libyuv/source/row_any.cc b/libs/libvpx/third_party/libyuv/source/row_any.cc index 494164fd023..e91560c44c6 100644 --- a/libs/libvpx/third_party/libyuv/source/row_any.cc +++ b/libs/libvpx/third_party/libyuv/source/row_any.cc @@ -19,30 +19,38 @@ namespace libyuv { extern "C" { #endif +// memset for temp is meant to clear the source buffer (not dest) so that +// SIMD that reads full multiple of 16 bytes will not trigger msan errors. +// memset is not needed for production, as the garbage values are processed but +// not used, although there may be edge cases for subsampling. +// The size of the buffer is based on the largest read, which can be inferred +// by the source type (e.g. ARGB) and the mask (last parameter), or by examining +// the source code for how much the source pointers are advanced. + // Subsampled source needs to be increase by 1 of not even. #define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift)) // Any 4 planes to 1 with yuvconstants -#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - const uint8* a_buf, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 5]); \ - memset(temp, 0, 64 * 4); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 192, a_buf + n, r); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY41C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, const uint8_t* a_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 5]); \ + memset(temp, 0, 64 * 4); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 192, a_buf + n, r); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, temp + 256, \ + yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 256, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422ALPHATOARGBROW_SSSE3 
ANY41C(I422AlphaToARGBRow_Any_SSSE3, I422AlphaToARGBRow_SSSE3, 1, 0, 4, 7) @@ -53,36 +61,57 @@ ANY41C(I422AlphaToARGBRow_Any_AVX2, I422AlphaToARGBRow_AVX2, 1, 0, 4, 15) #ifdef HAS_I422ALPHATOARGBROW_NEON ANY41C(I422AlphaToARGBRow_Any_NEON, I422AlphaToARGBRow_NEON, 1, 0, 4, 7) #endif +#ifdef HAS_I422ALPHATOARGBROW_MSA +ANY41C(I422AlphaToARGBRow_Any_MSA, I422AlphaToARGBRow_MSA, 1, 0, 4, 7) +#endif #undef ANY41C // Any 3 planes to 1. -#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 4]); \ + memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ + SS(r, DUVSHIFT) * BPP); \ + } + +// Merge functions. +#ifdef HAS_MERGERGBROW_SSSE3 +ANY31(MergeRGBRow_Any_SSSE3, MergeRGBRow_SSSE3, 0, 0, 3, 15) +#endif +#ifdef HAS_MERGERGBROW_NEON +ANY31(MergeRGBRow_Any_NEON, MergeRGBRow_NEON, 0, 0, 3, 15) +#endif #ifdef HAS_I422TOYUY2ROW_SSE2 ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_AVX2 +ANY31(I422ToYUY2Row_Any_AVX2, I422ToYUY2Row_AVX2, 1, 1, 4, 31) +ANY31(I422ToUYVYRow_Any_AVX2, I422ToUYVYRow_AVX2, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOYUY2ROW_NEON ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOYUY2ROW_MSA +ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_I422TOUYVYROW_NEON ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif +#ifdef HAS_I422TOUYVYROW_MSA +ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31) +#endif #ifdef HAS_BLENDPLANEROW_AVX2 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31) #endif @@ -94,35 +123,38 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) // Note that odd width replication includes 444 due to implementation // on arm that subsamples 444 to 422 internally. 
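
All the ANY* wrappers in this file are generated from one pattern: run the SIMD kernel over the largest prefix whose width is a multiple of the kernel's granularity (width & ~MASK), copy the ragged tail into a zeroed, SIMD-aligned temp buffer, run the kernel once more on exactly MASK + 1 pixels, and copy back only the r valid output pixels. The memset is the msan point made in the comment above: it makes the kernel's full-width over-read of the staged source defined. A minimal sketch of one generated wrapper, with a hypothetical 16-pixel kernel standing in for the real row functions:

  #include <stdint.h>
  #include <string.h>

  enum { kMask = 15 };  // kernel processes 16 pixels per call

  // Stand-in for a SIMD row function that requires width % 16 == 0.
  static void CopyRow_SIMD(const uint8_t* src, uint8_t* dst, int width) {
    memcpy(dst, src, width);  // scalar here, for illustration only
  }

  static void CopyRow_Any(const uint8_t* src, uint8_t* dst, int width) {
    uint8_t temp[64 * 2];
    memset(temp, 0, 64);     // zero the staged source so over-reads are defined
    int r = width & kMask;   // leftover pixels
    int n = width & ~kMask;  // bulk the kernel can take directly
    if (n > 0) {
      CopyRow_SIMD(src, dst, n);
    }
    memcpy(temp, src + n, r);                  // stage the tail
    CopyRow_SIMD(temp, temp + 64, kMask + 1);  // one full-granularity pass
    memcpy(dst + n, temp + 64, r);             // commit only r valid pixels
  }
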
// Any 3 planes to 1 with yuvconstants -#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 4]); \ - memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n, r); \ - memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ - if (width & 1) { \ - temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \ - temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ - } \ - ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, \ - yuvconstants, MASK + 1); \ - memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \ - SS(r, DUVSHIFT) * BPP); \ - } +#define ANY31C(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ + const uint8_t* v_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r); \ + memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \ + if (width & 1) { \ + temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \ + temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \ + } \ + ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \ + SS(r, DUVSHIFT) * BPP); \ + } #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif -#ifdef HAS_I411TOARGBROW_SSSE3 -ANY31C(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_I422TOAR30ROW_AVX2 +ANY31C(I422ToAR30Row_Any_AVX2, I422ToAR30Row_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) @@ -130,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) -ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) +ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15) #endif // HAS_I444TOARGBROW_SSSE3 #ifdef HAS_I422TORGB24ROW_AVX2 -ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31) #endif #ifdef HAS_I422TOARGBROW_AVX2 ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) @@ -144,47 +176,87 @@ ANY31C(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #ifdef HAS_I444TOARGBROW_AVX2 ANY31C(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) #endif -#ifdef HAS_I411TOARGBROW_AVX2 -ANY31C(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) -#endif #ifdef HAS_I422TOARGB4444ROW_AVX2 -ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 
1, 0, 2, 7) +ANY31C(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TOARGB1555ROW_AVX2 -ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TORGB565ROW_AVX2 -ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 15) #endif #ifdef HAS_I422TOARGBROW_NEON ANY31C(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) ANY31C(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) -ANY31C(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) ANY31C(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) ANY31C(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) ANY31C(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) ANY31C(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) ANY31C(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #endif +#ifdef HAS_I422TOARGBROW_MSA +ANY31C(I444ToARGBRow_Any_MSA, I444ToARGBRow_MSA, 0, 0, 4, 7) +ANY31C(I422ToARGBRow_Any_MSA, I422ToARGBRow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGBARow_Any_MSA, I422ToRGBARow_MSA, 1, 0, 4, 7) +ANY31C(I422ToRGB24Row_Any_MSA, I422ToRGB24Row_MSA, 1, 0, 3, 15) +ANY31C(I422ToARGB4444Row_Any_MSA, I422ToARGB4444Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToARGB1555Row_Any_MSA, I422ToARGB1555Row_MSA, 1, 0, 2, 7) +ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) +#endif #undef ANY31C +// Any 3 planes of 16 bit to 1 with yuvconstants +// TODO(fbarchard): consider sharing this code with ANY31C +#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_SSSE3 +ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I210TOARGBROW_AVX2 +ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I210TOAR30ROW_AVX2 +ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#undef ANY31CT + // Any 2 planes to 1. 
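
ANY21, defined next, wraps kernels that read two source pointers and write one, the simplest being the MergeUVRow family, which interleaves separate U and V planes into the packed UV plane of NV12. As a reference for what those kernels compute, an illustrative scalar form (not the library's exact C fallback):

  #include <stdint.h>

  void MergeUV(const uint8_t* src_u, const uint8_t* src_v,
               uint8_t* dst_uv, int width) {
    for (int x = 0; x < width; ++x) {
      dst_uv[0] = src_u[x];  // U lands in the even bytes
      dst_uv[1] = src_v[x];  // V lands in the odd bytes
      dst_uv += 2;
    }
  }

The SBPP/SBPP2 parameters let the same macro serve kernels whose two inputs differ in bytes per pixel, e.g. the Sobel rows that combine two 1-byte planes into 4-byte ARGB.
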
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } // Merge functions. #ifdef HAS_MERGEUVROW_SSE2 @@ -196,6 +268,9 @@ ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) #ifdef HAS_MERGEUVROW_NEON ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #endif +#ifdef HAS_MERGEUVROW_MSA +ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) +#endif // Math functions. #ifdef HAS_ARGBMULTIPLYROW_SSE2 @@ -225,44 +300,61 @@ ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) #ifdef HAS_ARGBSUBTRACTROW_NEON ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) #endif +#ifdef HAS_ARGBMULTIPLYROW_MSA +ANY21(ARGBMultiplyRow_Any_MSA, ARGBMultiplyRow_MSA, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_MSA +ANY21(ARGBAddRow_Any_MSA, ARGBAddRow_MSA, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_MSA +ANY21(ARGBSubtractRow_Any_MSA, ARGBSubtractRow_MSA, 0, 4, 4, 4, 7) +#endif #ifdef HAS_SOBELROW_SSE2 ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELROW_NEON ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELROW_MSA +ANY21(SobelRow_Any_MSA, SobelRow_MSA, 0, 1, 1, 4, 15) +#endif #ifdef HAS_SOBELTOPLANEROW_SSE2 ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) #endif #ifdef HAS_SOBELTOPLANEROW_NEON ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) #endif +#ifdef HAS_SOBELTOPLANEROW_MSA +ANY21(SobelToPlaneRow_Any_MSA, SobelToPlaneRow_MSA, 0, 1, 1, 1, 31) +#endif #ifdef HAS_SOBELXYROW_SSE2 ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) #endif #ifdef HAS_SOBELXYROW_NEON ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) #endif +#ifdef HAS_SOBELXYROW_MSA +ANY21(SobelXYRow_Any_MSA, SobelXYRow_MSA, 0, 1, 1, 4, 15) +#endif #undef ANY21 // Any 2 planes to 1 with yuvconstants -#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* dst_ptr, const struct YuvConstants* yuvconstants, \ - int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - 
ANY_SIMD(temp, temp + 64, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 128, temp + 256, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ + } // Biplanar to RGB. #ifdef HAS_NV12TOARGBROW_SSSE3 @@ -274,6 +366,9 @@ ANY21C(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV12TOARGBROW_NEON ANY21C(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV12TOARGBROW_MSA +ANY21C(NV12ToARGBRow_Any_MSA, NV12ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif #ifdef HAS_NV21TOARGBROW_SSSE3 ANY21C(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif @@ -283,6 +378,27 @@ ANY21C(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #ifdef HAS_NV21TOARGBROW_NEON ANY21C(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif +#ifdef HAS_NV21TOARGBROW_MSA +ANY21C(NV21ToARGBRow_Any_MSA, NV21ToARGBRow_MSA, 1, 1, 2, 4, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_NEON +ANY21C(NV12ToRGB24Row_Any_NEON, NV12ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV21TORGB24ROW_NEON +ANY21C(NV21ToRGB24Row_Any_NEON, NV21ToRGB24Row_NEON, 1, 1, 2, 3, 7) +#endif +#ifdef HAS_NV12TORGB24ROW_SSSE3 +ANY21C(NV12ToRGB24Row_Any_SSSE3, NV12ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TORGB24ROW_SSSE3 +ANY21C(NV21ToRGB24Row_Any_SSSE3, NV21ToRGB24Row_SSSE3, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV12TORGB24ROW_AVX2 +ANY21C(NV12ToRGB24Row_Any_AVX2, NV12ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif +#ifdef HAS_NV21TORGB24ROW_AVX2 +ANY21C(NV21ToRGB24Row_Any_AVX2, NV21ToRGB24Row_AVX2, 1, 1, 2, 3, 31) +#endif #ifdef HAS_NV12TORGB565ROW_SSSE3 ANY21C(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif @@ -292,22 +408,25 @@ ANY21C(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #ifdef HAS_NV12TORGB565ROW_NEON ANY21C(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) #endif +#ifdef HAS_NV12TORGB565ROW_MSA +ANY21C(NV12ToRGB565Row_Any_MSA, NV12ToRGB565Row_MSA, 1, 1, 2, 2, 7) +#endif #undef ANY21C // Any 1 to 1. 
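
ANY11 below is the workhorse for one-plane-in, one-plane-out rows; its parameters encode the layout: UVSHIFT subsamples the source index, SBPP and BPP are source and destination bytes per pixel, and MASK + 1 is the kernel's pixel granularity. So ARGBToRGB565Row_Any_SSE2 with (0, 4, 2, 3) means 4-byte ARGB in, 2-byte RGB565 out, four pixels per SIMD call. The per-pixel work of that particular kernel is plain bit packing; a scalar sketch (illustrative, not the exact library fallback):

  #include <stdint.h>

  void ARGBToRGB565(const uint8_t* src_argb, uint8_t* dst_rgb, int width) {
    for (int x = 0; x < width; ++x) {
      uint16_t b = src_argb[0] >> 3;  // keep top 5 bits
      uint16_t g = src_argb[1] >> 2;  // keep top 6 bits
      uint16_t r = src_argb[2] >> 3;  // keep top 5 bits
      uint16_t pixel = (uint16_t)(b | (g << 5) | (r << 11));
      dst_rgb[0] = (uint8_t)(pixel & 0xff);
      dst_rgb[1] = (uint8_t)(pixel >> 8);
      src_argb += 4;  // the alpha byte src_argb[3] is dropped
      dst_rgb += 2;
    }
  }
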
-#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_COPYROW_AVX ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) @@ -325,6 +444,15 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif +#if defined(HAS_ARGBTORGB24ROW_AVX2) +ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORGB24ROW_AVX512VBMI) +ANY11(ARGBToRGB24Row_Any_AVX512VBMI, ARGBToRGB24Row_AVX512VBMI, 0, 4, 3, 31) +#endif +#if defined(HAS_ARGBTORAWROW_AVX2) +ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31) +#endif #if defined(HAS_ARGBTORGB565ROW_AVX2) ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) #endif @@ -332,6 +460,18 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif +#if defined(HAS_ARGBTOAR30ROW_AVX2) +ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) +#endif #if defined(HAS_J400TOARGBROW_SSE2) ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #endif @@ -372,9 +512,21 @@ ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif +#if defined(HAS_ARGBTORGB24ROW_MSA) +ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_MSA, ARGBToRAWRow_MSA, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) +ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) +#endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) #endif +#if defined(HAS_RAWTORGB24ROW_MSA) +ANY11(RAWToRGB24Row_Any_MSA, RAWToRGB24Row_MSA, 0, 3, 3, 15) +#endif #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 
4, 1, 31) #endif @@ -403,30 +555,57 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #ifdef HAS_ARGBTOYROW_NEON ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYROW_MSA +ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ARGBTOYJROW_NEON ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBTOYJROW_MSA +ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_BGRATOYROW_NEON ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_BGRATOYROW_MSA +ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_ABGRTOYROW_NEON ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_ABGRTOYROW_MSA +ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7) +#endif #ifdef HAS_RGBATOYROW_NEON ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif +#ifdef HAS_RGBATOYROW_MSA +ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15) +#endif #ifdef HAS_RGB24TOYROW_NEON ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RGB24TOYROW_MSA +ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RAWTOYROW_NEON ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif +#ifdef HAS_RAWTOYROW_MSA +ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15) +#endif #ifdef HAS_RGB565TOYROW_NEON ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_RGB565TOYROW_MSA +ANY11(RGB565ToYRow_Any_MSA, RGB565ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB1555TOYROW_NEON ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif +#ifdef HAS_ARGB1555TOYROW_MSA +ANY11(ARGB1555ToYRow_Any_MSA, ARGB1555ToYRow_MSA, 0, 2, 1, 15) +#endif #ifdef HAS_ARGB4444TOYROW_NEON ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif @@ -434,23 +613,44 @@ ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif #ifdef HAS_UYVYTOYROW_NEON -ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 1, 4, 1, 15) +#endif +#ifdef HAS_YUY2TOYROW_MSA +ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31) +#endif +#ifdef HAS_UYVYTOYROW_MSA +ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RGB24TOARGBROW_MSA +ANY11(RGB24ToARGBRow_Any_MSA, RGB24ToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RAWTOARGBROW_NEON ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif +#ifdef HAS_RAWTOARGBROW_MSA +ANY11(RAWToARGBRow_Any_MSA, RAWToARGBRow_MSA, 0, 3, 4, 15) +#endif #ifdef HAS_RGB565TOARGBROW_NEON ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_RGB565TOARGBROW_MSA +ANY11(RGB565ToARGBRow_Any_MSA, RGB565ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB1555TOARGBROW_NEON ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB1555TOARGBROW_MSA +ANY11(ARGB1555ToARGBRow_Any_MSA, ARGB1555ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGB4444TOARGBROW_NEON ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif +#ifdef HAS_ARGB4444TOARGBROW_MSA +ANY11(ARGB4444ToARGBRow_Any_MSA, ARGB4444ToARGBRow_MSA, 0, 2, 4, 15) +#endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif @@ -466,29 +666,38 @@ 
ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) #ifdef HAS_ARGBATTENUATEROW_NEON ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif +#ifdef HAS_ARGBATTENUATEROW_MSA +ANY11(ARGBAttenuateRow_Any_MSA, ARGBAttenuateRow_MSA, 0, 4, 4, 7) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +ANY11(ARGBExtractAlphaRow_Any_AVX2, ARGBExtractAlphaRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBEXTRACTALPHAROW_NEON ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_ARGBEXTRACTALPHAROW_MSA +ANY11(ARGBExtractAlphaRow_Any_MSA, ARGBExtractAlphaRow_MSA, 0, 4, 1, 15) +#endif #undef ANY11 // Any 1 to 1 blended. Destination is read, modify, write. -#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128 * 2); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - memcpy(temp + 128, dst_ptr + n * BPP, r * BPP); \ - ANY_SIMD(temp, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11B(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 64, dst_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #ifdef HAS_ARGBCOPYALPHAROW_AVX2 ANY11B(ARGBCopyAlphaRow_Any_AVX2, ARGBCopyAlphaRow_AVX2, 0, 4, 4, 15) @@ -506,61 +715,184 @@ ANY11B(ARGBCopyYToAlphaRow_Any_SSE2, ARGBCopyYToAlphaRow_SSE2, 0, 1, 4, 7) // Any 1 to 1 with parameter. 
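
ANY11P, defined next, extends the wrapper to kernels that take one extra parameter of type T: a scalar for the RGB565 dither constant, or a pointer for the 16-byte table driving ARGBShuffleRow. A scalar view of the shuffle case, assuming the usual 4-entry per-pixel map (illustrative):

  #include <stdint.h>

  // Reorder the four channels of each ARGB pixel per shuffler[0..3].
  void ARGBShuffle(const uint8_t* src_argb, uint8_t* dst_argb,
                   const uint8_t* shuffler, int width) {
    for (int x = 0; x < width; ++x) {
      dst_argb[0] = src_argb[shuffler[0]];
      dst_argb[1] = src_argb[shuffler[1]];
      dst_argb[2] = src_argb[shuffler[2]];
      dst_argb[3] = src_argb[shuffler[3]];
      src_argb += 4;
      dst_argb += 4;
    }
  }

With a map of {2, 1, 0, 3} this swaps the B and R bytes, so one shuffle kernel covers ARGB<->ABGR without a dedicated conversion.
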
#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - T shuffler, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ - } + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, T param, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, param, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ + } #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) -ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, - const uint32, 4, 2, 3) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, + ARGBToRGB565DitherRow_SSE2, + const uint32_t, + 4, + 2, + 3) #endif #if defined(HAS_ARGBTORGB565DITHERROW_AVX2) -ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, - const uint32, 4, 2, 7) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, + ARGBToRGB565DitherRow_AVX2, + const uint32_t, + 4, + 2, + 7) #endif #if defined(HAS_ARGBTORGB565DITHERROW_NEON) -ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, - const uint32, 4, 2, 7) -#endif -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -ANY11P(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, + ARGBToRGB565DitherRow_NEON, + const uint32_t, + 4, + 2, + 7) +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_MSA) +ANY11P(ARGBToRGB565DitherRow_Any_MSA, + ARGBToRGB565DitherRow_MSA, + const uint32_t, + 4, + 2, + 7) #endif #ifdef HAS_ARGBSHUFFLEROW_SSSE3 -ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8_t*, 4, 4, 7) #endif #ifdef HAS_ARGBSHUFFLEROW_AVX2 -ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8_t*, 4, 4, 15) #endif #ifdef HAS_ARGBSHUFFLEROW_NEON -ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8_t*, 4, 4, 3) +#endif +#ifdef HAS_ARGBSHUFFLEROW_MSA +ANY11P(ARGBShuffleRow_Any_MSA, ARGBShuffleRow_MSA, const uint8_t*, 4, 4, 7) #endif #undef ANY11P +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
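
The ANY11C variant below handles kernels whose source and destination element types differ; it exists for the 10-bit pipeline, where Convert16To8Row narrows 16-bit samples to 8 bits under a caller-chosen fixed-point scale and Convert8To16Row widens them back. A hedged scalar sketch of the narrowing direction (the exact rounding is the library's choice; this only shows the shape of the computation):

  #include <stdint.h>

  static inline uint8_t Clamp255(int32_t v) {
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  // scale is 16.16 fixed point; 16384 maps 10-bit input to 8-bit output
  // (1023 * 16384 >> 16 == 255).
  void Convert16To8(const uint16_t* src, uint8_t* dst, int scale, int width) {
    for (int x = 0; x < width; ++x) {
      dst[x] = Clamp255((src[x] * scale) >> 16);
    }
  }
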
+#define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ + SIMD_ALIGNED(STYPE temp[32]); \ + SIMD_ALIGNED(DTYPE out[32]); \ + memset(temp, 0, 32 * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, scale, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_CONVERT16TO8ROW_SSSE3 +ANY11C(Convert16To8Row_Any_SSSE3, + Convert16To8Row_SSSE3, + 2, + 1, + uint16_t, + uint8_t, + 15) +#endif +#ifdef HAS_CONVERT16TO8ROW_AVX2 +ANY11C(Convert16To8Row_Any_AVX2, + Convert16To8Row_AVX2, + 2, + 1, + uint16_t, + uint8_t, + 31) +#endif +#ifdef HAS_CONVERT8TO16ROW_SSE2 +ANY11C(Convert8To16Row_Any_SSE2, + Convert8To16Row_SSE2, + 1, + 2, + uint8_t, + uint16_t, + 15) +#endif +#ifdef HAS_CONVERT8TO16ROW_AVX2 +ANY11C(Convert8To16Row_Any_AVX2, + Convert8To16Row_AVX2, + 1, + 2, + uint8_t, + uint16_t, + 31) +#endif +#undef ANY11C + +// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. +#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ + void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ + SIMD_ALIGNED(ST temp[32]); \ + SIMD_ALIGNED(T out[32]); \ + memset(temp, 0, SBPP * 32); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, param, n); \ + } \ + memcpy(temp, src_ptr + n, r * SBPP); \ + ANY_SIMD(temp, out, param, MASK + 1); \ + memcpy(dst_ptr + n, out, r * BPP); \ + } + +#ifdef HAS_HALFFLOATROW_SSE2 +ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7) +#endif +#ifdef HAS_HALFFLOATROW_AVX2 +ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15) +#endif +#ifdef HAS_HALFFLOATROW_F16C +ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15) +ANY11P16(HalfFloat1Row_Any_F16C, + HalfFloat1Row_F16C, + uint16_t, + uint16_t, + 2, + 2, + 15) +#endif +#ifdef HAS_HALFFLOATROW_NEON +ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7) +ANY11P16(HalfFloat1Row_Any_NEON, + HalfFloat1Row_NEON, + uint16_t, + uint16_t, + 2, + 2, + 7) +#endif +#ifdef HAS_HALFFLOATROW_MSA +ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31) +#endif +#ifdef HAS_BYTETOFLOATROW_NEON +ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 3, 7) +#endif +#undef ANY11P16 + // Any 1 to 1 with yuvconstants -#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ - const struct YuvConstants* yuvconstants, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 2]); \ - memset(temp, 0, 128); /* for YUY2 and msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, yuvconstants, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ - ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, 
yuvconstants, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #if defined(HAS_YUY2TOARGBROW_SSSE3) ANY11C(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) ANY11C(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) @@ -573,25 +905,28 @@ ANY11C(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) ANY11C(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) ANY11C(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #endif +#if defined(HAS_YUY2TOARGBROW_MSA) +ANY11C(YUY2ToARGBRow_Any_MSA, YUY2ToARGBRow_MSA, 1, 4, 4, 7) +ANY11C(UYVYToARGBRow_Any_MSA, UYVYToARGBRow_MSA, 1, 4, 4, 7) +#endif #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, \ - int source_y_fraction) { \ - SIMD_ALIGNED(uint8 temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ - } \ - memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ - memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ - ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ - } +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ + SIMD_ALIGNED(uint8_t temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + } \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ + } #ifdef HAS_INTERPOLATEROW_AVX2 ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) @@ -602,25 +937,25 @@ ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #ifdef HAS_INTERPOLATEROW_NEON ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif -#ifdef HAS_INTERPOLATEROW_DSPR2 -ANY11T(InterpolateRow_Any_DSPR2, InterpolateRow_DSPR2, 1, 1, 3) +#ifdef HAS_INTERPOLATEROW_MSA +ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #undef ANY11T // Any 1 to 1 mirror. 
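
The mirror wrapper below slices the row differently from the other ANY macros: since output order is reversed, the SIMD kernel is pointed at src_ptr + r * BPP so its mirrored output lands at the start of dst, the leftover r pixels are taken from the beginning of the source row, and the valid bytes of the temp pass sit at its end (temp + 64 + (MASK + 1 - r) * BPP). The operation itself is just a reversed copy; in scalar form:

  #include <stdint.h>

  void MirrorRow(const uint8_t* src, uint8_t* dst, int width) {
    for (int x = 0; x < width; ++x) {
      dst[x] = src[width - 1 - x];  // last source pixel comes out first
    }
  }

ARGBMirrorRow is the same idea with 4-byte pixels reversed as whole units, which is why ANY11M takes BPP as a parameter.
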
-#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8 temp[64 * 2]); \ - memset(temp, 0, 64); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ - } \ - memcpy(temp, src_ptr, r * BPP); \ - ANY_SIMD(temp, temp + 64, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ - } +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ + } \ + memcpy(temp, src_ptr, r* BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ + } #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) @@ -631,6 +966,9 @@ ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #ifdef HAS_MIRRORROW_NEON ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) #endif +#ifdef HAS_MIRRORROW_MSA +ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) +#endif #ifdef HAS_ARGBMIRRORROW_AVX2 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif @@ -640,67 +978,54 @@ ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #ifdef HAS_ARGBMIRRORROW_NEON ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) #endif +#ifdef HAS_ARGBMIRRORROW_MSA +ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) +#endif #undef ANY11M // Any 1 plane. (memset) -#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, T v32, int width) { \ - SIMD_ALIGNED(uint8 temp[64]); \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(dst_ptr, v32, n); \ - } \ - ANY_SIMD(temp, v32, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp, r * BPP); \ - } +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8_t* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8_t temp[64]); \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(dst_ptr, v32, n); \ + } \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ + } #ifdef HAS_SETROW_X86 -ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) +ANY1(SetRow_Any_X86, SetRow_X86, uint8_t, 1, 3) #endif #ifdef HAS_SETROW_NEON -ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8_t, 1, 15) #endif #ifdef HAS_ARGBSETROW_NEON -ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32_t, 4, 3) +#endif +#ifdef HAS_ARGBSETROW_MSA +ANY1(ARGBSetRow_Any_MSA, ARGBSetRow_MSA, uint32_t, 4, 3) #endif #undef ANY1 // Any 1 to 2. Outputs UV planes. 
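
ANY12 below covers one source and two destination planes, the inverse of ANY21: SplitUVRow de-interleaves an NV12-style UV plane back into separate U and V, and the YUY2/UYVY variants pull chroma out of packed 4:2:2. The split itself in scalar form:

  #include <stdint.h>

  void SplitUV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
               int width) {
    for (int x = 0; x < width; ++x) {
      dst_u[x] = src_uv[0];  // even bytes -> U plane
      dst_v[x] = src_uv[1];  // odd bytes  -> V plane
      src_uv += 2;
    }
  }

Note that the macro body below also loses its 411-subsampler replication branches and the ARGBToUV411Row instantiation, consistent with I411 support being dropped from libyuv.
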
-#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ - void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ - SIMD_ALIGNED(uint8 temp[128 * 3]); \ - memset(temp, 0, 128); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - /* repeat last 4 bytes for 422 subsampler */ \ - if ((width & 1) && BPP == 4 && DUVSHIFT == 1) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - /* repeat last 4 - 12 bytes for 411 subsampler */ \ - if (((width & 3) == 1) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + SS(r, UVSHIFT) * BPP + BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP * 2); \ - } \ - if (((width & 3) == 2) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP * 2, BPP * 2); \ - } \ - if (((width & 3) == 3) && BPP == 4 && DUVSHIFT == 2) { \ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ - memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ - memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ - } +#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ + } #ifdef HAS_SPLITUVROW_SSE2 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) @@ -711,8 +1036,8 @@ ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #ifdef HAS_SPLITUVROW_NEON ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #endif -#ifdef HAS_SPLITUVROW_DSPR2 -ANY12(SplitUVRow_Any_DSPR2, SplitUVRow_DSPR2, 0, 2, 0, 15) +#ifdef HAS_SPLITUVROW_MSA +ANY12(SplitUVRow_Any_MSA, SplitUVRow_MSA, 0, 2, 0, 31) #endif #ifdef HAS_ARGBTOUV444ROW_SSSE3 ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) @@ -727,37 +1052,66 @@ ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_YUY2TOUV422ROW_NEON ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) -ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) #endif +#ifdef HAS_YUY2TOUV422ROW_MSA +ANY12(ARGBToUV444Row_Any_MSA, ARGBToUV444Row_MSA, 0, 4, 0, 15) +ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31) +#endif #undef ANY12 +// Any 1 to 3. Outputs RGB planes. 
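
ANY13 below is new in this update, added for SplitRGBRow: one packed RGB24 source feeding three planar destinations. A scalar equivalent of the kernel being wrapped (illustrative):

  #include <stdint.h>

  void SplitRGB(const uint8_t* src_rgb, uint8_t* dst_r, uint8_t* dst_g,
                uint8_t* dst_b, int width) {
    for (int x = 0; x < width; ++x) {
      dst_r[x] = src_rgb[0];
      dst_g[x] = src_rgb[1];
      dst_b[x] = src_rgb[2];
      src_rgb += 3;  // packed 3-byte pixels
    }
  }

The MergeRGBRow instantiations added earlier (under ANY31) perform the opposite interleave.
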
+#define ANY13(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_r, uint8_t* dst_g, \ + uint8_t* dst_b, int width) { \ + SIMD_ALIGNED(uint8_t temp[16 * 6]); \ + memset(temp, 0, 16 * 3); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_r, dst_g, dst_b, n); \ + } \ + memcpy(temp, src_ptr + n * BPP, r * BPP); \ + ANY_SIMD(temp, temp + 16 * 3, temp + 16 * 4, temp + 16 * 5, MASK + 1); \ + memcpy(dst_r + n, temp + 16 * 3, r); \ + memcpy(dst_g + n, temp + 16 * 4, r); \ + memcpy(dst_b + n, temp + 16 * 5, r); \ + } + +#ifdef HAS_SPLITRGBROW_SSSE3 +ANY13(SplitRGBRow_Any_SSSE3, SplitRGBRow_SSSE3, 3, 15) +#endif +#ifdef HAS_SPLITRGBROW_NEON +ANY13(SplitRGBRow_Any_NEON, SplitRGBRow_NEON, 3, 15) +#endif + // Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. // 128 byte row allows for 32 avx ARGB pixels. -#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ - uint8* dst_u, uint8* dst_v, int width) { \ - SIMD_ALIGNED(uint8 temp[128 * 4]); \ - memset(temp, 0, 128 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ - } \ - memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ - memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ - SS(r, UVSHIFT) * BPP); \ - if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */\ - memcpy(temp + SS(r, UVSHIFT) * BPP, \ - temp + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ - temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ - } \ - ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ - memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ - memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ - } +#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, \ + uint8_t* dst_v, int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ + } #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) @@ -783,30 +1137,57 @@ ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #ifdef HAS_ARGBTOUVROW_NEON ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVROW_MSA +ANY12S(ARGBToUVRow_Any_MSA, ARGBToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_NEON ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ARGBTOUVJROW_MSA +ANY12S(ARGBToUVJRow_Any_MSA, ARGBToUVJRow_MSA, 0, 4, 31) +#endif #ifdef HAS_BGRATOUVROW_NEON ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_BGRATOUVROW_MSA 
+ANY12S(BGRAToUVRow_Any_MSA, BGRAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_ABGRTOUVROW_NEON ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_ABGRTOUVROW_MSA +ANY12S(ABGRToUVRow_Any_MSA, ABGRToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGBATOUVROW_NEON ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif +#ifdef HAS_RGBATOUVROW_MSA +ANY12S(RGBAToUVRow_Any_MSA, RGBAToUVRow_MSA, 0, 4, 31) +#endif #ifdef HAS_RGB24TOUVROW_NEON ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RGB24TOUVROW_MSA +ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RAWTOUVROW_NEON ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif +#ifdef HAS_RAWTOUVROW_MSA +ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15) +#endif #ifdef HAS_RGB565TOUVROW_NEON ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_RGB565TOUVROW_MSA +ANY12S(RGB565ToUVRow_Any_MSA, RGB565ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB1555TOUVROW_NEON ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif +#ifdef HAS_ARGB1555TOUVROW_MSA +ANY12S(ARGB1555ToUVRow_Any_MSA, ARGB1555ToUVRow_MSA, 0, 2, 15) +#endif #ifdef HAS_ARGB4444TOUVROW_NEON ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif @@ -816,6 +1197,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15) #ifdef HAS_UYVYTOUVROW_NEON ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15) #endif +#ifdef HAS_YUY2TOUVROW_MSA +ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31) +#endif +#ifdef HAS_UYVYTOUVROW_MSA +ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31) +#endif #undef ANY12S #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/row_common.cc b/libs/libvpx/third_party/libyuv/source/row_common.cc index aefa38c4954..2bbc5adbf14 100644 --- a/libs/libvpx/third_party/libyuv/source/row_common.cc +++ b/libs/libvpx/third_party/libyuv/source/row_common.cc @@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include <stdio.h> #include <string.h> // For memcpy and memset. #include "libyuv/basic_types.h" @@ -23,59 +24,69 @@ extern "C" { #endif #define USE_BRANCHLESS 1 #if USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +static __inline int32_t clamp0(int32_t v) { return ((-(v) >> 31) & (v)); } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (((255 - (v)) >> 31) | (v)) & 255; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { int m = v >> 31; return (v + m) ^ m; } -#else // USE_BRANCHLESS -static __inline int32 clamp0(int32 v) { +#else // USE_BRANCHLESS +static __inline int32_t clamp0(int32_t v) { return (v < 0) ? 0 : v; } -static __inline int32 clamp255(int32 v) { +static __inline int32_t clamp255(int32_t v) { return (v > 255) ? 255 : v; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32_t clamp1023(int32_t v) { + return (v > 1023) ? 1023 : v; } -static __inline uint32 Abs(int32 v) { +static __inline uint32_t Abs(int32_t v) { return (v < 0) ?
-v : v; } #endif // USE_BRANCHLESS +static __inline uint32_t Clamp(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp255(v)); +} + +static __inline uint32_t Clamp10(int32_t val) { + int v = clamp0(val); + return (uint32_t)(clamp1023(v)); +} -#ifdef LIBYUV_LITTLE_ENDIAN -#define WRITEWORD(p, v) *(uint32*)(p) = v +// Little Endian +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) || defined(__arm__) || defined(_M_ARM) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define WRITEWORD(p, v) *(uint32_t*)(p) = v #else -static inline void WRITEWORD(uint8* p, uint32 v) { - p[0] = (uint8)(v & 255); - p[1] = (uint8)((v >> 8) & 255); - p[2] = (uint8)((v >> 16) & 255); - p[3] = (uint8)((v >> 24) & 255); +static inline void WRITEWORD(uint8_t* p, uint32_t v) { + p[0] = (uint8_t)(v & 255); + p[1] = (uint8_t)((v >> 8) & 255); + p[2] = (uint8_t)((v >> 16) & 255); + p[3] = (uint8_t)((v >> 24) & 255); } #endif -void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { +void RGB24ToARGBRow_C(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb24[0]; - uint8 g = src_rgb24[1]; - uint8 r = src_rgb24[2]; + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -85,12 +96,12 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { +void RAWToARGBRow_C(const uint8_t* src_raw, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -100,12 +111,12 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) { } } -void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { +void RAWToRGB24Row_C(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { int x; for (x = 0; x < width; ++x) { - uint8 r = src_raw[0]; - uint8 g = src_raw[1]; - uint8 b = src_raw[2]; + uint8_t r = src_raw[0]; + uint8_t g = src_raw[1]; + uint8_t b = src_raw[2]; dst_rgb24[0] = b; dst_rgb24[1] = g; dst_rgb24[2] = r; @@ -114,12 +125,14 @@ void RAWToRGB24Row_C(const uint8* src_raw, uint8* dst_rgb24, int width) { } } -void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { +void RGB565ToARGBRow_C(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 2) | (g >> 4); dst_argb[2] = (r << 3) | (r >> 2); @@ -129,14 +142,15 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) { } } -void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, +void ARGB1555ToARGBRow_C(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; - uint8 a = src_argb1555[1] >> 7; + uint8_t b = 
src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t a = src_argb1555[1] >> 7; dst_argb[0] = (b << 3) | (b >> 2); dst_argb[1] = (g << 3) | (g >> 2); dst_argb[2] = (r << 3) | (r >> 2); @@ -146,14 +160,15 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, } } -void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, +void ARGB4444ToARGBRow_C(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; - uint8 a = src_argb4444[1] >> 4; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; + uint8_t a = src_argb4444[1] >> 4; dst_argb[0] = (b << 4) | b; dst_argb[1] = (g << 4) | g; dst_argb[2] = (r << 4) | r; @@ -163,12 +178,53 @@ void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, } } -void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. + *(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24); + dst_argb += 4; + src_ar30 += 4; + } +} + +void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = (ar30 >> 2) & 0xff; + uint32_t g = (ar30 >> 12) & 0xff; + uint32_t r = (ar30 >> 22) & 0xff; + uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits. 
+ *(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24); + dst_abgr += 4; + src_ar30 += 4; + } +} + +void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t ar30 = *(const uint32_t*)src_ar30; + uint32_t b = ar30 & 0x3ff; + uint32_t ga = ar30 & 0xc00ffc00; + uint32_t r = (ar30 >> 20) & 0x3ff; + *(uint32_t*)(dst_ab30) = r | ga | (b << 20); + dst_ab30 += 4; + src_ar30 += 4; + } +} + +void ARGBToRGB24Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = b; dst_rgb[1] = g; dst_rgb[2] = r; @@ -177,12 +233,12 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb[0]; - uint8 g = src_argb[1]; - uint8 r = src_argb[2]; + uint8_t b = src_argb[0]; + uint8_t g = src_argb[1]; + uint8_t r = src_argb[2]; dst_rgb[0] = r; dst_rgb[1] = g; dst_rgb[2] = b; @@ -191,25 +247,25 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } -void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToRGB565Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 2; - uint8 r1 = src_argb[6] >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 2; + uint8_t r1 = src_argb[6] >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 2; - uint8 r0 = src_argb[2] >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 2; + uint8_t r0 = src_argb[2] >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } @@ -221,132 +277,160 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { // endian will not affect order of the original matrix. But the dither4 // will containing the first pixel in the lower byte for little endian // or the upper byte for big endian. 
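
The dithering described in the comment above is easiest to see on a single pixel: one byte of the 4-byte dither4 pattern is added to each 8-bit channel, clamped to 255, and only then truncated to 5/6/5 bits, so the quantization error varies over a 4-pixel period instead of banding. A minimal sketch, assuming <stdint.h>; the function name is illustrative, not a libyuv API:

/* Core of ARGBToRGB565DitherRow_C for one pixel at column x. */
static uint16_t DitherPixel565(const uint8_t argb[4], uint32_t dither4, int x) {
  int d = ((const uint8_t*)&dither4)[x & 3]; /* pattern repeats every 4 px */
  int b = argb[0] + d;
  int g = argb[1] + d;
  int r = argb[2] + d;
  if (b > 255) b = 255;
  if (g > 255) g = 255;
  if (r > 255) r = 255;
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
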
-void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { int x; for (x = 0; x < width - 1; x += 2) { int dither0 = ((const unsigned char*)(&dither4))[x & 3]; int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - uint8 b1 = clamp255(src_argb[4] + dither1) >> 3; - uint8 g1 = clamp255(src_argb[5] + dither1) >> 2; - uint8 r1 = clamp255(src_argb[6] + dither1) >> 3; - WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27)); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + uint8_t b1 = clamp255(src_argb[4] + dither1) >> 3; + uint8_t g1 = clamp255(src_argb[5] + dither1) >> 2; + uint8_t r1 = clamp255(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | + (r1 << 27)); dst_rgb += 4; src_argb += 8; } if (width & 1) { int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3]; - uint8 b0 = clamp255(src_argb[0] + dither0) >> 3; - uint8 g0 = clamp255(src_argb[1] + dither0) >> 2; - uint8 r0 = clamp255(src_argb[2] + dither0) >> 3; - *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + uint8_t b0 = clamp255(src_argb[0] + dither0) >> 3; + uint8_t g0 = clamp255(src_argb[1] + dither0) >> 2; + uint8_t r0 = clamp255(src_argb[2] + dither0) >> 3; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); } } -void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - uint8 b1 = src_argb[4] >> 3; - uint8 g1 = src_argb[5] >> 3; - uint8 r1 = src_argb[6] >> 3; - uint8 a1 = src_argb[7] >> 7; - *(uint32*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + uint8_t b1 = src_argb[4] >> 3; + uint8_t g1 = src_argb[5] >> 3; + uint8_t r1 = src_argb[6] >> 3; + uint8_t a1 = src_argb[7] >> 7; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15) | + (b1 << 16) | (g1 << 21) | (r1 << 26) | (a1 << 31); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 3; - uint8 g0 = src_argb[1] >> 3; - uint8 r0 = src_argb[2] >> 3; - uint8 a0 = src_argb[3] >> 7; - *(uint16*)(dst_rgb) = - b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); + uint8_t b0 = src_argb[0] >> 3; + uint8_t g0 = src_argb[1] >> 3; + uint8_t r0 = src_argb[2] >> 3; + uint8_t a0 = src_argb[3] >> 7; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 10) | (a0 << 15); } } -void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { +void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - uint8 b1 = src_argb[4] >> 4; - uint8 g1 = src_argb[5] >> 4; - uint8 r1 = src_argb[6] >> 
4; - uint8 a1 = src_argb[7] >> 4; - *(uint32*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + uint8_t b1 = src_argb[4] >> 4; + uint8_t g1 = src_argb[5] >> 4; + uint8_t r1 = src_argb[6] >> 4; + uint8_t a1 = src_argb[7] >> 4; + *(uint32_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12) | + (b1 << 16) | (g1 << 20) | (r1 << 24) | (a1 << 28); dst_rgb += 4; src_argb += 8; } if (width & 1) { - uint8 b0 = src_argb[0] >> 4; - uint8 g0 = src_argb[1] >> 4; - uint8 r0 = src_argb[2] >> 4; - uint8 a0 = src_argb[3] >> 4; - *(uint16*)(dst_rgb) = - b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + uint8_t b0 = src_argb[0] >> 4; + uint8_t g0 = src_argb[1] >> 4; + uint8_t r0 = src_argb[2] >> 4; + uint8_t a0 = src_argb[3] >> 4; + *(uint16_t*)(dst_rgb) = b0 | (g0 << 4) | (r0 << 8) | (a0 << 12); + } +} + +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t r0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = r0 | (g0 << 10) | (b0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } +} + +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); + uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); + uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); + uint32_t a0 = (src_argb[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_argb += 4; } } -static __inline int RGBToY(uint8 r, uint8 g, uint8 b) { - return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; +static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { + return (66 * r + 129 * g + 25 * b + 0x1080) >> 8; } -static __inline int RGBToU(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } -static __inline int RGBToV(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } -#define MAKEROWY(NAME, R, G, B, BPP) \ -void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = (src_rgb0[B] + src_rgb0[B + BPP] + \ - src_rgb1[B] + src_rgb1[B + BPP]) >> 2; \ - uint8 ag = (src_rgb0[G] + src_rgb0[G + BPP] + \ - src_rgb1[G] + src_rgb1[G + BPP]) >> 2; \ - uint8 ar = (src_rgb0[R] + src_rgb0[R + BPP] + \ - src_rgb1[R] + src_rgb1[R + BPP]) >> 2; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; 
\ - uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ - dst_u[0] = RGBToU(ar, ag, ab); \ - dst_v[0] = RGBToV(ar, ag, ab); \ - } \ -} +// ARGBToY_C and ARGBToUV_C +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP]) >> \ + 2; \ + uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP]) >> \ + 2; \ + uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP]) >> \ + 2; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ + uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ + uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + dst_u[0] = RGBToU(ar, ag, ab); \ + dst_v[0] = RGBToV(ar, ag, ab); \ + } \ + } MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -381,64 +465,65 @@ MAKEROWY(RAW, 0, 1, 2, 3) // g -0.41869 * 255 = -106.76595 = -107 // r 0.50000 * 255 = 127.5 = 127 -static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { - return (38 * r + 75 * g + 15 * b + 64) >> 7; +static __inline int RGBToYJ(uint8_t r, uint8_t g, uint8_t b) { + return (38 * r + 75 * g + 15 * b + 64) >> 7; } -static __inline int RGBToUJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToUJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * b - 84 * g - 43 * r + 0x8080) >> 8; } -static __inline int RGBToVJ(uint8 r, uint8 g, uint8 b) { +static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } #define AVGB(a, b) (((a) + (b) + 1) >> 1) -#define MAKEROWYJ(NAME, R, G, B, BPP) \ -void NAME ## ToYJRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \ - int x; \ - for (x = 0; x < width; ++x) { \ - dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ - src_argb0 += BPP; \ - dst_y += 1; \ - } \ -} \ -void NAME ## ToUVJRow_C(const uint8* src_rgb0, int src_stride_rgb, \ - uint8* dst_u, uint8* dst_v, int width) { \ - const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \ - int x; \ - for (x = 0; x < width - 1; x += 2) { \ - uint8 ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ - AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ - uint8 ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ - AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ - uint8 ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ - AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - src_rgb0 += BPP * 2; \ - src_rgb1 += BPP * 2; \ - dst_u += 1; \ - dst_v += 1; \ - } \ - if (width & 1) { \ - uint8 ab = AVGB(src_rgb0[B], src_rgb1[B]); \ - uint8 ag = AVGB(src_rgb0[G], src_rgb1[G]); \ - uint8 ar = AVGB(src_rgb0[R], src_rgb1[R]); \ - dst_u[0] = RGBToUJ(ar, ag, ab); \ - dst_v[0] = RGBToVJ(ar, ag, ab); \ - } \ -} +// ARGBToYJ_C and ARGBToUVJ_C +#define MAKEROWYJ(NAME, R, G, B, BPP) \ + void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; 
++x) { \ + dst_y[0] = RGBToYJ(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVJRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ + dst_u[0] = RGBToUJ(ar, ag, ab); \ + dst_v[0] = RGBToVJ(ar, ag, ab); \ + } \ + } MAKEROWYJ(ARGB, 2, 1, 0, 4) #undef MAKEROWYJ -void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { +void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_rgb565[0] & 0x1f; - uint8 g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r = src_rgb565[1] >> 3; + uint8_t b = src_rgb565[0] & 0x1f; + uint8_t g = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r = src_rgb565[1] >> 3; b = (b << 3) | (b >> 2); g = (g << 2) | (g >> 4); r = (r << 3) | (r >> 2); @@ -448,12 +533,12 @@ void RGB565ToYRow_C(const uint8* src_rgb565, uint8* dst_y, int width) { } } -void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { +void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb1555[0] & 0x1f; - uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b = src_argb1555[0] & 0x1f; + uint8_t g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r = (src_argb1555[1] & 0x7c) >> 2; b = (b << 3) | (b >> 2); g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -463,12 +548,12 @@ void ARGB1555ToYRow_C(const uint8* src_argb1555, uint8* dst_y, int width) { } } -void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { +void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width) { int x; for (x = 0; x < width; ++x) { - uint8 b = src_argb4444[0] & 0x0f; - uint8 g = src_argb4444[0] >> 4; - uint8 r = src_argb4444[1] & 0x0f; + uint8_t b = src_argb4444[0] & 0x0f; + uint8_t g = src_argb4444[0] >> 4; + uint8_t r = src_argb4444[1] & 0x0f; b = (b << 4) | b; g = (g << 4) | g; r = (r << 4) | r; @@ -478,26 +563,29 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { } } -void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; +void RGB565ToUVRow_C(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_rgb565 = src_rgb565 + src_stride_rgb565; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b1 = 
src_rgb565[2] & 0x1f; - uint8 g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); - uint8 r1 = src_rgb565[3] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b3 = next_rgb565[2] & 0x1f; - uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); - uint8 r3 = next_rgb565[3] >> 3; - uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b1 = src_rgb565[2] & 0x1f; + uint8_t g1 = (src_rgb565[2] >> 5) | ((src_rgb565[3] & 0x07) << 3); + uint8_t r1 = src_rgb565[3] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b3 = next_rgb565[2] & 0x1f; + uint8_t g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); + uint8_t r3 = next_rgb565[3] >> 3; + uint8_t b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 787 -> 888. r = (r << 1) | (r >> 6); dst_u[0] = RGBToU(r, g, b); @@ -508,15 +596,15 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, dst_v += 1; } if (width & 1) { - uint8 b0 = src_rgb565[0] & 0x1f; - uint8 g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); - uint8 r0 = src_rgb565[1] >> 3; - uint8 b2 = next_rgb565[0] & 0x1f; - uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); - uint8 r2 = next_rgb565[1] >> 3; - uint8 b = (b0 + b2); // 565 * 2 = 676. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_rgb565[0] & 0x1f; + uint8_t g0 = (src_rgb565[0] >> 5) | ((src_rgb565[1] & 0x07) << 3); + uint8_t r0 = src_rgb565[1] >> 3; + uint8_t b2 = next_rgb565[0] & 0x1f; + uint8_t g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); + uint8_t r2 = next_rgb565[1] >> 3; + uint8_t b = (b0 + b2); // 565 * 2 = 676. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 676 -> 888 g = (g << 1) | (g >> 6); r = (r << 2) | (r >> 4); @@ -525,26 +613,29 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, } } -void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; +void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb1555 = src_argb1555 + src_stride_argb1555; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b1 = src_argb1555[2] & 0x1f; - uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); - uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; - uint8 b3 = next_argb1555[2] & 0x1f; - uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); - uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; - uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. 
- uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b1 = src_argb1555[2] & 0x1f; + uint8_t g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8_t r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8_t b3 = next_argb1555[2] & 0x1f; + uint8_t g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8_t r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8_t b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 1) | (b >> 6); // 777 -> 888. g = (g << 1) | (g >> 6); r = (r << 1) | (r >> 6); @@ -556,15 +647,15 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb1555[0] & 0x1f; - uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); - uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; - uint8 b2 = next_argb1555[0] & 0x1f; - uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); - uint8 r2 = next_argb1555[1] >> 3; - uint8 b = (b0 + b2); // 555 * 2 = 666. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb1555[0] & 0x1f; + uint8_t g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8_t r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8_t b2 = next_argb1555[0] & 0x1f; + uint8_t g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8_t r2 = next_argb1555[1] >> 3; + uint8_t b = (b0 + b2); // 555 * 2 = 666. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -573,26 +664,29 @@ void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, } } -void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; +void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* next_argb4444 = src_argb4444 + src_stride_argb4444; int x; for (x = 0; x < width - 1; x += 2) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b1 = src_argb4444[2] & 0x0f; - uint8 g1 = src_argb4444[2] >> 4; - uint8 r1 = src_argb4444[3] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b3 = next_argb4444[2] & 0x0f; - uint8 g3 = next_argb4444[2] >> 4; - uint8 r3 = next_argb4444[3] & 0x0f; - uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. - uint8 g = (g0 + g1 + g2 + g3); - uint8 r = (r0 + r1 + r2 + r3); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b1 = src_argb4444[2] & 0x0f; + uint8_t g1 = src_argb4444[2] >> 4; + uint8_t r1 = src_argb4444[3] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b3 = next_argb4444[2] & 0x0f; + uint8_t g3 = next_argb4444[2] >> 4; + uint8_t r3 = next_argb4444[3] & 0x0f; + uint8_t b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. 
+ uint8_t g = (g0 + g1 + g2 + g3); + uint8_t r = (r0 + r1 + r2 + r3); b = (b << 2) | (b >> 4); // 666 -> 888. g = (g << 2) | (g >> 4); r = (r << 2) | (r >> 4); @@ -604,15 +698,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, dst_v += 1; } if (width & 1) { - uint8 b0 = src_argb4444[0] & 0x0f; - uint8 g0 = src_argb4444[0] >> 4; - uint8 r0 = src_argb4444[1] & 0x0f; - uint8 b2 = next_argb4444[0] & 0x0f; - uint8 g2 = next_argb4444[0] >> 4; - uint8 r2 = next_argb4444[1] & 0x0f; - uint8 b = (b0 + b2); // 444 * 2 = 555. - uint8 g = (g0 + g2); - uint8 r = (r0 + r2); + uint8_t b0 = src_argb4444[0] & 0x0f; + uint8_t g0 = src_argb4444[0] >> 4; + uint8_t r0 = src_argb4444[1] & 0x0f; + uint8_t b2 = next_argb4444[0] & 0x0f; + uint8_t g2 = next_argb4444[0] >> 4; + uint8_t r2 = next_argb4444[1] & 0x0f; + uint8_t b = (b0 + b2); // 444 * 2 = 555. + uint8_t g = (g0 + g2); + uint8_t r = (r0 + r2); b = (b << 3) | (b >> 2); // 555 -> 888. g = (g << 3) | (g >> 2); r = (r << 3) | (r >> 2); @@ -621,13 +715,15 @@ void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, } } -void ARGBToUV444Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUV444Row_C(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width; ++x) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; + uint8_t ab = src_argb[0]; + uint8_t ag = src_argb[1]; + uint8_t ar = src_argb[2]; dst_u[0] = RGBToU(ar, ag, ab); dst_v[0] = RGBToV(ar, ag, ab); src_argb += 4; @@ -636,45 +732,10 @@ void ARGBToUV444Row_C(const uint8* src_argb, } } -void ARGBToUV411Row_C(const uint8* src_argb, - uint8* dst_u, uint8* dst_v, int width) { - int x; - for (x = 0; x < width - 3; x += 4) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[12]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[13]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[14]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - src_argb += 16; - dst_u += 1; - dst_v += 1; - } - // Odd width handling mimics 'any' function which replicates last pixel. - if ((width & 3) == 3) { - uint8 ab = (src_argb[0] + src_argb[4] + src_argb[8] + src_argb[8]) >> 2; - uint8 ag = (src_argb[1] + src_argb[5] + src_argb[9] + src_argb[9]) >> 2; - uint8 ar = (src_argb[2] + src_argb[6] + src_argb[10] + src_argb[10]) >> 2; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 2) { - uint8 ab = (src_argb[0] + src_argb[4]) >> 1; - uint8 ag = (src_argb[1] + src_argb[5]) >> 1; - uint8 ar = (src_argb[2] + src_argb[6]) >> 1; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } else if ((width & 3) == 1) { - uint8 ab = src_argb[0]; - uint8 ag = src_argb[1]; - uint8 ar = src_argb[2]; - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); - } -} - -void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBGrayRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { - uint8 y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); + uint8_t y = RGBToYJ(src_argb[2], src_argb[1], src_argb[0]); dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = src_argb[3]; dst_argb += 4; @@ -683,7 +744,7 @@ void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } // Convert a row of image to Sepia tone. 
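
ARGBGrayRow_C above works because RGBToYJ's Q7 weights sum to exactly 128 (38 + 75 + 15), i.e. 1.0 in fixed point, with +64 for rounding, so neutral grays map to themselves. A throwaway check, not part of the patch:

#include <stdio.h>

static int RGBToYJ_check(int r, int g, int b) {
  return (38 * r + 75 * g + 15 * b + 64) >> 7; /* same math as RGBToYJ */
}

int main(void) {
  /* prints "0 128 255": black, mid-gray and white survive unchanged */
  printf("%d %d %d\n", RGBToYJ_check(0, 0, 0), RGBToYJ_check(128, 128, 128),
         RGBToYJ_check(255, 255, 255));
  return 0;
}
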
-void ARGBSepiaRow_C(uint8* dst_argb, int width) { +void ARGBSepiaRow_C(uint8_t* dst_argb, int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -702,22 +763,28 @@ void ARGBSepiaRow_C(uint8* dst_argb, int width) { // Apply color matrix to a row of image. Matrix is signed. // TODO(fbarchard): Consider adding rounding (+32). -void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { +void ARGBColorMatrixRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = src_argb[0]; int g = src_argb[1]; int r = src_argb[2]; int a = src_argb[3]; - int sb = (b * matrix_argb[0] + g * matrix_argb[1] + - r * matrix_argb[2] + a * matrix_argb[3]) >> 6; - int sg = (b * matrix_argb[4] + g * matrix_argb[5] + - r * matrix_argb[6] + a * matrix_argb[7]) >> 6; - int sr = (b * matrix_argb[8] + g * matrix_argb[9] + - r * matrix_argb[10] + a * matrix_argb[11]) >> 6; - int sa = (b * matrix_argb[12] + g * matrix_argb[13] + - r * matrix_argb[14] + a * matrix_argb[15]) >> 6; + int sb = (b * matrix_argb[0] + g * matrix_argb[1] + r * matrix_argb[2] + + a * matrix_argb[3]) >> + 6; + int sg = (b * matrix_argb[4] + g * matrix_argb[5] + r * matrix_argb[6] + + a * matrix_argb[7]) >> + 6; + int sr = (b * matrix_argb[8] + g * matrix_argb[9] + r * matrix_argb[10] + + a * matrix_argb[11]) >> + 6; + int sa = (b * matrix_argb[12] + g * matrix_argb[13] + r * matrix_argb[14] + + a * matrix_argb[15]) >> + 6; dst_argb[0] = Clamp(sb); dst_argb[1] = Clamp(sg); dst_argb[2] = Clamp(sr); @@ -728,7 +795,9 @@ void ARGBColorMatrixRow_C(const uint8* src_argb, uint8* dst_argb, } // Apply color table to a row of image. -void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void ARGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -744,7 +813,9 @@ void ARGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } // Apply color table to a row of image. 
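
ARGBColorMatrixRow_C above reads matrix_argb as a signed 4x4 matrix in Q6 fixed point: 64 means 1.0, each output channel is one matrix row applied to (b, g, r, a), and each result is shifted right by 6 and clamped. For illustration only (this table is not in libyuv), the identity transform under that convention is:

/* Q6 identity for ARGBColorMatrixRow_C; coefficients are int8_t, so the
 * usable range is roughly -1.98 .. +1.98. */
static const int8_t kIdentityMatrixQ6[16] = {
    64, 0,  0,  0,  /* B' = 1.0 * B */
    0,  64, 0,  0,  /* G' = 1.0 * G */
    0,  0,  64, 0,  /* R' = 1.0 * R */
    0,  0,  0,  64, /* A' = 1.0 * A */
};
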
-void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_C(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -757,8 +828,11 @@ void RGBColorTableRow_C(uint8* dst_argb, const uint8* table_argb, int width) { } } -void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { +void ARGBQuantizeRow_C(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { int x; for (x = 0; x < width; ++x) { int b = dst_argb[0]; @@ -772,21 +846,23 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size, } #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 24 +#define SHADE(f, v) v* f >> 24 -void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - const uint32 b_scale = REPEAT8(value & 0xff); - const uint32 g_scale = REPEAT8((value >> 8) & 0xff); - const uint32 r_scale = REPEAT8((value >> 16) & 0xff); - const uint32 a_scale = REPEAT8(value >> 24); +void ARGBShadeRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + const uint32_t b_scale = REPEAT8(value & 0xff); + const uint32_t g_scale = REPEAT8((value >> 8) & 0xff); + const uint32_t r_scale = REPEAT8((value >> 16) & 0xff); + const uint32_t a_scale = REPEAT8(value >> 24); int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb[0]); - const uint32 g = REPEAT8(src_argb[1]); - const uint32 r = REPEAT8(src_argb[2]); - const uint32 a = REPEAT8(src_argb[3]); + const uint32_t b = REPEAT8(src_argb[0]); + const uint32_t g = REPEAT8(src_argb[1]); + const uint32_t r = REPEAT8(src_argb[2]); + const uint32_t a = REPEAT8(src_argb[3]); dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -799,20 +875,22 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width, #undef SHADE #define REPEAT8(v) (v) | ((v) << 8) -#define SHADE(f, v) v * f >> 16 +#define SHADE(f, v) v* f >> 16 -void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBMultiplyRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - const uint32 b = REPEAT8(src_argb0[0]); - const uint32 g = REPEAT8(src_argb0[1]); - const uint32 r = REPEAT8(src_argb0[2]); - const uint32 a = REPEAT8(src_argb0[3]); - const uint32 b_scale = src_argb1[0]; - const uint32 g_scale = src_argb1[1]; - const uint32 r_scale = src_argb1[2]; - const uint32 a_scale = src_argb1[3]; + const uint32_t b = REPEAT8(src_argb0[0]); + const uint32_t g = REPEAT8(src_argb0[1]); + const uint32_t r = REPEAT8(src_argb0[2]); + const uint32_t a = REPEAT8(src_argb0[3]); + const uint32_t b_scale = src_argb1[0]; + const uint32_t g_scale = src_argb1[1]; + const uint32_t r_scale = src_argb1[2]; + const uint32_t a_scale = src_argb1[3]; dst_argb[0] = SHADE(b, b_scale); dst_argb[1] = SHADE(g, g_scale); dst_argb[2] = SHADE(r, r_scale); @@ -827,8 +905,10 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp255(v + f) -void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBAddRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -852,8 +932,10 @@ void 
ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1, #define SHADE(f, v) clamp0(f - v) -void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBSubtractRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { const int b = src_argb0[0]; @@ -876,8 +958,11 @@ void ARGBSubtractRow_C(const uint8* src_argb0, const uint8* src_argb1, #undef SHADE // Sobel functions which mimics SSSE3. -void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, - uint8* dst_sobelx, int width) { +void SobelXRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i]; @@ -890,12 +975,14 @@ void SobelXRow_C(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobelx[i] = (uint8)(clamp255(sobel)); + dst_sobelx[i] = (uint8_t)(clamp255(sobel)); } } -void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +void SobelYRow_C(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { int i; for (i = 0; i < width; ++i) { int a = src_y0[i + 0]; @@ -908,56 +995,62 @@ void SobelYRow_C(const uint8* src_y0, const uint8* src_y1, int b_diff = b - b_sub; int c_diff = c - c_sub; int sobel = Abs(a_diff + b_diff * 2 + c_diff); - dst_sobely[i] = (uint8)(clamp255(sobel)); + dst_sobely[i] = (uint8_t)(clamp255(sobel)); } } -void SobelRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_argb[0] = (uint8)(s); - dst_argb[1] = (uint8)(s); - dst_argb[2] = (uint8)(s); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(s); + dst_argb[1] = (uint8_t)(s); + dst_argb[2] = (uint8_t)(s); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void SobelToPlaneRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +void SobelToPlaneRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int s = clamp255(r + b); - dst_y[i] = (uint8)(s); + dst_y[i] = (uint8_t)(s); } } -void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +void SobelXYRow_C(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { int r = src_sobelx[i]; int b = src_sobely[i]; int g = clamp255(r + b); - dst_argb[0] = (uint8)(b); - dst_argb[1] = (uint8)(g); - dst_argb[2] = (uint8)(r); - dst_argb[3] = (uint8)(255u); + dst_argb[0] = (uint8_t)(b); + dst_argb[1] = (uint8_t)(g); + dst_argb[2] = (uint8_t)(r); + dst_argb[3] = (uint8_t)(255u); dst_argb += 4; } } -void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { +void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { // Copy a Y to RGB. 
int x; for (x = 0; x < width; ++x) { - uint8 y = src_y[0]; + uint8_t y = src_y[0]; dst_argb[2] = dst_argb[1] = dst_argb[0] = y; dst_argb[3] = 255u; dst_argb += 4; @@ -974,75 +1067,69 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) { // B = (Y - 16) * 1.164 - U * -2.018 // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // U and V contributions to R,G,B. #define UB -128 /* max(-128, round(-2.018 * 64)) */ -#define UG 25 /* round(0.391 * 64) */ -#define VG 52 /* round(0.813 * 64) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ #define VR -102 /* round(-1.596 * 64) */ // Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) // 64 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) // 32 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, 
BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { - { VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1062,74 +1149,68 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { // Y contribution to R,G,B. Scale and bias. #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YGB 32 /* 64 / 2 */ // U and V contributions to R,G,B. #define UB -113 /* round(-1.77200 * 64) */ -#define UG 22 /* round(0.34414 * 64) */ -#define VG 46 /* round(0.71414 * 64) */ -#define VR -90 /* round(-1.40200 * 64) */ +#define UG 22 /* round(0.34414 * 64) */ +#define VG 46 /* round(0.71414 * 64) */ +#define VR -90 /* round(-1.40200 * 64) */ // Bias values to round, and subtract 128 from U and V. 
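
The Q6 constants above can be re-derived from the full-range JPEG coefficients quoted in their comments; a standalone check, not part of the patch (lround performs the round() the comments refer to):

#include <math.h>
#include <stdio.h>

int main(void) {
  printf("YG=%ld\n", lround(1.000 * 64 * 256 * 256 / 257)); /* 16320 */
  printf("UB=%ld\n", lround(-1.77200 * 64));                /* -113 */
  printf("UG=%ld\n", lround(0.34414 * 64));                 /* 22 */
  printf("VG=%ld\n", lround(0.71414 * 64));                 /* 46 */
  printf("VR=%ld\n", lround(-1.40200 * 64));                /* -90 */
  return 0;
}
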
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { - { VR, 
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1143,81 +1224,76 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { #undef YG // BT.709 YUV to RGB reference -// * R = Y - V * -1.28033 -// * G = Y - U * 0.21482 - V * 0.38059 -// * B = Y - U * -2.12798 +// R = (Y - 16) * 1.164 - V * -1.793 +// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533 +// B = (Y - 16) * 1.164 - U * -2.112 +// See also http://www.equasys.de/colorconversion.html // Y contribution to R,G,B. Scale and bias. -#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ -#define YGB 32 /* 64 / 2 */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ -// TODO(fbarchard): Find way to express 2.12 instead of 2.0. +// TODO(fbarchard): Find way to express 2.112 instead of 2.0. // U and V contributions to R,G,B. -#define UB -128 /* max(-128, round(-2.12798 * 64)) */ -#define UG 14 /* round(0.21482 * 64) */ -#define VG 24 /* round(0.38059 * 64) */ -#define VR -82 /* round(-1.28033 * 64) */ +#define UB -128 /* max(-128, round(-2.112 * 64)) */ +#define UG 14 /* round(0.213 * 64) */ +#define VG 34 /* round(0.533 * 64) */ +#define VR -115 /* round(-1.793 * 64) */ // Bias values to round, and subtract 128 from U and V. 
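
With the updated BT.709 constants above, the three bias defines that follow fold the -16 Y offset and the -128 U/V offsets into single per-channel constants. Worked out (illustrative comment only):

/* BB = UB * 128 + YGB            = -128 * 128 - 1160      = -17544
 * BG = UG * 128 + VG * 128 + YGB = (14 + 34) * 128 - 1160 =   4984
 * BR = VR * 128 + YGB            = -115 * 128 - 1160      = -15880
 * so the per-pixel code never subtracts 16 from Y or 128 from U/V directly. */
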
-#define BB (UB * 128 + YGB) +#define BB (UB * 128 + YGB) #define BG (UG * 128 + VG * 128 + YGB) -#define BR (VR * 128 + YGB) +#define BR (VR * 128 + YGB) #if defined(__aarch64__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { -UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { UG, VG, UG, VG, UG, VG, UG, VG }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {UG, VG, UG, VG, UG, VG, UG, VG}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { -VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { VG, UG, VG, UG, VG, UG, VG, UG }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {VG, UG, VG, UG, VG, UG, VG, UG}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { -UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0 }, - { UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BB, BG, BR, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, + {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BB, BG, BR, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { -VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0 }, - { VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0 }, - { BR, BG, BB, 0, 0, 0, 0, 0 }, - { 0x0101 * YG, 0, 0, 0 } -}; + {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, + {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, + {BR, BG, BB, 0, 0, 0, 0, 0}, + {0x0101 * YG, 0, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { - { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, - UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0 }, - { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, - UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, - { 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, - 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, + UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, + {UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, + UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, + {0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, + 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { - { VR, 
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, - VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0 }, - { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, - VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, - { 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, - 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB }, - { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }, - { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG }, - { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB }, - { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } -}; + {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, + VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, + {VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, + {0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, + 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, + {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, + {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, + {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; #endif #undef BB @@ -1231,8 +1307,14 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #undef YG // C reference code that mimics the YUV assembly. -static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r, +// Reads 8 bit YUV and clamps down to 8 bit RGB. + +static __inline void YuvPixel(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; @@ -1263,22 +1345,129 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, int yg = yuvconstants->kYToRgb[0]; #endif - uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; - *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6); - *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32) (-(v * vr) + y1 + br) >> 6); + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp((int32_t)(-(u * ub) + y1 + bb) >> 6); + *g = Clamp((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6); + *r = Clamp((int32_t)(-(v * vr) + y1 + br) >> 6); +} + +// Reads 8 bit YUV and leaves result as 16 bit.
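A standalone sketch (not part of this patch) of the 10.6 fixed-point arithmetic YuvPixel performs: Y is replicated into 16 bits (y * 0x0101), multiplied by the Y gain, biased, and the sum is shifted down 6 with an 8-bit clamp. The BT.601 constants below (UB/UG/VG/VR plus the YG/YGB quoted elsewhere in this file) are copied in as assumptions so the sketch is self-contained.

#include <stdint.h>
#include <stdio.h>

static uint8_t clamp8(int32_t v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

int main(void) {
  const int ub = -128, ug = 25, vg = 52, vr = -102;  // BT.601 values, assumed
  const int yg = 18997, ygb = -1160;                 // quoted in this file
  const int bb = ub * 128 + ygb;                     // same BB/BG/BR pattern
  const int bg = ug * 128 + vg * 128 + ygb;          // as the defines above
  const int br = vr * 128 + ygb;
  uint8_t y = 82, u = 90, v = 240;  // studio-range red
  uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16;
  uint8_t b8 = clamp8((int32_t)(-(u * ub) + y1 + bb) >> 6);
  uint8_t g8 = clamp8((int32_t)(-(u * ug + v * vg) + y1 + bg) >> 6);
  uint8_t r8 = clamp8((int32_t)(-(v * vr) + y1 + br) >> 6);
  printf("R=%d G=%d B=%d\n", r8, g8, b8);  // expect roughly (255, 1, 1)
  return 0;
}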
+static __inline void YuvPixel8_16(uint8_t y, + uint8_t u, + uint8_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel16(int16_t y, + int16_t u, + int16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. +static __inline void YuvPixel10(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); } // Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ // C reference code that mimics the YUV assembly. 
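The reason YuvPixel16 leaves its result in 16 bits is that one intermediate can then feed two output depths: shifting down 6 with an 8-bit clamp gives ARGB (the YuvPixel10 path), while shifting down only 4 with a 10-bit clamp gives AR30 (the StoreAR30 helper further below). A standalone sketch of that bookkeeping, using an assumed intermediate value:

#include <stdint.h>
#include <stdio.h>

static int clamp_to(int v, int hi) { return v < 0 ? 0 : (v > hi ? hi : v); }

int main(void) {
  // Example 10.6 fixed-point channel value, as YuvPixel16 would produce it.
  int fixed_10_6 = 16347;                        // about 255.4
  int bits8 = clamp_to(fixed_10_6 >> 6, 255);    // 8-bit ARGB path: 255
  int bits10 = clamp_to(fixed_10_6 >> 4, 1023);  // 10-bit AR30 path: 1021
  printf("8-bit=%d 10-bit=%d\n", bits8, bits10);
  return 0;
}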
-static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32)(y1 + YGB) >> 6); - *g = Clamp((int32)(y1 + YGB) >> 6); - *r = Clamp((int32)(y1 + YGB) >> 6); +static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { + uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; + *b = Clamp((int32_t)(y1 + YGB) >> 6); + *g = Clamp((int32_t)(y1 + YGB) >> 6); + *r = Clamp((int32_t)(y1 + YGB) >> 6); } #undef YG @@ -1288,16 +1477,16 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) { (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. // TODO(fbarchard): Remove subsampling from Neon. -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint8 u = (src_u[0] + src_u[1] + 1) >> 1; - uint8 v = (src_v[0] + src_v[1] + 1) >> 1; + uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; + uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; @@ -1310,22 +1499,22 @@ void I444ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } #else -void I444ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I444ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1336,19 +1525,46 @@ void I444ToARGBRow_C(const uint8* src_y, #endif // Also used for 420 -void I422ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +// 10 bit YUV to ARGB +void I210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1356,26 +1572,93 @@ void I422ToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void I422AlphaToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* rgb_buf, +static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { + uint32_t ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32_t)g << 10) | ((uint32_t)r << 20) | 0xc0000000; + (*(uint32_t*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +// 8 bit YUV to 10 bit AR30 +// Uses same code as 10 bit YUV but shifts the 8 bit values up to 10 bits. +void I422ToAR30Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels.
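StoreAR30 above writes the little-endian 2.10.10.10 AR30 layout, forcing the top two alpha bits on (0xc0000000 means fully opaque). A standalone round trip of that packing, with made-up channel values already clamped to 10 bits:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint32_t b = 1023, g = 512, r = 0;  // pre-clamped 10-bit channels
  uint32_t ar30 = b | (g << 10) | (r << 20) | 0xc0000000u;
  printf("ar30=0x%08x b=%u g=%u r=%u a=%u\n", (unsigned)ar30,
         (unsigned)(ar30 & 0x3ff), (unsigned)((ar30 >> 10) & 0x3ff),
         (unsigned)((ar30 >> 20) & 0x3ff), (unsigned)(ar30 >> 30));
  return 0;
}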
+ } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void I422AlphaToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; @@ -1384,47 +1667,47 @@ void I422AlphaToARGBRow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } -void I422ToRGB24Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 3, rgb_buf + 4, rgb_buf + 5, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. 
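All of the I422 row converters above share one loop shape: two luma samples per chroma sample, with a `width & 1` tail that reads one more luma and reuses the final chroma pair. A standalone trace of how many samples each plane contributes per row width (illustrative only):

#include <stdio.h>

int main(void) {
  for (int width = 1; width <= 5; ++width) {
    int y_read = 0, uv_read = 0;
    int x;
    for (x = 0; x < width - 1; x += 2) {  // same pairing as the row loops
      y_read += 2;
      uv_read += 1;
    }
    if (width & 1) {  // odd tail: one more luma, one more chroma pair
      y_read += 1;
      uv_read += 1;
    }
    printf("width=%d: y=%d uv_pairs=%d\n", width, y_read, uv_read);
  }
  return 0;
}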
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } -void I422ToARGB4444Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1435,8 +1718,8 @@ void I422ToARGB4444Row_C(const uint8* src_y, b1 = b1 >> 4; g1 = g1 >> 4; r1 = r1 >> 4; - *(uint32*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - (b1 << 16) | (g1 << 20) | (r1 << 24) | 0xf000f000; + *(uint32_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | (b1 << 16) | + (g1 << 20) | (r1 << 24) | 0xf000f000; src_y += 2; src_u += 1; src_v += 1; @@ -1447,23 +1730,22 @@ void I422ToARGB4444Row_C(const uint8* src_y, b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; - *(uint16*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | - 0xf000; + *(uint16_t*)(dst_argb4444) = b0 | (g0 << 4) | (r0 << 8) | 0xf000; } } -void I422ToARGB1555Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1474,8 +1756,8 @@ void I422ToARGB1555Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 3; r1 = r1 >> 3; - *(uint32*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - (b1 << 16) | (g1 << 21) | (r1 << 26) | 0x80008000; + *(uint32_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | (b1 << 16) | + (g1 << 21) | (r1 << 26) | 0x80008000; src_y += 2; src_u += 1; src_v += 1; @@ -1486,23 +1768,22 @@ void I422ToARGB1555Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; - *(uint16*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | - 0x8000; + *(uint16_t*)(dst_argb1555) = b0 | (g0 << 5) | (r0 << 10) | 0x8000; } } -void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); @@ -1513,8 +1794,8 @@ void I422ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_u += 1; src_v += 1; @@ -1525,111 +1806,111 @@ void I422ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 
>> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void I411ToARGBRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void NV12ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; - for (x = 0; x < width - 3; x += 4) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; - YuvPixel(src_y[2], src_u[0], src_v[0], - rgb_buf + 8, rgb_buf + 9, rgb_buf + 10, yuvconstants); - rgb_buf[11] = 255; - YuvPixel(src_y[3], src_u[0], src_v[0], - rgb_buf + 12, rgb_buf + 13, rgb_buf + 14, yuvconstants); - rgb_buf[15] = 255; - src_y += 4; - src_u += 1; - src_v += 1; - rgb_buf += 16; // Advance 4 pixels. - } - if (width & 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; + src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void NV12ToARGBRow_C(const uint8* src_y, - const uint8* src_uv, - uint8* rgb_buf, +void NV21ToARGBRow_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; - src_uv += 2; + src_vu += 2; rgb_buf += 8; // Advance 2 pixels. 
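NV12ToARGBRow_C and NV21ToARGBRow_C above are the same loop byte for byte; only the chroma indexing differs, because NV12 interleaves U,V while NV21 interleaves V,U. Illustrative snippet (sample values made up):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t pair[2] = {64, 192};  // one interleaved chroma byte pair
  printf("read as NV12: U=%d V=%d\n", pair[0], pair[1]);
  printf("read as NV21: U=%d V=%d\n", pair[1], pair[0]);
  return 0;
}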
} if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void NV21ToARGBRow_C(const uint8* src_y, - const uint8* src_vu, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { +void NV12ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); - rgb_buf[7] = 255; + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); + src_y += 2; + src_uv += 2; + rgb_buf += 6; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + } +} + +void NV21ToRGB24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_vu += 2; - rgb_buf += 8; // Advance 2 pixels. + rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); - rgb_buf[3] = 255; + YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } -void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_C(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - uint8 b0; - uint8 g0; - uint8 r0; - uint8 b1; - uint8 g1; - uint8 r1; + uint8_t b0; + uint8_t g0; + uint8_t r0; + uint8_t b1; + uint8_t g1; + uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); @@ -1640,8 +1921,8 @@ void NV12ToRGB565Row_C(const uint8* src_y, b1 = b1 >> 3; g1 = g1 >> 2; r1 = r1 >> 3; - *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | - (b1 << 16) | (g1 << 21) | (r1 << 27); + *(uint32_t*)(dst_rgb565) = + b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. 
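NV12ToRGB565Row_C above and the I422 565 variants all reduce 8-bit channels to 5/6/5 bits and pack two pixels into one 32-bit store. A standalone sketch of that packing:

#include <stdint.h>
#include <stdio.h>

static uint16_t pack565(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}

int main(void) {
  uint16_t p0 = pack565(255, 128, 0);
  uint16_t p1 = pack565(0, 255, 255);
  // The row code stores even/odd pixel pairs as one little-endian word.
  uint32_t packed = p0 | ((uint32_t)p1 << 16);
  printf("p0=0x%04x p1=0x%04x pair=0x%08x\n", p0, p1, (unsigned)packed);
  return 0;
}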
@@ -1651,67 +1932,67 @@ void NV12ToRGB565Row_C(const uint8* src_y, b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; - *(uint16*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + *(uint16_t*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); } } -void YUY2ToARGBRow_C(const uint8* src_yuy2, - uint8* rgb_buf, +void YUY2ToARGBRow_C(const uint8_t* src_yuy2, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void UYVYToARGBRow_C(const uint8* src_uyvy, - uint8* rgb_buf, +void UYVYToARGBRow_C(const uint8_t* src_uyvy, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], - rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); + YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], - rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); + YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } -void I422ToRGBARow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* rgb_buf, +void I422ToRGBARow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], - rgb_buf + 5, rgb_buf + 6, rgb_buf + 7, yuvconstants); + YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -1719,13 +2000,13 @@ void I422ToRGBARow_C(const uint8* src_y, rgb_buf += 8; // Advance 2 pixels. 
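YUY2ToARGBRow_C and UYVYToARGBRow_C above decode identical packed 4:2:2 data and differ only in byte order, which is why one indexes [0],[1],[3] and the other [1],[0],[2]. A standalone illustration of the two layouts (values made up):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  // One macropixel, i.e. two output pixels, in each packing.
  const uint8_t yuy2[4] = {110, 64, 120, 192};  // Y0 U Y1 V
  const uint8_t uyvy[4] = {64, 110, 192, 120};  // U Y0 V Y1
  printf("YUY2: Y0=%d U=%d Y1=%d V=%d\n", yuy2[0], yuy2[1], yuy2[2], yuy2[3]);
  printf("UYVY: Y0=%d U=%d Y1=%d V=%d\n", uyvy[1], uyvy[0], uyvy[3], uyvy[2]);
  return 0;
}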
} if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], - rgb_buf + 1, rgb_buf + 2, rgb_buf + 3, yuvconstants); + YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } -void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); @@ -1741,7 +2022,7 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { } } -void MirrorRow_C(const uint8* src, uint8* dst, int width) { +void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; src += width - 1; for (x = 0; x < width - 1; x += 2) { @@ -1754,7 +2035,10 @@ void MirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void MirrorUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; src_uv += (width - 1) << 1; for (x = 0; x < width - 1; x += 2) { @@ -1770,10 +2054,10 @@ void MirrorUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { int x; - const uint32* src32 = (const uint32*)(src); - uint32* dst32 = (uint32*)(dst); + const uint32_t* src32 = (const uint32_t*)(src); + uint32_t* dst32 = (uint32_t*)(dst); src32 += width - 1; for (x = 0; x < width - 1; x += 2) { dst32[x] = src32[0]; @@ -1785,7 +2069,10 @@ void ARGBMirrorRow_C(const uint8* src, uint8* dst, int width) { } } -void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { +void SplitUVRow_C(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_u[x] = src_uv[0]; @@ -1800,7 +2087,9 @@ void SplitUVRow_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { } } -void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_C(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -1816,20 +2105,110 @@ void MergeUVRow_C(const uint8* src_u, const uint8* src_v, uint8* dst_uv, } } -void CopyRow_C(const uint8* src, uint8* dst, int count) { +void SplitRGBRow_C(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_r[x] = src_rgb[0]; + dst_g[x] = src_rgb[1]; + dst_b[x] = src_rgb[2]; + src_rgb += 3; + } +} + +void MergeRGBRow_C(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_rgb[0] = src_r[x]; + dst_rgb[1] = src_g[x]; + dst_rgb[2] = src_b[x]; + dst_rgb += 3; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +void MergeUVRow_16_C(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_uv[0] = src_u[x] * scale; + dst_uv[1] = src_v[x] * scale; + dst_uv[2] = src_u[x + 1] * scale; + dst_uv[3] = src_v[x + 1] * scale; + dst_uv += 4; + } + if (width & 1) { + dst_uv[0] = src_u[width - 1] * scale; + dst_uv[1] = src_v[width - 1] * scale; + } +} + +void MultiplyRow_16_C(const uint16_t* src_y, + 
uint16_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_y[x] * scale; + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_C(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = clamp255((src_y[x] * scale) >> 16); + } +} + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 1024 = 10 bits +void Convert8To16Row_C(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + int x; + scale *= 0x0101; // replicates the byte. + for (x = 0; x < width; ++x) { + dst_y[x] = (src_y[x] * scale) >> 16; + } +} + +void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } -void CopyRow_16_C(const uint16* src, uint16* dst, int count) { +void CopyRow_16_C(const uint16_t* src, uint16_t* dst, int count) { memcpy(dst, src, count * 2); } -void SetRow_C(uint8* dst, uint8 v8, int width) { +void SetRow_C(uint8_t* dst, uint8_t v8, int width) { memset(dst, v8, width); } -void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { - uint32* d = (uint32*)(dst_argb); +void ARGBSetRow_C(uint8_t* dst_argb, uint32_t v32, int width) { + uint32_t* d = (uint32_t*)(dst_argb); int x; for (x = 0; x < width; ++x) { d[x] = v32; @@ -1837,8 +2216,11 @@ void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { } // Filter 2 rows of YUY2 UV's (422) into U and V (420). -void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUVRow_C(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values, filtering 2 rows of YUY2. int x; for (x = 0; x < width; x += 2) { @@ -1851,8 +2233,10 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, } // Copy row of YUY2 UV's (422) into U and V (422). -void YUY2ToUV422Row_C(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +void YUY2ToUV422Row_C(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1865,7 +2249,7 @@ void YUY2ToUV422Row_C(const uint8* src_yuy2, } // Copy row of YUY2 Y's (422) into Y (420/422). -void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { +void YUY2ToYRow_C(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1879,8 +2263,11 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) { } // Filter 2 rows of UYVY UV's (422) into U and V (420). -void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUVRow_C(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. int x; for (x = 0; x < width; x += 2) { @@ -1893,8 +2280,10 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy, } // Copy row of UYVY UV's (422) into U and V (422). -void UYVYToUV422Row_C(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +void UYVYToUV422Row_C(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { // Output a row of UV values. 
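The scale arguments documented above for Convert8To16Row_C and Convert16To8Row_C are just power-of-two gains chosen so that a multiply followed by a 16-bit shift lands in the target range. A standalone 10-bit round trip (the real 16-to-8 row also clamps to 255, omitted here since the value fits):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t y8 = 200;
  // Convert8To16Row_C with scale 1024 (10-bit target): the 0x0101 factor
  // replicates the byte before the >> 16.
  uint16_t y10 = (uint16_t)((y8 * (1024 * 0x0101)) >> 16);
  // Convert16To8Row_C with scale 16384 (10-bit source) goes back down.
  uint8_t back = (uint8_t)((y10 * 16384) >> 16);
  printf("8-bit %d -> 10-bit %d -> 8-bit %d\n", y8, y10, back);
  return 0;
}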
int x; for (x = 0; x < width; x += 2) { @@ -1907,7 +2296,7 @@ void UYVYToUV422Row_C(const uint8* src_uyvy, } // Copy row of UYVY Y's (422) into Y (420/422). -void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { +void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { // Output a row of Y values. int x; for (x = 0; x < width - 1; x += 2) { @@ -1925,17 +2314,19 @@ void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) { // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. // This code mimics the SSSE3 version for better testability. -void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +void ARGBBlendRow_C(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1958,13 +2349,13 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } if (width & 1) { - uint32 fb = src_argb0[0]; - uint32 fg = src_argb0[1]; - uint32 fr = src_argb0[2]; - uint32 a = src_argb0[3]; - uint32 bb = src_argb1[0]; - uint32 bg = src_argb1[1]; - uint32 br = src_argb1[2]; + uint32_t fb = src_argb0[0]; + uint32_t fg = src_argb0[1]; + uint32_t fr = src_argb0[2]; + uint32_t a = src_argb0[3]; + uint32_t bb = src_argb1[0]; + uint32_t bg = src_argb1[1]; + uint32_t br = src_argb1[2]; dst_argb[0] = BLEND(fb, bb, a); dst_argb[1] = BLEND(fg, bg, a); dst_argb[2] = BLEND(fr, br, a); @@ -1973,9 +2364,12 @@ void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, } #undef BLEND -#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8 -void BlendPlaneRow_C(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +void BlendPlaneRow_C(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst[0] = UBLEND(src0[0], src1[0], alpha[0]); @@ -1995,13 +2389,13 @@ void BlendPlaneRow_C(const uint8* src0, const uint8* src1, // Multiply source RGB by alpha and store to destination. // This code mimics the SSSE3 version for better testability. 
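BlendPlaneRow_C's UBLEND above is a plain 8-bit lerp; the +255 bias before the shift makes alpha 255 return the foreground exactly rather than landing 1/256th short. Standalone check of the arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint8_t ublend(int f, int b, int a) {
  return (uint8_t)(((a * f) + ((255 - a) * b) + 255) >> 8);
}

int main(void) {
  printf("a=0:   %d\n", ublend(200, 50, 0));    // 50, background only
  printf("a=128: %d\n", ublend(200, 50, 128));  // 125, about halfway
  printf("a=255: %d\n", ublend(200, 50, 255));  // 200, foreground exactly
  return 0;
}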
-void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { int i; for (i = 0; i < width - 1; i += 2) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - uint32 a = src_argb[3]; + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2019,10 +2413,10 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } if (width & 1) { - const uint32 b = src_argb[0]; - const uint32 g = src_argb[1]; - const uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; + const uint32_t b = src_argb[0]; + const uint32_t g = src_argb[1]; + const uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; dst_argb[0] = ATTENUATE(b, a); dst_argb[1] = ATTENUATE(g, a); dst_argb[2] = ATTENUATE(r, a); @@ -2038,49 +2432,56 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // Reciprocal method is off by 1 on some values. ie 125 // 8.8 fixed point inverse table with 1.0 in upper short and 1 / a in lower. #define T(a) 0x01000000 + (0x10000 / a) -const uint32 fixed_invtbl8[256] = { - 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), - T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), - T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), - T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), - T(0x20), T(0x21), T(0x22), T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), - T(0x28), T(0x29), T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), - T(0x30), T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), - T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), T(0x3f), - T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), T(0x46), T(0x47), - T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), T(0x4d), T(0x4e), T(0x4f), - T(0x50), T(0x51), T(0x52), T(0x53), T(0x54), T(0x55), T(0x56), T(0x57), - T(0x58), T(0x59), T(0x5a), T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), - T(0x60), T(0x61), T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), - T(0x68), T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), - T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), T(0x77), - T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), T(0x7e), T(0x7f), - T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), T(0x85), T(0x86), T(0x87), - T(0x88), T(0x89), T(0x8a), T(0x8b), T(0x8c), T(0x8d), T(0x8e), T(0x8f), - T(0x90), T(0x91), T(0x92), T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), - T(0x98), T(0x99), T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), - T(0xa0), T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), - T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), T(0xaf), - T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), T(0xb6), T(0xb7), - T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), T(0xbd), T(0xbe), T(0xbf), - T(0xc0), T(0xc1), T(0xc2), T(0xc3), T(0xc4), T(0xc5), T(0xc6), T(0xc7), - T(0xc8), T(0xc9), T(0xca), T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), - T(0xd0), T(0xd1), T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), - T(0xd8), T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), - T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), T(0xe7), - T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), T(0xee), T(0xef), - T(0xf0), T(0xf1), 
T(0xf2), T(0xf3), T(0xf4), T(0xf5), T(0xf6), T(0xf7), - T(0xf8), T(0xf9), T(0xfa), T(0xfb), T(0xfc), T(0xfd), T(0xfe), 0x01000100 }; +const uint32_t fixed_invtbl8[256] = { + 0x01000000, 0x0100ffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), + T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), + T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), + T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), + T(0x1c), T(0x1d), T(0x1e), T(0x1f), T(0x20), T(0x21), T(0x22), + T(0x23), T(0x24), T(0x25), T(0x26), T(0x27), T(0x28), T(0x29), + T(0x2a), T(0x2b), T(0x2c), T(0x2d), T(0x2e), T(0x2f), T(0x30), + T(0x31), T(0x32), T(0x33), T(0x34), T(0x35), T(0x36), T(0x37), + T(0x38), T(0x39), T(0x3a), T(0x3b), T(0x3c), T(0x3d), T(0x3e), + T(0x3f), T(0x40), T(0x41), T(0x42), T(0x43), T(0x44), T(0x45), + T(0x46), T(0x47), T(0x48), T(0x49), T(0x4a), T(0x4b), T(0x4c), + T(0x4d), T(0x4e), T(0x4f), T(0x50), T(0x51), T(0x52), T(0x53), + T(0x54), T(0x55), T(0x56), T(0x57), T(0x58), T(0x59), T(0x5a), + T(0x5b), T(0x5c), T(0x5d), T(0x5e), T(0x5f), T(0x60), T(0x61), + T(0x62), T(0x63), T(0x64), T(0x65), T(0x66), T(0x67), T(0x68), + T(0x69), T(0x6a), T(0x6b), T(0x6c), T(0x6d), T(0x6e), T(0x6f), + T(0x70), T(0x71), T(0x72), T(0x73), T(0x74), T(0x75), T(0x76), + T(0x77), T(0x78), T(0x79), T(0x7a), T(0x7b), T(0x7c), T(0x7d), + T(0x7e), T(0x7f), T(0x80), T(0x81), T(0x82), T(0x83), T(0x84), + T(0x85), T(0x86), T(0x87), T(0x88), T(0x89), T(0x8a), T(0x8b), + T(0x8c), T(0x8d), T(0x8e), T(0x8f), T(0x90), T(0x91), T(0x92), + T(0x93), T(0x94), T(0x95), T(0x96), T(0x97), T(0x98), T(0x99), + T(0x9a), T(0x9b), T(0x9c), T(0x9d), T(0x9e), T(0x9f), T(0xa0), + T(0xa1), T(0xa2), T(0xa3), T(0xa4), T(0xa5), T(0xa6), T(0xa7), + T(0xa8), T(0xa9), T(0xaa), T(0xab), T(0xac), T(0xad), T(0xae), + T(0xaf), T(0xb0), T(0xb1), T(0xb2), T(0xb3), T(0xb4), T(0xb5), + T(0xb6), T(0xb7), T(0xb8), T(0xb9), T(0xba), T(0xbb), T(0xbc), + T(0xbd), T(0xbe), T(0xbf), T(0xc0), T(0xc1), T(0xc2), T(0xc3), + T(0xc4), T(0xc5), T(0xc6), T(0xc7), T(0xc8), T(0xc9), T(0xca), + T(0xcb), T(0xcc), T(0xcd), T(0xce), T(0xcf), T(0xd0), T(0xd1), + T(0xd2), T(0xd3), T(0xd4), T(0xd5), T(0xd6), T(0xd7), T(0xd8), + T(0xd9), T(0xda), T(0xdb), T(0xdc), T(0xdd), T(0xde), T(0xdf), + T(0xe0), T(0xe1), T(0xe2), T(0xe3), T(0xe4), T(0xe5), T(0xe6), + T(0xe7), T(0xe8), T(0xe9), T(0xea), T(0xeb), T(0xec), T(0xed), + T(0xee), T(0xef), T(0xf0), T(0xf1), T(0xf2), T(0xf3), T(0xf4), + T(0xf5), T(0xf6), T(0xf7), T(0xf8), T(0xf9), T(0xfa), T(0xfb), + T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T -void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { +void ARGBUnattenuateRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { int i; for (i = 0; i < width; ++i) { - uint32 b = src_argb[0]; - uint32 g = src_argb[1]; - uint32 r = src_argb[2]; - const uint32 a = src_argb[3]; - const uint32 ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point + uint32_t b = src_argb[0]; + uint32_t g = src_argb[1]; + uint32_t r = src_argb[2]; + const uint32_t a = src_argb[3]; + const uint32_t ia = fixed_invtbl8[a] & 0xffff; // 8.8 fixed point b = (b * ia) >> 8; g = (g * ia) >> 8; r = (r * ia) >> 8; @@ -2094,31 +2495,37 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } } -void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { - int32 row_sum[4] = {0, 0, 0, 0}; +void ComputeCumulativeSumRow_C(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + 
int width) { + int32_t row_sum[4] = {0, 0, 0, 0}; int x; for (x = 0; x < width; ++x) { row_sum[0] += row[x * 4 + 0]; row_sum[1] += row[x * 4 + 1]; row_sum[2] += row[x * 4 + 2]; row_sum[3] += row[x * 4 + 3]; - cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; - cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; - cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; - cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; + cumsum[x * 4 + 0] = row_sum[0] + previous_cumsum[x * 4 + 0]; + cumsum[x * 4 + 1] = row_sum[1] + previous_cumsum[x * 4 + 1]; + cumsum[x * 4 + 2] = row_sum[2] + previous_cumsum[x * 4 + 2]; + cumsum[x * 4 + 3] = row_sum[3] + previous_cumsum[x * 4 + 3]; } } -void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, - int w, int area, uint8* dst, int count) { +void CumulativeSumToAverageRow_C(const int32_t* tl, + const int32_t* bl, + int w, + int area, + uint8_t* dst, + int count) { float ooa = 1.0f / area; int i; for (i = 0; i < count; ++i) { - dst[0] = (uint8)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); - dst[1] = (uint8)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); - dst[2] = (uint8)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); - dst[3] = (uint8)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); + dst[0] = (uint8_t)((bl[w + 0] + tl[0] - bl[0] - tl[w + 0]) * ooa); + dst[1] = (uint8_t)((bl[w + 1] + tl[1] - bl[1] - tl[w + 1]) * ooa); + dst[2] = (uint8_t)((bl[w + 2] + tl[2] - bl[2] - tl[w + 2]) * ooa); + dst[3] = (uint8_t)((bl[w + 3] + tl[3] - bl[3] - tl[w + 3]) * ooa); dst += 4; tl += 4; bl += 4; @@ -2127,8 +2534,11 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl, // Copy pixels from rotated source to destination row with a slope. LIBYUV_API -void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +void ARGBAffineRow_C(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { int i; // Render a row of pixels from source into a buffer. float uv[2]; @@ -2137,9 +2547,8 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, for (i = 0; i < width; ++i) { int x = (int)(uv[0]); int y = (int)(uv[1]); - *(uint32*)(dst_argb) = - *(const uint32*)(src_argb + y * src_argb_stride + - x * 4); + *(uint32_t*)(dst_argb) = + *(const uint32_t*)(src_argb + y * src_argb_stride + x * 4); dst_argb += 4; uv[0] += uv_dudv[2]; uv[1] += uv_dudv[3]; @@ -2147,16 +2556,20 @@ void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, } // Blend 2 rows into 1. -static void HalfRow_C(const uint8* src_uv, ptrdiff_t src_uv_stride, - uint8* dst_uv, int width) { +static void HalfRow_C(const uint8_t* src_uv, + ptrdiff_t src_uv_stride, + uint8_t* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; } } -static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, - uint16* dst_uv, int width) { +static void HalfRow_16_C(const uint16_t* src_uv, + ptrdiff_t src_uv_stride, + uint16_t* dst_uv, + int width) { int x; for (x = 0; x < width; ++x) { dst_uv[x] = (src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1; @@ -2164,12 +2577,14 @@ static void HalfRow_16_C(const uint16* src_uv, ptrdiff_t src_uv_stride, } // C version 2x2 -> 2x1. 
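InterpolateRow_C below mixes two rows with an 8-bit vertical fraction, short-circuiting a zero fraction to memcpy. A standalone sketch of the weighted sum; the +128 rounding bias here is an assumption matching recent libyuv, not copied from this patch:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  int f = 64;  // source_y_fraction: 64/256, i.e. 1/4 toward the next row
  uint8_t row0 = 100, row1 = 200;
  uint8_t out = (uint8_t)((row0 * (256 - f) + row1 * f + 128) >> 8);
  printf("blend(100, 200, 64/256) = %d\n", out);  // 125
  return 0;
}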
-void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, +void InterpolateRow_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr1 = src_ptr + src_stride; int x; if (y1_fraction == 0) { memcpy(dst_ptr, src_ptr, width); @@ -2194,12 +2609,14 @@ void InterpolateRow_C(uint8* dst_ptr, const uint8* src_ptr, } } -void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, +void InterpolateRow_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, ptrdiff_t src_stride, - int width, int source_y_fraction) { + int width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint16* src_ptr1 = src_ptr + src_stride; + const uint16_t* src_ptr1 = src_ptr + src_stride; int x; if (source_y_fraction == 0) { memcpy(dst_ptr, src_ptr, width * 2); @@ -2222,8 +2639,10 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Use first 4 shuffler values to reorder ARGB channels. -void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +void ARGBShuffleRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { int index0 = shuffler[0]; int index1 = shuffler[1]; int index2 = shuffler[2]; @@ -2232,10 +2651,10 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, int x; for (x = 0; x < width; ++x) { // To support in-place conversion. - uint8 b = src_argb[index0]; - uint8 g = src_argb[index1]; - uint8 r = src_argb[index2]; - uint8 a = src_argb[index3]; + uint8_t b = src_argb[index0]; + uint8_t g = src_argb[index1]; + uint8_t r = src_argb[index2]; + uint8_t a = src_argb[index3]; dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; @@ -2245,10 +2664,11 @@ void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, } } -void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +void I422ToYUY2Row_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_y[0]; @@ -2268,10 +2688,11 @@ void I422ToYUY2Row_C(const uint8* src_y, } } -void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +void I422ToUYVYRow_C(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { int x; for (x = 0; x < width - 1; x += 2) { dst_frame[0] = src_u[0]; @@ -2291,9 +2712,8 @@ void I422ToUYVYRow_C(const uint8* src_y, } } - -void ARGBPolynomialRow_C(const uint8* src_argb, - uint8* dst_argb, +void ARGBPolynomialRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, const float* poly, int width) { int i; @@ -2323,33 +2743,75 @@ void ARGBPolynomialRow_C(const uint8* src_argb, dr += poly[14] * r3; da += poly[15] * a3; - dst_argb[0] = Clamp((int32)(db)); - dst_argb[1] = Clamp((int32)(dg)); - dst_argb[2] = Clamp((int32)(dr)); - dst_argb[3] = Clamp((int32)(da)); + dst_argb[0] = Clamp((int32_t)(db)); + dst_argb[1] = Clamp((int32_t)(dg)); + dst_argb[2] = Clamp((int32_t)(dr)); + dst_argb[3] = Clamp((int32_t)(da)); src_argb += 4; dst_argb += 4; } } -void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { - uint32 bc = 
lumacoeff & 0xff; - uint32 gc = (lumacoeff >> 8) & 0xff; - uint32 rc = (lumacoeff >> 16) & 0xff; +// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor +// adjusts the source integer range to the half float range desired. + +// This magic constant is 2^-112. Multiplying by this +// is the same as subtracting 112 from the exponent, which +// is the difference in exponent bias between 32-bit and +// 16-bit floats. Once we've done this subtraction, we can +// simply extract the low bits of the exponent and the high +// bits of the mantissa from our float and we're done. + +// Work around GCC 7 punning warning -Wstrict-aliasing +#if defined(__GNUC__) +typedef uint32_t __attribute__((__may_alias__)) uint32_alias_t; +#else +typedef uint32_t uint32_alias_t; +#endif + +void HalfFloatRow_C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + float mult = 1.9259299444e-34f * scale; + for (i = 0; i < width; ++i) { + float value = src[i] * mult; + dst[i] = (uint16_t)((*(const uint32_alias_t*)&value) >> 13); + } +} + +void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + float value = src[i] * scale; + dst[i] = value; + } +} + +void ARGBLumaColorTableRow_C(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { + uint32_t bc = lumacoeff & 0xff; + uint32_t gc = (lumacoeff >> 8) & 0xff; + uint32_t rc = (lumacoeff >> 16) & 0xff; int i; for (i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. - const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; - const uint8* luma1; + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; + const uint8_t* luma1; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; - luma1 = ((src_argb[4] * bc + src_argb[5] * gc + - src_argb[6] * rc) & 0x7F00u) + luma; + luma1 = + ((src_argb[4] * bc + src_argb[5] * gc + src_argb[6] * rc) & 0x7F00u) + + luma; dst_argb[4] = luma1[src_argb[4]]; dst_argb[5] = luma1[src_argb[5]]; dst_argb[6] = luma1[src_argb[6]]; @@ -2359,8 +2821,9 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, if (width & 1) { // Luminance in rows, color values in columns.
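HalfFloatRow_C's multiplier above is exactly 2^-112, the exponent-bias gap between float32 (127) and float16 (15); after that rebias, dropping the low 13 mantissa bits lines the word up with the half-precision layout. A standalone check, assuming IEEE-754 floats and values in half's normal range (no NaN/overflow handling, same as the original), with memcpy standing in for the alias cast:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t to_half(float scaled_input) {
  float value = scaled_input * 1.9259299444e-34f;  // multiply by 2^-112
  uint32_t bits;
  memcpy(&bits, &value, sizeof(bits));  // safe replacement for type punning
  return (uint16_t)(bits >> 13);
}

int main(void) {
  printf("half(1.0f) = 0x%04x\n", to_half(1.0f));  // 0x3c00, half for 1.0
  printf("half(0.5f) = 0x%04x\n", to_half(0.5f));  // 0x3800, half for 0.5
  return 0;
}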
- const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + - src_argb[2] * rc) & 0x7F00u) + luma; + const uint8_t* luma0 = + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0x7F00u) + + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; @@ -2368,7 +2831,7 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, } } -void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[3]; @@ -2381,7 +2844,7 @@ void ARGBCopyAlphaRow_C(const uint8* src, uint8* dst, int width) { } } -void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { +void ARGBExtractAlphaRow_C(const uint8_t* src_argb, uint8_t* dst_a, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst_a[0] = src_argb[3]; @@ -2394,7 +2857,7 @@ void ARGBExtractAlphaRow_C(const uint8* src_argb, uint8* dst_a, int width) { } } -void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { +void ARGBCopyYToAlphaRow_C(const uint8_t* src, uint8_t* dst, int width) { int i; for (i = 0; i < width - 1; i += 2) { dst[3] = src[0]; @@ -2413,13 +2876,13 @@ void ARGBCopyYToAlphaRow_C(const uint8* src, uint8* dst, int width) { #if !(defined(_MSC_VER) && defined(_M_IX86)) && \ defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. -void I422ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2434,14 +2897,14 @@ void I422ToRGB565Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_SSSE3) -void I422ToARGB1555Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2456,14 +2919,14 @@ void I422ToARGB1555Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_SSSE3) -void I422ToARGB4444Row_SSSE3(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2478,13 +2941,13 @@ void I422ToARGB4444Row_SSSE3(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_SSSE3) -void NV12ToRGB565Row_SSSE3(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); @@ -2497,14 +2960,102 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, } #endif +#if defined(HAS_NV12TORGB24ROW_SSSE3) +void NV12ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_SSSE3(src_y, src_uv, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_SSSE3) +void NV21ToRGB24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV21ToARGBRow_SSSE3(src_y, src_vu, row, yuvconstants, twidth); + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV12TORGB24ROW_AVX2) +void NV12ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_uv += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_NV21TORGB24ROW_AVX2) +void NV21ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; + NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth); +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif + src_y += twidth; + src_vu += twidth; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_I422TORGB565ROW_AVX2) -void I422ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2523,14 +3074,14 @@ void I422ToRGB565Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB1555ROW_AVX2) -void I422ToARGB1555Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, +void I422ToARGB1555Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2549,14 +3100,14 @@ void I422ToARGB1555Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TOARGB4444ROW_AVX2) -void I422ToARGB4444Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, +void I422ToARGB4444Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); @@ -2575,19 +3126,22 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, #endif #if defined(HAS_I422TORGB24ROW_AVX2) -void I422ToRGB24Row_AVX2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +void I422ToRGB24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? 
MAXTWIDTH : width; I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth); - // TODO(fbarchard): ARGBToRGB24Row_AVX2 +#if defined(HAS_ARGBTORGB24ROW_AVX2) + ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth); +#else ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); +#endif src_y += twidth; src_u += twidth / 2; src_v += twidth / 2; @@ -2598,13 +3152,13 @@ void I422ToRGB24Row_AVX2(const uint8* src_y, #endif #if defined(HAS_NV12TORGB565ROW_AVX2) -void NV12ToRGB565Row_AVX2(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, +void NV12ToRGB565Row_AVX2(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { // Row buffer for intermediate ARGB pixels. - SIMD_ALIGNED(uint8 row[MAXTWIDTH * 4]); + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth); @@ -2621,6 +3175,62 @@ void NV12ToRGB565Row_AVX2(const uint8* src_y, } #endif +float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { + float fsum = 0.f; + int i; +#if defined(__clang__) +#pragma clang loop vectorize_width(4) +#endif + for (i = 0; i < width; ++i) { + float v = *src++; + fsum += v * v; + *dst++ = v * scale; + } + return fsum; +} + +float ScaleMaxSamples_C(const float* src, float* dst, float scale, int width) { + float fmax = 0.f; + int i; + for (i = 0; i < width; ++i) { + float v = *src++; + float vs = v * scale; + fmax = (v > fmax) ? v : fmax; + *dst++ = vs; + } + return fmax; +} + +void ScaleSamples_C(const float* src, float* dst, float scale, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src++ * scale; + } +} + +void GaussRow_C(const uint32_t* src, uint16_t* dst, int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = + (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8; + ++src; + } +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_C(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + int i; + for (i = 0; i < width; ++i) { + *dst++ = *src0++ + *src1++ * 4 + *src2++ * 6 + *src3++ * 4 + *src4++; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/libs/libvpx/third_party/libyuv/source/row_gcc.cc b/libs/libvpx/third_party/libyuv/source/row_gcc.cc index 1ac7ef1aa39..8d3cb81cec2 100644 --- a/libs/libvpx/third_party/libyuv/source/row_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/row_gcc.cc @@ -1,4 +1,3 @@ -// VERSION 2 /* * Copyright 2011 The LibYuv Project Authors. All rights reserved. * @@ -23,1663 +22,2001 @@ extern "C" { #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB -static vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. 
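For reference, the kARGBToY weights are the BT.601 studio-range luma coefficients (0.098 B, 0.504 G, 0.257 R) scaled by 128, which is why the row kernels shift the pmaddubsw sums right by 7 and then add 16 (kAddY16); the kARGBToYJ table below holds the JPEG full-range equivalents (0.114, 0.587, 0.299 -> 15, 75, 38). A minimal scalar sketch of the same computation, with a hypothetical helper name, assuming ARGB bytes are stored B,G,R,A in memory:

    #include <stdint.h>
    /* Scalar model of the SSSE3/AVX2 Y rows: coefficients are Q7 fixed point,
       truncated by the >> 7 exactly as psrlw does, then offset by 16. */
    static uint8_t ARGBPixelToY(const uint8_t* p) {  /* p[0]=B p[1]=G p[2]=R */
      return (uint8_t)(((13 * p[0] + 65 * p[1] + 33 * p[2]) >> 7) + 16);
    }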
-static vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) -static vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; -static vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -}; +static const vec8 kARGBToV = {-18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0}; -static vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // Constants for BGRA -static vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR -static vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. 
-static vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) #ifdef HAS_RGB24TOARGBROW_SSSE3 // Shuffle table for converting RGB24 to ARGB. -static uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; +static const uvec8 kShuffleMaskRGB24ToARGB = { + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. -static uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. -static uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. -static uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; +static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 -static uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. 
-static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; #endif // HAS_RGB24TOARGBROW_SSSE3 #ifdef HAS_J400TOARGBROW_SSE2 -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm5,%%xmm0 \n" - "por %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" 
- "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRGB24ToARGB) // %3 - : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRGB24ToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 - "pslld $0x18,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" - "lea " MEMLEA(0x30,0) ",%0 \n" - "movdqa %%xmm3,%%xmm2 \n" - "palignr $0x8,%%xmm1,%%xmm2 \n" - "pshufb %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm2 \n" - "palignr $0xc,%%xmm0,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "por %%xmm5,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "palignr $0x4,%%xmm3,%%xmm3 \n" - "pshufb %%xmm4,%%xmm3 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToARGB) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" // 0xff000000 + "pslld $0x18,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm3 \n" + "lea 0x30(%0),%0 \n" + "movdqa %%xmm3,%%xmm2 \n" + "palignr $0x8,%%xmm1,%%xmm2 \n" + "pshufb %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm2 \n" + "palignr $0xc,%%xmm0,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu %%xmm2,0x20(%1) \n" + "por %%xmm5,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "palignr $0x4,%%xmm3,%%xmm3 \n" + "pshufb %%xmm4,%%xmm3 \n" + "movdqu %%xmm1,0x10(%1) \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm3,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToARGB) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "movdqa %3,%%xmm3 \n" - 
"movdqa %4,%%xmm4 \n" - "movdqa %5,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n" - "lea " MEMLEA(0x18,0) ",%0 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskRAWToRGB24_0), // %3 - "m"(kShuffleMaskRAWToRGB24_1), // %4 - "m"(kShuffleMaskRAWToRGB24_2) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + "movdqa %5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x4(%0),%%xmm1 \n" + "movdqu 0x8(%0),%%xmm2 \n" + "lea 0x18(%0),%0 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskRAWToRGB24_0), // %3 + "m"(kShuffleMaskRAWToRGB24_1), // %4 + "m"(kShuffleMaskRAWToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x20802080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0xa,%%xmm4 \n" - "psrlw $0x5,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "pand %%xmm4,%%xmm0 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x20802080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0xa,%%xmm4 \n" + "psrlw $0x5,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "psllw $0x8,%%xmm1 \n" + 
"por %%xmm2,%%xmm1 \n" + "pand %%xmm4,%%xmm0 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0x1080108,%%eax \n" - "movd %%eax,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x42004200,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psllw $0xb,%%xmm3 \n" - "movdqa %%xmm3,%%xmm4 \n" - "psrlw $0x6,%%xmm4 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "psllw $0x8,%%xmm7 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "psllw $0x1,%%xmm1 \n" - "psllw $0xb,%%xmm2 \n" - "pand %%xmm3,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "psllw $0x8,%%xmm1 \n" - "por %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "pmulhuw %%xmm6,%%xmm0 \n" - "pand %%xmm7,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "movdqa %%xmm1,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2) - MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGB1555ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0x1080108,%%eax \n" + "movd %%eax,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x42004200,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psllw $0xb,%%xmm3 \n" + "movdqa %%xmm3,%%xmm4 \n" + "psrlw $0x6,%%xmm4 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" + "psllw $0x8,%%xmm7 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "psllw $0x1,%%xmm1 \n" + "psllw $0xb,%%xmm2 \n" + "pand %%xmm3,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "psllw $0x8,%%xmm1 \n" + "por %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "pmulhuw %%xmm6,%%xmm0 \n" + "pand %%xmm7,%%xmm2 \n" + "por %%xmm2,%%xmm0 \n" + "movdqa %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,0x00(%1,%0,2) \n" + "movdqu %%xmm2,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); } -void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "mov $0xf0f0f0f,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x4,%%xmm5 \n" - "sub %0,%1 \n" - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pand %%xmm4,%%xmm0 \n" - "pand %%xmm5,%%xmm2 \n" - "movdqa 
%%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm3 \n" - "psllw $0x4,%%xmm1 \n" - "psrlw $0x4,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2) - MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2) - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGB4444ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "mov $0xf0f0f0f,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "pslld $0x4,%%xmm5 \n" + "sub %0,%1 \n" + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pand %%xmm4,%%xmm0 \n" + "pand %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm3 \n" + "psllw $0x4,%%xmm1 \n" + "psrlw $0x4,%%xmm3 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm3,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,0x00(%1,%0,2) \n" + "movdqu %%xmm1,0x10(%1,%0,2) \n" + "lea 0x10(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRGB24) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void 
ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) { - asm volatile ( - "movdqa %3,%%xmm6 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x30,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskARGBToRAW) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + "movdqa %3,%%xmm6 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +#ifdef HAS_ARGBTORGB24ROW_AVX2 +// vpermd for 12+12 to 24 +static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7}; + +void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" 
+ "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRGB24), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } +#endif -void ARGBToRGB565DitherRow_SSE2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "movd %3,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm6 \n" - "movdqa %%xmm6,%%xmm7 \n" - "punpcklwd %%xmm6,%%xmm6 \n" - "punpckhwd %%xmm7,%%xmm7 \n" - "pcmpeqb %%xmm3,%%xmm3 \n" - "psrld $0x1b,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1a,%%xmm4 \n" - "pslld $0x5,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0xb,%%xmm5 \n" +#ifdef HAS_ARGBTORGB24ROW_AVX512VBMI +// Shuffle table for converting ARGBToRGB24 +static const ulvec8 kPermARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, + 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, 25u, 26u, 28u, + 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, 40u, 41u}; +static const ulvec8 kPermARGBToRGB24_1 = { + 10u, 12u, 13u, 14u, 16u, 17u, 18u, 20u, 21u, 22u, 24u, + 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, 36u, 37u, 38u, + 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, 50u, 52u}; +static const ulvec8 kPermARGBToRGB24_2 = { + 21u, 22u, 24u, 25u, 26u, 28u, 29u, 30u, 32u, 33u, 34u, + 36u, 37u, 38u, 40u, 41u, 42u, 44u, 45u, 46u, 48u, 49u, + 50u, 52u, 53u, 54u, 56u, 57u, 58u, 60u, 61u, 62u}; + +void ARGBToRGB24Row_AVX512VBMI(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vmovdqa %3,%%ymm5 \n" + "vmovdqa %4,%%ymm6 \n" + "vmovdqa %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpermt2b %%ymm1,%%ymm5,%%ymm0 \n" + "vpermt2b %%ymm2,%%ymm6,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm7,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kPermARGBToRGB24_0), // %3 + "m"(kPermARGBToRGB24_1), // %4 + "m"(kPermARGBToRGB24_2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6", "xmm7"); +} +#endif - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "paddusb %%xmm6,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "pslld $0x8,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x5,%%xmm2 \n" - "psrad $0x10,%%xmm0 \n" - "pand %%xmm3,%%xmm1 \n" - "pand %%xmm4,%%xmm2 \n" - "pand %%xmm5,%%xmm0 \n" - "por %%xmm2,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 
- ); +#ifdef HAS_ARGBTORAWROW_AVX2 +void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm6 \n" + "vmovdqa %4,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0 + "vpshufb %%ymm6,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm3 \n" + "vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes + "vpermd %%ymm1,%%ymm7,%%ymm1 \n" + "vpermd %%ymm2,%%ymm7,%%ymm2 \n" + "vpermd %%ymm3,%%ymm7,%%ymm3 \n" + "vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8 + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16 + "vpermq $0x4f,%%ymm2,%%ymm4 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24 + "vpermq $0x93,%%ymm3,%%ymm3 \n" + "vpor %%ymm3,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,0x40(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskARGBToRAW), // %3 + "m"(kPermdRGB24_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } +#endif -#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToRGB565DitherRow_AVX2(const uint8* src, uint8* dst, - const uint32 dither4, int width) { - asm volatile ( - "vbroadcastss %3,%%xmm6 \n" - "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" - "vpermq $0xd8,%%ymm6,%%ymm6 \n" - "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" - "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" - "vpsrld $0x1b,%%ymm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrld $0x1a,%%ymm4,%%ymm4 \n" - "vpslld $0x5,%%ymm4,%%ymm4 \n" - "vpslld $0xb,%%ymm3,%%ymm5 \n" +void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} - LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" - "vpsrld $0x5,%%ymm0,%%ymm2 \n" - "vpsrld $0x3,%%ymm0,%%ymm1 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - "vpand %%ymm4,%%ymm2,%%ymm2 \n" - "vpand %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpor %%ymm2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(dither4) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBToRGB565DitherRow_SSE2(const uint8_t* src, + uint8_t* dst, + 
const uint32_t dither4, + int width) { + asm volatile( + "movd %3,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm6 \n" + "movdqa %%xmm6,%%xmm7 \n" + "punpcklwd %%xmm6,%%xmm6 \n" + "punpckhwd %%xmm7,%%xmm7 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "psrld $0x1b,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld $0x1a,%%xmm4 \n" + "pslld $0x5,%%xmm4 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0xb,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "paddusb %%xmm6,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "pslld $0x8,%%xmm0 \n" + "psrld $0x3,%%xmm1 \n" + "psrld $0x5,%%xmm2 \n" + "psrad $0x10,%%xmm0 \n" + "pand %%xmm3,%%xmm1 \n" + "pand %%xmm4,%%xmm2 \n" + "pand %%xmm5,%%xmm0 \n" + "por %%xmm2,%%xmm1 \n" + "por %%xmm1,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -#endif // HAS_ARGBTORGB565DITHERROW_AVX2 +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +void ARGBToRGB565DitherRow_AVX2(const uint8_t* src, + uint8_t* dst, + const uint32_t dither4, + int width) { + asm volatile( + "vbroadcastss %3,%%xmm6 \n" + "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n" + "vpermq $0xd8,%%ymm6,%%ymm6 \n" + "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n" + "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n" + "vpsrld $0x1b,%%ymm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrld $0x1a,%%ymm4,%%ymm4 \n" + "vpslld $0x5,%%ymm4,%%ymm4 \n" + "vpslld $0xb,%%ymm3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n" + "vpsrld $0x5,%%ymm0,%%ymm2 \n" + "vpsrld $0x3,%%ymm0,%%ymm1 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + "vpand %%ymm4,%%ymm2,%%ymm2 \n" + "vpand %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpor %%ymm2,%%ymm1,%%ymm1 \n" + "vpor %%ymm1,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(dither4) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 -void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrld $0x1b,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "pslld $0x5,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "pslld $0xa,%%xmm6 \n" - "pcmpeqb %%xmm7,%%xmm7 \n" - "pslld $0xf,%%xmm7 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "psrad $0x10,%%xmm0 \n" - "psrld $0x3,%%xmm1 \n" - "psrld $0x6,%%xmm2 \n" - "psrld $0x9,%%xmm3 \n" - "pand %%xmm7,%%xmm0 \n" - "pand %%xmm4,%%xmm1 \n" - "pand %%xmm5,%%xmm2 \n" - "pand %%xmm6,%%xmm3 \n" - "por %%xmm1,%%xmm0 \n" - "por %%xmm3,%%xmm2 \n" - "por %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - :: "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBToARGB1555Row_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrld 
$0x1b,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "pslld $0x5,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "pslld $0xa,%%xmm6 \n"
+ "pcmpeqb %%xmm7,%%xmm7 \n"
+ "pslld $0xf,%%xmm7 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm3 \n"
+ "psrad $0x10,%%xmm0 \n"
+ "psrld $0x3,%%xmm1 \n"
+ "psrld $0x6,%%xmm2 \n"
+ "psrld $0x9,%%xmm3 \n"
+ "pand %%xmm7,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "pand %%xmm5,%%xmm2 \n"
+ "pand %%xmm6,%%xmm3 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "por %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm0 \n"
+ "packssdw %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
-void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
- asm volatile (
- "pcmpeqb %%xmm4,%%xmm4 \n"
- "psllw $0xc,%%xmm4 \n"
- "movdqa %%xmm4,%%xmm3 \n"
- "psrlw $0x8,%%xmm3 \n"
- LABELALIGN
- "1: \n"
- "movdqu " MEMACCESS(0) ",%%xmm0 \n"
- "movdqa %%xmm0,%%xmm1 \n"
- "pand %%xmm3,%%xmm0 \n"
- "pand %%xmm4,%%xmm1 \n"
- "psrlq $0x4,%%xmm0 \n"
- "psrlq $0x8,%%xmm1 \n"
- "por %%xmm1,%%xmm0 \n"
- "packuswb %%xmm0,%%xmm0 \n"
- "lea " MEMLEA(0x10,0) ",%0 \n"
- "movq %%xmm0," MEMACCESS(1) " \n"
- "lea " MEMLEA(0x8,1) ",%1 \n"
- "sub $0x4,%2 \n"
- "jg 1b \n"
- : "+r"(src), // %0
- "+r"(dst), // %1
- "+r"(width) // %2
- :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
- );
+void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "pcmpeqb %%xmm4,%%xmm4 \n"
+ "psllw $0xc,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm3 \n"
+ "psrlw $0x8,%%xmm3 \n"
+
+ LABELALIGN
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pand %%xmm3,%%xmm0 \n"
+ "pand %%xmm4,%%xmm1 \n"
+ "psrlq $0x4,%%xmm0 \n"
+ "psrlq $0x8,%%xmm1 \n"
+ "por %%xmm1,%%xmm0 \n"
+ "packuswb %%xmm0,%%xmm0 \n"
+ "lea 0x10(%0),%0 \n"
+ "movq %%xmm0,(%1) \n"
+ "lea 0x8(%1),%1 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ ::"memory",
+ "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif // HAS_RGB24TOARGBROW_SSSE3
+/*
+
+ARGBToAR30Row:
+
+Red Blue
+With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
+produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
+wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
+(1024+4)*16 for red.
+
+Alpha Green
+Alpha and Green are already in the high bits, so vpand can zero out the other
+bits, keeping just the 2 upper bits of alpha and the 8 bit green. The same
+multiplier could be used for Green - (1024+4) - putting the 10 bit green in
+the lsb. Alpha needs a simple multiplier to shift it into position, with a gap
+of 10 above the green. Green is 10 bits, so there are 6 bits in the low short.
+4 more are needed, so a multiplier of 4 gets the 2 bits into the upper 16
+bits, and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift
+the result left 10 to position the A and G channels.
+*/
+
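A scalar model of the multiplier trick described above may help (an illustrative sketch, not the patch's code; the helper name is made up). pmulhuw keeps the high 16 bits of an unsigned 16x16 multiply, so with an 8-bit channel value v parked in the top byte of a lane, multiplying by 1028 (1024 + 4) yields (v << 2) | (v >> 6), i.e. the value replicated out to 10 bits (0xFF -> 0x3FF):

    #include <stdint.h>
    /* Hypothetical scalar equivalent of the pmulhuw widening step. */
    static uint16_t WidenTo10(uint8_t v) {
      uint32_t lane = (uint32_t)v << 8;         /* channel byte in the high byte */
      return (uint16_t)((lane * 1028u) >> 16);  /* == (v << 2) + (v >> 6) */
    }

For red, the same product is taken with the upper half of kMulRB10, 1028 * 16, which lands the widened value 4 bits higher so that the 0x3ff003ff mask can keep R10 and B10 in their final dword positions before the alpha/green half is OR'd in.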
+// Shuffle table for isolating the R and B channels into 16 bit lanes for
+// the AR30 pack below.
+static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
+ 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
+
+static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u,
+ 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u};
+
+static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028;
+static const uint32_t kMaskRB10 = 0x3ff003ff;
+static const uint32_t kMaskAG10 = 0xc000ff00;
+static const uint32_t kMulAG10 = 64 * 65536 + 1028;
+
+void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multiplier for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multiplier for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "movdqa %3,%%xmm2 \n" // shuffler for RB
+ "movd %4,%%xmm3 \n" // multiplier for RB
+ "movd %5,%%xmm4 \n" // mask for R10 B10
+ "movd %6,%%xmm5 \n" // mask for AG
+ "movd %7,%%xmm6 \n" // multiplier for AG
+ "pshufd $0x0,%%xmm3,%%xmm3 \n"
+ "pshufd $0x0,%%xmm4,%%xmm4 \n"
+ "pshufd $0x0,%%xmm5,%%xmm5 \n"
+ "pshufd $0x0,%%xmm6,%%xmm6 \n"
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels
+ "movdqa %%xmm0,%%xmm1 \n"
+ "pshufb %%xmm2,%%xmm1 \n" // R0B0
+ "pand %%xmm5,%%xmm0 \n" // A0G0
+ "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10
+ "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10
+ "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10
+ "pslld $10,%%xmm0 \n" // A2 x10 G10 x10
+ "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10
+ "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels
+ "add $0x10,%0 \n"
+ "sub $0x4,%2 \n"
+ "jg 1b \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+
+#ifdef HAS_ARGBTOAR30ROW_AVX2
+void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multiplier for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multiplier for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleRB30), // %3
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
+#ifdef HAS_ABGRTOAR30ROW_AVX2
+void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
+ asm volatile(
+ "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB
+ "vbroadcastss %4,%%ymm3 \n" // multiplier for RB
+ "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10
+ "vbroadcastss %6,%%ymm5 \n" // mask for AG
+ "vbroadcastss %7,%%ymm6 \n" // multiplier for AG
+ "sub %0,%1 \n"
+
+ "1: \n"
+ "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels
+ "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0
+ "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0
+ "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10
+ "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10
+ "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10
+ "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10
+ "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10
+ "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels
+ "add $0x20,%0 \n"
+ "sub $0x8,%2 \n"
+ "jg 1b \n"
+ "vzeroupper \n"
+
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "m"(kShuffleBR30), // %3 reversed shuffler
+ "m"(kMulRB10), // %4
+ "m"(kMaskRB10), // %5
+ "m"(kMaskAG10), // %6
+ "m"(kMulAG10) // %7
+ : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#endif
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
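The packed arithmetic in the Y rows below is compact, so a per-lane sketch of the dataflow may help (descriptive only, using the kARGBToY weights from above):

    /* pmaddubsw (u8 x s8): lane0 = 13*B0 + 65*G0, lane1 = 33*R0 + 0*A0
       phaddw:    lane0 + lane1 = 13*B0 + 65*G0 + 33*R0  (max 28305, fits s16)
       psrlw $7:  divide by 128, the Q7 scale of the coefficients
       packuswb + paddb kAddY16: saturate to bytes and add the +16 Y offset. */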
-void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_ARGBTOYJROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. 
-void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBTOYJROW_SSSE3 #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
- "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToY), // %3 - "m"(kAddY16), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYROW_AVX2 #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vbroadcastf128 %4,%%ymm5 \n" - "vmovdqu %5,%%ymm6 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "lea " MEMLEA(0x80,0) ",%0 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. - "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" - "vpsrlw $0x7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x7,%%ymm2,%%ymm2 \n" - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64), // %4 - "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding. 
+ "vpaddw %%ymm5,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBTOYJROW_AVX2 #ifdef HAS_ARGBTOUVROW_SSSE3 -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToV), // %5 - "m"(kARGBToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); +void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 
\n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToV), // %5 + "m"(kARGBToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_AVX2 // vpshufb for vphaddw + vpackuswb packed to shorts. static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUV128), // %5 - "m"(kARGBToV), // %6 - "m"(kARGBToU), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; +void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 
\n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUV128), // %5 + "m"(kARGBToV), // %6 + "m"(kARGBToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vbroadcastf128 %5,%%ymm5 \n" - "vbroadcastf128 %6,%%ymm6 \n" - "vbroadcastf128 %7,%%ymm7 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" - "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) - VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) - "lea " MEMLEA(0x80,0) ",%0 \n" - "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" - "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" - "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" - "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" - "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" - "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" - - "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" - "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" - "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" - "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm1,%%ymm1 \n" - "vpsraw $0x8,%%ymm0,%%ymm0 \n" - "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %8,%%ymm0,%%ymm0 \n" - - "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kAddUVJ128), // %5 - "m"(kARGBToVJ), // %6 - "m"(kARGBToUJ), // %7 - "m"(kShufARGBToUV_AVX) // %8 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps 
$0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kAddUVJ128), // %5 + "m"(kARGBToVJ), // %6 + "m"(kARGBToUJ), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBTOUVJROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "paddw %%xmm5,%%xmm0 \n" - "paddw %%xmm5,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kARGBToVJ), // %5 - "m"(kARGBToUJ), // %6 - "m"(kAddUVJ128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 
\n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_argb)), // %4 + "m"(kARGBToVJ), // %5 + "m"(kARGBToUJ), // %6 + "m"(kAddUVJ128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3 -void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "movdqa %4,%%xmm3 \n" - "movdqa %5,%%xmm4 \n" - "movdqa %6,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm6,%%xmm2 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm2 \n" - "packsswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "m"(kARGBToV), // %4 - "m"(kARGBToU), // %5 - "m"(kAddUV128) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6" - ); + asm volatile( + "movdqa %4,%%xmm3 \n" + "movdqa %5,%%xmm4 \n" + "movdqa %6,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 
0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm6,%%xmm2 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm2 \n" + "packsswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "lea 0x40(%0),%0 \n" + "movdqu %%xmm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "m"(kARGBToV), // %4 + "m"(kARGBToU), // %5 + "m"(kAddUV128) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6"); } #endif // HAS_ARGBTOUV444ROW_SSSE3 -void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kBGRAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 
\n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_bgra0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_bgra)), // %4 - "m"(kBGRAToV), // %5 - "m"(kBGRAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_bgra)), // %4 + "m"(kBGRAToV), // %5 + "m"(kBGRAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kABGRToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 
0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movdqa %4,%%xmm5 \n" - "movdqa %3,%%xmm4 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "phaddw %%xmm1,%%xmm0 \n" - "phaddw %%xmm3,%%xmm2 \n" - "psrlw $0x7,%%xmm0 \n" - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "m"(kRGBAToY), // %3 - "m"(kAddY16) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kRGBAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa 
%%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_abgr0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_abgr)), // %4 - "m"(kABGRToV), // %5 - "m"(kABGRToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kABGRToV), // %5 + "m"(kABGRToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } -void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "movdqa %5,%%xmm3 \n" - "movdqa %6,%%xmm4 \n" - "movdqa %7,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - - "lea " MEMLEA(0x40,0) ",%0 \n" - "movdqa %%xmm0,%%xmm7 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm7 \n" - "pavgb %%xmm7,%%xmm0 \n" - "movdqa %%xmm2,%%xmm7 \n" - "shufps $0x88,%%xmm6,%%xmm2 \n" - "shufps $0xdd,%%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw 
%%xmm3,%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "phaddw %%xmm2,%%xmm0 \n" - "phaddw %%xmm6,%%xmm1 \n" - "psraw $0x8,%%xmm0 \n" - "psraw $0x8,%%xmm1 \n" - "packsswb %%xmm1,%%xmm0 \n" - "paddb %%xmm5,%%xmm0 \n" - "movlps %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_rgba0), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+rm"(width) // %3 - : "r"((intptr_t)(src_stride_rgba)), // %4 - "m"(kRGBAToV), // %5 - "m"(kRGBAToU), // %6 - "m"(kAddUV128) // %7 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" - ); +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movdqa %5,%%xmm3 \n" + "movdqa %6,%%xmm4 \n" + "movdqa %7,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_rgba0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_rgba)), // %4 + "m"(kRGBAToV), // %5 + "m"(kRGBAToU), // %6 + "m"(kAddUV128) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) // Read 8 UV from 444 -#define READYUV444 \ - "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV444 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV -#define READYUV422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUV422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + 
"movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" + +// Read 4 UV from 422 10 bit, upsample to 8 UV +// TODO(fbarchard): Consider shufb to replace pack/unpack +// TODO(fbarchard): Consider pmulhuw to replace psraw +// TODO(fbarchard): Consider pmullw to replace psllw and allow different bits. +#define READYUV210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $0x2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ - "movq " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "lea " MEMLEA(0x8, [a_buf]) ",%[a_buf] \n" - -// Read 2 UV from 411, upsample to 8 UV. -// reading 4 bytes is an msan violation. -// "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" -// MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) -// pinsrw fails with drmemory -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_TEMP \ - "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ - "movd %[temp],%%xmm0 \n" \ - MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ - "movd %[temp],%%xmm1 \n" \ - "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "punpckldq %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READYUVA422 \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "movq (%[a_buf]),%%xmm5 \n" \ + "lea 0x8(%[a_buf]),%[a_buf] \n" // Read 4 UV from NV12, upsample to 8 UV -#define READNV12 \ - "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ - "punpcklwd %%xmm0,%%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 VU from NV21, upsample to 8 UV -#define READNV21 \ - "movq " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x8, [vu_buf]) ",%[vu_buf] \n" \ - "pshufb %[kShuffleNV21], %%xmm0 \n" \ - "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "punpcklbw %%xmm4,%%xmm4 \n" \ - "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +#define READNV21 \ + "movq (%[vu_buf]),%%xmm0 \n" \ + "lea 0x8(%[vu_buf]),%[vu_buf] \n" \ + "pshufb %[kShuffleNV21], %%xmm0 \n" \ + "movq (%[y_buf]),%%xmm4 \n" \ + "punpcklbw %%xmm4,%%xmm4 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. 
-#define READYUY2 \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ - "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2 \ + "movdqu (%[yuy2_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu (%[yuy2_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea 0x10(%[yuy2_buf]),%[yuy2_buf] \n" // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. -#define READUYVY \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ - "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ - "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ - "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ - "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY \ + "movdqu (%[uyvy_buf]),%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu (%[uyvy_buf]),%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP(yuvconstants) \ - "movdqa " MEMACCESS([yuvconstants]) ",%%xmm8 \n" \ - "movdqa " MEMACCESS2(32, [yuvconstants]) ",%%xmm9 \n" \ - "movdqa " MEMACCESS2(64, [yuvconstants]) ",%%xmm10 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm11 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm12 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm13 \n" \ - "movdqa " MEMACCESS2(192, [yuvconstants]) ",%%xmm14 \n" +#define YUVTORGB_SETUP(yuvconstants) \ + "movdqa (%[yuvconstants]),%%xmm8 \n" \ + "movdqa 32(%[yuvconstants]),%%xmm9 \n" \ + "movdqa 64(%[yuvconstants]),%%xmm10 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm11 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm12 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ + "movdqa 192(%[yuvconstants]),%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa %%xmm11,%%xmm0 \n" \ - "pmaddubsw %%xmm8,%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm12,%%xmm1 \n" \ - "pmaddubsw %%xmm9,%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa %%xmm13,%%xmm2 \n" \ - "pmaddubsw %%xmm10,%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw %%xmm14,%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa %%xmm11,%%xmm0 \n" \ + "pmaddubsw %%xmm8,%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa %%xmm12,%%xmm1 \n" \ + "pmaddubsw %%xmm9,%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa %%xmm13,%%xmm2 \n" \ + "pmaddubsw %%xmm10,%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw %%xmm14,%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa " MEMACCESS2(96, [yuvconstants]) ",%%xmm0 \n" \ - "pmaddubsw " MEMACCESS([yuvconstants]) ",%%xmm1 \n" \ - "psubw 
%%xmm1,%%xmm0 \n" \ - "movdqa " MEMACCESS2(128, [yuvconstants]) ",%%xmm1 \n" \ - "pmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ - "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS #endif +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + // Store 8 ARGB values. -#define STOREARGB \ - "punpcklbw %%xmm1,%%xmm0 \n" \ - "punpcklbw %%xmm5,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "punpcklwd %%xmm2,%%xmm0 \n" \ - "punpckhwd %%xmm2,%%xmm1 \n" \ - "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \ - "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \ - "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + "punpcklbw %%xmm5,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "movdqu %%xmm0,(%[dst_argb]) \n" \ + "movdqu %%xmm1,0x10(%[dst_argb]) \n" \ + "lea 0x20(%[dst_argb]), %[dst_argb] \n" // Store 8 RGBA values. -#define STORERGBA \ - "pcmpeqb %%xmm5,%%xmm5 \n" \ - "punpcklbw %%xmm2,%%xmm1 \n" \ - "punpcklbw %%xmm0,%%xmm5 \n" \ - "movdqa %%xmm5,%%xmm0 \n" \ - "punpcklwd %%xmm1,%%xmm5 \n" \ - "punpckhwd %%xmm1,%%xmm0 \n" \ - "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ - "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ - "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" - -void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +#define STORERGBA \ + "pcmpeqb %%xmm5,%%xmm5 \n" \ + "punpcklbw %%xmm2,%%xmm1 \n" \ + "punpcklbw %%xmm0,%%xmm5 \n" \ + "movdqa %%xmm5,%%xmm0 \n" \ + "punpcklwd %%xmm1,%%xmm5 \n" \ + "punpckhwd %%xmm1,%%xmm0 \n" \ + "movdqu %%xmm5,(%[dst_rgba]) \n" \ + "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ + "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" + +// Store 8 AR30 values. 
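// --- Reference sketch (annotation, not part of the patch) ----------------
// STOREAR30 below consumes the 16-bit words left by YUVTORGB16, which
// still carry 6 fractional bits (YUVTORGB finishes them with psraw $6 to
// 8 bits); psraw $4 instead yields 10-bit channels, clamped to [0, 1023]
// by pminsw/pmaxsw, and the unpack/shift/or sequence assembles the
// 2:10:10:10 dword. Scalar equivalent of the final packing, with an
// illustrative helper name:
#include <stdint.h>

static inline uint32_t PackAR30(uint16_t b10, uint16_t g10, uint16_t r10) {
  // B in bits 0-9, G in 10-19, R in 20-29, 2-bit opaque alpha on top.
  return (3u << 30) | ((uint32_t)(r10 & 0x3ff) << 20) |
         ((uint32_t)(g10 & 0x3ff) << 10) | (uint32_t)(b10 & 0x3ff);
}
// --------------------------------------------------------------------------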
+#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + +void OMITFP I444ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444 YUVTORGB(yuvconstants) STOREARGB @@ -1691,15 +2028,15 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, +void OMITFP I422ToRGB24Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -1707,8 +2044,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) "punpcklbw %%xmm1,%%xmm0 \n" @@ -1719,16 +2057,16 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" - "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" - "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] @@ -1736,23 +2074,24 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } -void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" 
+ "1: \n" READYUV422 YUVTORGB(yuvconstants) STOREARGB @@ -1764,92 +2103,160 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#ifdef HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +void OMITFP I422ToAR30Row_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + LABELALIGN - "1: \n" - READYUVA422 - YUVTORGB(yuvconstants) - STOREARGB - "subl $0x8,%[width] \n" + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] [v_buf]"+r"(v_buf), // %[v_buf] - [a_buf]"+r"(a_buf), // %[a_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] [width]"+rm"(width) // %[width] -#endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" ); } -#endif // HAS_I422ALPHATOARGBROW_SSSE3 -#ifdef HAS_I411TOARGBROW_SSSE3 -void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +// 10 bit YUV to ARGB +void OMITFP I210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - int temp; asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" - READYUV411_TEMP + "1: \n" + READYUV210 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : [y_buf]"+r"(y_buf), // %[y_buf] - [u_buf]"+r"(u_buf), // %[u_buf] - [v_buf]"+r"(v_buf), // %[v_buf] - [dst_argb]"+r"(dst_argb), // %[dst_argb] - [temp]"=&r"(temp), // %[temp] -#if defined(__i386__) && defined(__pic__) - [width]"+m"(width) // %[width] -#else - [width]"+rm"(width) // %[width] -#endif + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } + +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + 
"psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#ifdef HAS_I422ALPHATOARGBROW_SSSE3 +void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA422 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [a_buf]"+r"(a_buf), // %[a_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] #endif + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_I422ALPHATOARGBROW_SSSE3 -void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB @@ -1860,21 +2267,24 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB @@ -1886,20 +2296,23 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. 
+ : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB @@ -1911,20 +2324,23 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP(yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB @@ -1936,23 +2352,25 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS // Does not use r14. + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } -void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, +void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP(yuvconstants) "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422 YUVTORGB(yuvconstants) STORERGBA @@ -1964,7 +2382,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1972,179 +2390,211 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 // Read 16 UV from 444 -#define READYUV444_AVX2 \ - "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV444_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. 
-#define READYUV422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUV422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 210 10 bit, upsample to 16 UV +// TODO(fbarchard): Consider vshufb to replace pack/unpack +// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. +#define READYUV210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 \ - "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ - "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ - "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ - "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 \ - "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ - MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ - "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READYUVA422_AVX2 \ + "vmovq (%[u_buf]),%%xmm0 \n" \ + "vmovq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%xmm5 \n" \ + "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" // Read 8 UV from NV12, upsample to 16 UV. 
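The READYUV210 macros above normalize 10-bit input before reusing the 8-bit YUV-to-RGB math: UV is shifted right by 2 down to 8 bits (vpsraw $0x2 followed by a saturating pack), while Y is shifted left by 6 up to the full 16-bit range expected by vpmulhuw. A scalar sketch of that pre-scaling, assuming in-range 10-bit samples (Scale10BitYUV is a hypothetical name, for illustration only):

static inline void Scale10BitYUV(uint16_t y10, uint16_t u10, uint16_t v10,
                                 uint16_t* y16, uint8_t* u8, uint8_t* v8) {
  *u8 = (uint8_t)(u10 >> 2);    /* 10-bit UV -> 8-bit, as vpsraw $0x2   */
  *v8 = (uint8_t)(v10 >> 2);    /* (vpackuswb saturation omitted here)  */
  *y16 = (uint16_t)(y10 << 6);  /* 10-bit Y -> 16-bit, as vpsllw $0x6   */
}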
-#define READNV12_AVX2 \ - "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV12_AVX2 \ + "vmovdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 VU from NV21, upsample to 16 UV. -#define READNV21_AVX2 \ - "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \ - "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ - "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ - "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ - "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ - "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" +#define READNV21_AVX2 \ + "vmovdqu (%[vu_buf]),%%xmm0 \n" \ + "lea 0x10(%[vu_buf]),%[vu_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%xmm4 \n" \ + "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ + "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n" +#define READYUY2_AVX2 \ + "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[yuy2_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[yuy2_buf]),%[yuy2_buf] \n" // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. 
-#define READUYVY_AVX2 \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \ - "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ - "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \ - "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ - "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n" +#define READUYVY_AVX2 \ + "vmovdqu (%[uyvy_buf]),%%ymm4 \n" \ + "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \ + "vmovdqu (%[uyvy_buf]),%%ymm0 \n" \ + "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \ + "lea 0x20(%[uyvy_buf]),%[uyvy_buf] \n" #if defined(__x86_64__) -#define YUVTORGB_SETUP_AVX2(yuvconstants) \ - "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \ - "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \ - "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \ - "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \ - "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \ - "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \ - "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n" -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ - "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ - "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ - "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ - "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB_SETUP_AVX2(yuvconstants) \ + "vmovdqa (%[yuvconstants]),%%ymm8 \n" \ + "vmovdqa 32(%[yuvconstants]),%%ymm9 \n" \ + "vmovdqa 64(%[yuvconstants]),%%ymm10 \n" \ + "vmovdqa 96(%[yuvconstants]),%%ymm11 \n" \ + "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" \ + "vmovdqa 160(%[yuvconstants]),%%ymm13 \n" \ + "vmovdqa 192(%[yuvconstants]),%%ymm14 \n" + +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \ + "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \ + "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \ + "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \ + "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \ + "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \ + "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" + #define YUVTORGB_REGS_AVX2 \ - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", + #else // Convert 16 pixels: 16 UV and 16 Y. 
+ #define YUVTORGB_SETUP_AVX2(yuvconstants) -#define YUVTORGB_AVX2(yuvconstants) \ - "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ - "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ - "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ - "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ - "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ - "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ - "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \ - "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ - "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ - "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ - "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ - "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ - "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" +#define YUVTORGB16_AVX2(yuvconstants) \ + "vpmaddubsw 64(%[yuvconstants]),%%ymm0,%%ymm2 \n" \ + "vpmaddubsw 32(%[yuvconstants]),%%ymm0,%%ymm1 \n" \ + "vpmaddubsw (%[yuvconstants]),%%ymm0,%%ymm0 \n" \ + "vmovdqu 160(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ + "vmovdqu 128(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu 96(%[yuvconstants]),%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ + "vpmulhuw 192(%[yuvconstants]),%%ymm4,%%ymm4 \n" \ + "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \ + "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ + "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB_REGS_AVX2 #endif +#define YUVTORGB_AVX2(yuvconstants) \ + YUVTORGB16_AVX2(yuvconstants) \ + "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" + // Store 16 ARGB values. -#define STOREARGB_AVX2 \ - "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ - "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ - "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ - "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ - "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ - "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \ - "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \ - "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n" +#define STOREARGB_AVX2 \ + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vmovdqu %%ymm1,(%[dst_argb]) \n" \ + "vmovdqu %%ymm0,0x20(%[dst_argb]) \n" \ + "lea 0x40(%[dst_argb]), %[dst_argb] \n" + +// Store 16 AR30 values. 
+#define STOREAR30_AVX2 \ + "vpsraw $0x4,%%ymm0,%%ymm0 \n" \ + "vpsraw $0x4,%%ymm1,%%ymm1 \n" \ + "vpsraw $0x4,%%ymm2,%%ymm2 \n" \ + "vpminsw %%ymm7,%%ymm0,%%ymm0 \n" \ + "vpminsw %%ymm7,%%ymm1,%%ymm1 \n" \ + "vpminsw %%ymm7,%%ymm2,%%ymm2 \n" \ + "vpmaxsw %%ymm6,%%ymm0,%%ymm0 \n" \ + "vpmaxsw %%ymm6,%%ymm1,%%ymm1 \n" \ + "vpmaxsw %%ymm6,%%ymm2,%%ymm2 \n" \ + "vpsllw $0x4,%%ymm2,%%ymm2 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpermq $0xd8,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm3 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpunpckhwd %%ymm5,%%ymm1,%%ymm2 \n" \ + "vpunpcklwd %%ymm5,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm1,%%ymm1 \n" \ + "vpslld $0xa,%%ymm2,%%ymm2 \n" \ + "vpor %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpor %%ymm2,%%ymm3,%%ymm3 \n" \ + "vmovdqu %%ymm0,(%[dst_ar30]) \n" \ + "vmovdqu %%ymm3,0x20(%[dst_ar30]) \n" \ + "lea 0x40(%[dst_ar30]), %[dst_ar30] \n" #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I444ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV444_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2157,32 +2607,34 @@ void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 +#if defined(HAS_I422TOARGBROW_AVX2) // 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I422ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" - READYUV411_AVX2 + "1: \n" + READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2190,32 +2642,74 @@ void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#endif // HAS_I411TOARGBROW_AVX2 +#endif // HAS_I422TOARGBROW_AVX2 -#if defined(HAS_I422TOARGBROW_AVX2) +#if defined(HAS_I422TOAR30ROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
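STOREAR30 and STOREAR30_AVX2 shift the 16-bit channels down to 10 bits, clamp them to [0, 1023] with the min/max constants prepared in xmm6/xmm7 (ymm6/ymm7), and pack them together with a constant 2-bit opaque alpha; per pixel the layout works out to A:R:G:B = 2:10:10:10 with blue in the low bits. A scalar equivalent for one pixel (illustrative sketch; PackAR30 is a hypothetical helper and its inputs are assumed pre-clamped to [0, 1023]):

static inline uint32_t PackAR30(uint32_t b, uint32_t g, uint32_t r) {
  return 0xC0000000u  /* 2 alpha bits, fully opaque (bits 30..31) */
         | (r << 20)  /* 10-bit red                               */
         | (g << 10)  /* 10-bit green                             */
         | b;         /* 10-bit blue                              */
}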
+void OMITFP I422ToAR30Row_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV422_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_I422TOAR30ROW_AVX2 + +#if defined(HAS_I210TOARGBROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" - READYUV422_AVX2 + "1: \n" + READYUV210_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 "sub $0x10,%[width] \n" "jg 1b \n" + "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2223,27 +2717,69 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } -#endif // HAS_I422TOARGBROW_AVX2 +#endif // HAS_I210TOARGBROW_AVX2 -#if defined(HAS_I422ALPHATOARGBROW_AVX2) +#if defined(HAS_I210TOAR30ROW_AVX2) // 16 pixels -// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 AR30 (64 bytes). 
+void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOAR30ROW_AVX2 + +#if defined(HAS_I422ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. +void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + LABELALIGN - "1: \n" + "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2255,33 +2791,35 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf, [v_buf]"+r"(v_buf), // %[v_buf] [a_buf]"+r"(a_buf), // %[a_buf] [dst_argb]"+r"(dst_argb), // %[dst_argb] -#if defined(__i386__) && defined(__pic__) +#if defined(__i386__) [width]"+m"(width) // %[width] #else [width]"+rm"(width) // %[width] #endif : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_I422ALPHATOARGBROW_AVX2 #if defined(HAS_I422TORGBAROW_AVX2) // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void OMITFP I422ToRGBARow_AVX2(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUV422_AVX2 YUVTORGB_AVX2(yuvconstants) @@ -2292,11 +2830,11 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, "vpermq $0xd8,%%ymm2,%%ymm2 \n" "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n" "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n" - "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n" - "sub $0x10,%[width] \n" - "jg 1b \n" + "vmovdqu %%ymm0,(%[dst_argb]) \n" + "vmovdqu %%ymm1,0x20(%[dst_argb]) \n" + "lea 0x40(%[dst_argb]),%[dst_argb] \n" + "sub $0x10,%[width] \n" + "jg 1b \n" "vzeroupper \n" : [y_buf]"+r"(y_buf), // %[y_buf] [u_buf]"+r"(u_buf), // %[u_buf] @@ -2304,7 +2842,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -2313,16 +2851,18 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, #if defined(HAS_NV12TOARGBROW_AVX2) // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, +void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV12_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2334,25 +2874,28 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf, [dst_argb]"+r"(dst_argb), // %[dst_argb] [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV12TOARGBROW_AVX2 #if defined(HAS_NV21TOARGBROW_AVX2) // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, +void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READNV21_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2365,24 +2908,27 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleNV21]"m"(kShuffleNV21) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_NV21TOARGBROW_AVX2 #if defined(HAS_YUY2TOARGBROW_AVX2) // 16 pixels. 
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, - uint8* dst_argb, +void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2395,24 +2941,27 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleYUY2Y]"m"(kShuffleYUY2Y), [kShuffleYUY2UV]"m"(kShuffleYUY2UV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_YUY2TOARGBROW_AVX2 #if defined(HAS_UYVYTOARGBROW_AVX2) // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, - uint8* dst_argb, +void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { + // clang-format off asm volatile ( YUVTORGB_SETUP_AVX2(yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + LABELALIGN - "1: \n" + "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -2425,1131 +2974,1603 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf, : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] [kShuffleUYVYY]"m"(kShuffleUYVYY), [kShuffleUYVYUV]"m"(kShuffleUYVYUV) - : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14. + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); + // clang-format on } #endif // HAS_UYVYTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - - // Step 2: Weave into ARGB - "punpcklbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm0,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "por %%xmm4,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); +void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 + "movd %%eax,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "movd %%eax,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "psubusw %%xmm3,%%xmm0 \n" + "psrlw $6, %%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + + // Step 2: Weave into ARGB + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "por %%xmm4,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) { - asm volatile ( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpslld $0x18,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 - "vmovdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" - "vpor %%ymm4,%%ymm0,%%ymm0 \n" - "vpor %%ymm4,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); +void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { + asm volatile( + "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + // 16 + "vmovd %%eax,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpslld $0x18,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + "vmovdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n" + "vpor %%ymm4,%%ymm0,%%ymm0 \n" + "vpor %%ymm4,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. 
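kShuffleMirror below simply lists the byte indices 15..0, so a single pshufb (or vpshufb plus a lane swap via vpermq in the AVX2 version) reverses a whole register; the loop then reads 16 bytes from the tail of the source row and writes them forward into the destination. The same step in scalar form (illustrative only; Mirror16 is a hypothetical name):

static inline void Mirror16(const uint8_t* src_last16, uint8_t* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = src_last16[15 - i];  /* descending indices, as in the table */
  }
}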
-static uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %3,%%xmm5 \n" - LABELALIGN - "1: \n" - MEMOPREG(movdqu,-0x10,0,2,1,xmm0) // movdqu -0x10(%0,%2),%%xmm0 - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "movdqa %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vbroadcastf128 %3,%%ymm5 \n" - LABELALIGN - "1: \n" - MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0) // vmovdqu -0x20(%0,%2),%%ymm0 - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirror) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
-static uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "movdqa %4,%%xmm1 \n" - "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorUV) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); + asm volatile( + "movdqa %4,%%xmm1 \n" + "lea -0x10(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $8,%3 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea " MEMLEA(-0x10,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : - : "memory", "cc" - , "xmm0" - ); + asm volatile( + + "lea -0x10(%0,%2,4),%0 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBMIRRORROW_SSE2 #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { intptr_t temp_width = (intptr_t)(width); - asm volatile ( - "vmovdqu %3,%%ymm5 \n" - LABELALIGN - "1: \n" - VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0 - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(temp_width) // %2 - : "m"(kARGBShuffleMirror_AVX2) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm5" - ); + asm volatile( + + "vmovdqu %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kARGBShuffleMirror_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm5"); } #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_AVX2 -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm2 \n" - "vpsrlw $0x8,%%ymm1,%%ymm3 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1) // vmovdqu %%ymm2,(%1,%2) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm2 \n" + "vpsrlw $0x8,%%ymm1,%%ymm3 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - 
"packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,0,1,1,ymm1) // vmovdqu (%0,%1,1),%%ymm1 - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" - "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n" - "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n" - "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n" - "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "sub %0,%1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2" - ); + asm volatile( + + "sub %0,%1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 
\n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_MERGEUVROW_SSE2 -#ifdef HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MERGEUVROW_16_AVX2 +void MergeUVRow_16_AVX2(const uint16_t* src_u, + const uint16_t* src_v, + uint16_t* dst_uv, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %4,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu (%0,%1,1),%%ymm1 \n" + "add $0x20,%0 \n" + + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n" // mutates + "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm2,(%2) \n" + "vextractf128 $0x0,%%ymm0,0x10(%2) \n" + "vextractf128 $0x1,%%ymm2,0x20(%2) \n" + "vextractf128 $0x1,%%ymm0,0x30(%2) \n" + "add $0x40,%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : "r"(scale) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); + // clang-format on +} +#endif // HAS_MERGEUVROW_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 128 = 9 bits +// 64 = 10 bits +// 16 = 12 bits +// 1 = 16 bits +#ifdef HAS_MULTIPLYROW_16_AVX2 +void MultiplyRow_16_AVX2(const uint16_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off asm volatile ( - "test $0xf,%0 \n" - "jne 2f \n" - "test $0xf,%1 \n" - "jne 2f \n" + "vmovd %3,%%xmm3 \n" + "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + "sub %0,%1 \n" + + // 16 pixels per loop. LABELALIGN - "1: \n" - "movdqa " MEMACCESS(0) ",%%xmm0 \n" - "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0," MEMACCESS(1) " \n" - "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpmullw %%ymm3,%%ymm0,%%ymm0 \n" + "vpmullw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%0,%1) \n" + "vmovdqu %%ymm1,0x20(%0,%1) \n" + "add $0x40,%0 \n" "sub $0x20,%2 \n" "jg 1b \n" - "jmp 9f \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm3"); + // clang-format on +} +#endif // HAS_MULTIPLYROW_16_AVX2 + +// Use scale to convert lsb formats to msb, depending how many bits there are: +// 32768 = 9 bits +// 16384 = 10 bits +// 4096 = 12 bits +// 256 = 16 bits +void Convert16To8Row_SSSE3(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. 
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "add $0x20,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "add $0x10,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} + +#ifdef HAS_CONVERT16TO8ROW_AVX2 +void Convert16To8Row_AVX2(const uint16_t* src_y, + uint8_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. LABELALIGN - "2: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "add $0x40,%0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" // mutates + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "add $0x20,%1 \n" "sub $0x20,%2 \n" - "jg 2b \n" - "9: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT16TO8ROW_AVX2 + +// Use scale to convert to lsb formats depending how many bits there are: +// 512 = 9 bits +// 1024 = 10 bits +// 4096 = 12 bits +// TODO(fbarchard): reduce to SSE2 +void Convert8To16Row_SSE2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off + asm volatile ( + "movd %3,%%xmm2 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + + // 32 pixels per loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "add $0x10,%0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "add $0x20,%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on } -#endif // HAS_COPYROW_SSE2 -#ifdef HAS_COPYROW_AVX -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { +#ifdef HAS_CONVERT8TO16ROW_AVX2 +void Convert8To16Row_AVX2(const uint8_t* src_y, + uint16_t* dst_y, + int scale, + int width) { + // clang-format off asm volatile ( + "vmovd %3,%%xmm2 \n" + "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n" + "vbroadcastss %%xmm2,%%ymm2 \n" + + // 32 pixels per loop. 
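The scale tables above are one trick applied in different directions. MultiplyRow_16 multiplies by 1 << (16 - bits), turning an lsb-justified 9/10/12-bit sample into an msb-justified 16-bit one; Convert16To8 relies on pmulhuw/vpmulhuw computing (v * scale) >> 16, so scale = 256 << (16 - bits) lands the result in 8 bits; Convert8To16 first widens each byte to v * 257 via punpcklbw before the same high multiply. A scalar sketch of the two shift-style conversions (hypothetical helper names, assuming in-range inputs):

static inline uint16_t ScaleToMsb16(uint16_t v, int bits) {
  return (uint16_t)(v << (16 - bits));            /* 10 bits: v * 64     */
}
static inline uint8_t Scale16To8(uint16_t v, int bits) {
  uint32_t scale = 256u << (16 - bits);           /* 10 bits: 16384      */
  return (uint8_t)(((uint32_t)v * scale) >> 16);  /* pmulhuw semantics   */
}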
LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x40,%2 \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); + // clang-format on +} +#endif // HAS_CONVERT8TO16ROW_AVX2 + +#ifdef HAS_SPLITRGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRGBToR0 = {0u, 3u, 6u, 9u, 12u, 15u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR1 = {128u, 128u, 128u, 128u, 128u, 128u, + 2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToR2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u}; + +static const uvec8 kShuffleMaskRGBToG0 = {1u, 4u, 7u, 10u, 13u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG1 = {128u, 128u, 128u, 128u, 128u, 0u, + 3u, 6u, 9u, 12u, 15u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToG2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 2u, + 5u, 8u, 11u, 14u}; + +static const uvec8 kShuffleMaskRGBToB0 = {2u, 5u, 8u, 11u, 14u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB1 = {128u, 128u, 128u, 128u, 128u, 1u, + 4u, 7u, 10u, 13u, 128u, 128u, + 128u, 128u, 128u, 128u}; +static const uvec8 kShuffleMaskRGBToB2 = {128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 0u, 3u, + 6u, 9u, 12u, 15u}; + +void SplitRGBRow_SSSE3(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + "lea 0x10(%3),%3 \n" + "lea 0x30(%0),%0 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRGBToR0), // %5 + "m"(kShuffleMaskRGBToR1), // %6 + "m"(kShuffleMaskRGBToR2), // %7 + "m"(kShuffleMaskRGBToG0), // %8 + "m"(kShuffleMaskRGBToG1), // %9 + 
"m"(kShuffleMaskRGBToG2), // %10 + "m"(kShuffleMaskRGBToB0), // %11 + "m"(kShuffleMaskRGBToB1), // %12 + "m"(kShuffleMaskRGBToB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_SPLITRGBROW_SSSE3 + +#ifdef HAS_MERGERGBROW_SSSE3 + +// Shuffle table for converting RGB to Planar. +static const uvec8 kShuffleMaskRToRGB0 = {0u, 128u, 128u, 1u, 128u, 128u, + 2u, 128u, 128u, 3u, 128u, 128u, + 4u, 128u, 128u, 5u}; +static const uvec8 kShuffleMaskGToRGB0 = {128u, 0u, 128u, 128u, 1u, 128u, + 128u, 2u, 128u, 128u, 3u, 128u, + 128u, 4u, 128u, 128u}; +static const uvec8 kShuffleMaskBToRGB0 = {128u, 128u, 0u, 128u, 128u, 1u, + 128u, 128u, 2u, 128u, 128u, 3u, + 128u, 128u, 4u, 128u}; + +static const uvec8 kShuffleMaskGToRGB1 = {5u, 128u, 128u, 6u, 128u, 128u, + 7u, 128u, 128u, 8u, 128u, 128u, + 9u, 128u, 128u, 10u}; +static const uvec8 kShuffleMaskBToRGB1 = {128u, 5u, 128u, 128u, 6u, 128u, + 128u, 7u, 128u, 128u, 8u, 128u, + 128u, 9u, 128u, 128u}; +static const uvec8 kShuffleMaskRToRGB1 = {128u, 128u, 6u, 128u, 128u, 7u, + 128u, 128u, 8u, 128u, 128u, 9u, + 128u, 128u, 10u, 128u}; + +static const uvec8 kShuffleMaskBToRGB2 = {10u, 128u, 128u, 11u, 128u, 128u, + 12u, 128u, 128u, 13u, 128u, 128u, + 14u, 128u, 128u, 15u}; +static const uvec8 kShuffleMaskRToRGB2 = {128u, 11u, 128u, 128u, 12u, 128u, + 128u, 13u, 128u, 128u, 14u, 128u, + 128u, 15u, 128u, 128u}; +static const uvec8 kShuffleMaskGToRGB2 = {128u, 128u, 11u, 128u, 128u, 12u, + 128u, 128u, 13u, 128u, 128u, 14u, + 128u, 128u, 15u, 128u}; + +void MergeRGBRow_SSSE3(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %5, %%xmm0 \n" + "pshufb %6, %%xmm1 \n" + "pshufb %7, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %8, %%xmm0 \n" + "pshufb %9, %%xmm1 \n" + "pshufb %10, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,16(%3) \n" + + "movdqu (%0),%%xmm0 \n" + "movdqu (%1),%%xmm1 \n" + "movdqu (%2),%%xmm2 \n" + "pshufb %11, %%xmm0 \n" + "pshufb %12, %%xmm1 \n" + "pshufb %13, %%xmm2 \n" + "por %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,32(%3) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "lea 0x10(%2),%2 \n" + "lea 0x30(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : "m"(kShuffleMaskRToRGB0), // %5 + "m"(kShuffleMaskGToRGB0), // %6 + "m"(kShuffleMaskBToRGB0), // %7 + "m"(kShuffleMaskRToRGB1), // %8 + "m"(kShuffleMaskGToRGB1), // %9 + "m"(kShuffleMaskBToRGB1), // %10 + "m"(kShuffleMaskRToRGB2), // %11 + "m"(kShuffleMaskGToRGB2), // %12 + "m"(kShuffleMaskBToRGB2) // %13 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MERGERGBROW_SSSE3 + +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "test $0xf,%0 \n" + "jne 2f \n" + "test $0xf,%1 \n" + "jne 2f \n" + + LABELALIGN + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,(%1) \n" + "movdqa %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 9f \n" + + LABELALIGN + "2: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqu %%xmm0,(%1) 
\n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 2b \n" + + LABELALIGN "9: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_AVX +void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_COPYROW_AVX #ifdef HAS_COPYROW_ERMS // Multiple of 1. -void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { +void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep movsb " MEMMOVESTRING(0,1) " \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc" - ); + asm volatile( + + "rep movsb \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc"); } #endif // HAS_COPYROW_ERMS #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBCopyAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYALPHAROW_SSE2 #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : 
"+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBCopyAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "vmovdqu 0x20(%0),%%ymm2 \n" + "lea 0x40(%0),%0 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYALPHAROW_AVX2 #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ", %%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n" - "lea " MEMLEA(0x20, 0) ", %0 \n" - "psrld $0x18, %%xmm0 \n" - "psrld $0x18, %%xmm1 \n" - "packssdw %%xmm1, %%xmm0 \n" - "packuswb %%xmm0, %%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8, 1) ", %1 \n" - "sub $0x8, %2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+rm"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0), %%xmm0 \n" + "movdqu 0x10(%0), %%xmm1 \n" + "lea 0x20(%0), %0 \n" + "psrld $0x18, %%xmm0 \n" + "psrld $0x18, %%xmm1 \n" + "packssdw %%xmm1, %%xmm0 \n" + "packuswb %%xmm0, %%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1), %1 \n" + "sub $0x8, %2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +static const uvec8 kShuffleAlphaShort_AVX2 = { + 3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u, + 11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u}; + +void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "vmovdqa %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0), %%ymm0 \n" + "vmovdqu 0x20(%0), %%ymm1 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu 0x40(%0), %%ymm2 \n" + "vmovdqu 0x60(%0), %%ymm3 \n" + "lea 0x80(%0), %0 \n" + "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate. 
+ "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20, %2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+rm"(width) // %2 + : "m"(kPermdARGBToY_AVX), // %3 + "m"(kShuffleAlphaShort_AVX2) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm0,%%xmm0 \n" - "pslld $0x18,%%xmm0 \n" - "pcmpeqb %%xmm1,%%xmm1 \n" - "psrld $0x8,%%xmm1 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpckhwd %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm2,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm4 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n" - "pand %%xmm0,%%xmm2 \n" - "pand %%xmm0,%%xmm3 \n" - "pand %%xmm1,%%xmm4 \n" - "pand %%xmm1,%%xmm5 \n" - "por %%xmm4,%%xmm2 \n" - "por %%xmm5,%%xmm3 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "pcmpeqb %%xmm0,%%xmm0 \n" + "pslld $0x18,%%xmm0 \n" + "pcmpeqb %%xmm1,%%xmm1 \n" + "psrld $0x8,%%xmm1 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm2 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpckhwd %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm2,%%xmm2 \n" + "movdqu (%1),%%xmm4 \n" + "movdqu 0x10(%1),%%xmm5 \n" + "pand %%xmm0,%%xmm2 \n" + "pand %%xmm0,%%xmm3 \n" + "pand %%xmm1,%%xmm4 \n" + "pand %%xmm1,%%xmm5 \n" + "por %%xmm4,%%xmm2 \n" + "por %%xmm5,%%xmm3 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrld $0x8,%%ymm0,%%ymm0 \n" - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" - "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "vpslld $0x18,%%ymm1,%%ymm1 \n" - "vpslld $0x18,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" - "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm1," MEMACCESS(1) " \n" - "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrld $0x8,%%ymm0,%%ymm0 \n" + + LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm1 \n" + "vpmovzxbd 0x8(%0),%%ymm2 \n" + "lea 0x10(%0),%0 \n" + "vpslld $0x18,%%ymm1,%%ymm1 \n" + "vpslld $0x18,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,(%1),%%ymm1,%%ymm1 \n" + "vpblendvb %%ymm0,0x20(%1),%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm1,(%1) \n" + "vmovdqu %%ymm2,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + 
"vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -void SetRow_X86(uint8* dst, uint8 v8, int width) { +void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); - const uint32 v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. + asm volatile( + + "rep stosl \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } -void SetRow_ERMS(uint8* dst, uint8 v8, int width) { +void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosb " MEMSTORESTRING(al,0) " \n" - : "+D"(dst), // %0 - "+c"(width_tmp) // %1 - : "a"(v8) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosb \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); } -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { +void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(dst_argb), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); + asm volatile( + + "rep stosl \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 -void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - 
MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void UYVYToYRow_SSE2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + 
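// UYVY is the byte-swapped layout (U Y0 V Y1): luma sits in the odd bytes,
// which is why this kernel shifts (psrlw $8) where the YUY2 one masks (pand).
// Scalar sketch (illustrative):
static void UYVYToYRow_C_sketch(const uint8_t* src_uyvy, uint8_t* dst_y,
                                int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[x] = src_uyvy[x * 2 + 1];
  }
}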
"movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); } -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlw $0x8,%%xmm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand 
%%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x00(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_YUY2TOYROW_AVX2 -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_yuy2)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq 
$0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_yuy2)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void UYVYToYRow_AVX2(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 
0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 - VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((intptr_t)(stride_uyvy)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); +void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(stride_uyvy)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrlw $0x8,%%ymm5,%%ymm5 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm0,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm1,%%ymm1 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n" - VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); 
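// Scalar sketch of UYVYToUV422Row (single-row chroma, no vertical average;
// names illustrative): take U and V straight from each 4-byte pixel pair.
static void UYVYToUV422Row_C_sketch(const uint8_t* src_uyvy, uint8_t* dst_u,
                                    uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    *dst_u++ = src_uyvy[0];
    *dst_v++ = src_uyvy[2];
    src_uyvy += 4;
  }
}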
+void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrlw $0x8,%%ymm5,%%ymm5 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm0,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm1,%%ymm1 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm1,(%1) \n" + "vextractf128 $0x0,%%ymm0,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $0xf,%%xmm7 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x8,%%xmm6 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "pslld $0x18,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - - // 4 pixel loop. - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 99f \n" - - // 1 pixel loop. 
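// What the blend loops compute per channel, as scalar C (sketch; src_argb0 is
// treated as a premultiplied foreground): dst = f + ((b * (256 - f_alpha)) >> 8),
// saturated, with the destination alpha byte forced to 255 (the por with xmm4,
// whose bytes are 0xff in the alpha lane).
static inline uint8_t BlendChannel_sketch(uint8_t f, uint8_t b, uint8_t a) {
  int v = f + ((b * (256 - a)) >> 8);
  return (uint8_t)(v > 255 ? 255 : v);  // paddusb saturates the same way
}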
- "91: \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 91b \n" - "99: \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : "m"(kShuffleAlpha) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $0xf,%%xmm7 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x8,%%xmm6 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "pslld $0x18,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movdqu (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 99f \n" + + // 1 pixel loop. + "91: \n" + "movd (%0),%%xmm3 \n" + "lea 0x4(%0),%0 \n" + "movdqa %%xmm3,%%xmm0 \n" + "pxor %%xmm4,%%xmm3 \n" + "movd (%1),%%xmm2 \n" + "pshufb %4,%%xmm3 \n" + "pand %%xmm6,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "pmullw %%xmm3,%%xmm2 \n" + "movd (%1),%%xmm1 \n" + "lea 0x4(%1),%1 \n" + "psrlw $0x8,%%xmm1 \n" + "por %%xmm4,%%xmm0 \n" + "pmullw %%xmm3,%%xmm1 \n" + "psrlw $0x8,%%xmm2 \n" + "paddusb %%xmm2,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 91b \n" + "99: \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "m"(kShuffleAlpha) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBBLENDROW_SSSE3 @@ -3559,46 +4580,49 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psllw $0x8,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm6 \n" - "pshufd $0x0,%%xmm6,%%xmm6 \n" - "mov $0x807f807f,%%eax \n" - "movd %%eax,%%xmm7 \n" - "pshufd $0x0,%%xmm7,%%xmm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movq (%2),%%xmm0 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "pxor %%xmm5,%%xmm0 \n" - "movq (%0,%2,1),%%xmm1 \n" - "movq (%1,%2,1),%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm1 \n" - "psubb %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "paddw %%xmm7,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%3,%2,1) \n" - "lea 0x8(%2),%2 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" - ); +void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psllw $0x8,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm6 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "mov $0x807f807f,%%eax \n" + "movd %%eax,%%xmm7 \n" + "pshufd $0x0,%%xmm7,%%xmm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%2),%%xmm0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "pxor %%xmm5,%%xmm0 \n" + "movq (%0,%2,1),%%xmm1 \n" + "movq (%1,%2,1),%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm1 \n" + "psubb %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "paddw %%xmm7,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%3,%2,1) \n" + "lea 0x8(%2),%2 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); } #endif // HAS_BLENDPLANEROW_SSSE3 @@ -3608,312 +4632,308 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsllw $0x8,%%ymm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm6 \n" - "vbroadcastss %%xmm6,%%ymm6 \n" - "mov $0x807f807f,%%eax \n" - "vmovd %%eax,%%xmm7 \n" - "vbroadcastss %%xmm7,%%ymm7 \n" - "sub %2,%0 \n" - "sub %2,%1 \n" - "sub %2,%3 \n" - - // 32 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu (%2),%%ymm0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpxor %%ymm5,%%ymm3,%%ymm3 \n" - "vpxor %%ymm5,%%ymm0,%%ymm0 \n" - "vmovdqu (%0,%2,1),%%ymm1 \n" - "vmovdqu (%1,%2,1),%%ymm2 \n" - "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" - "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" - "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%3,%2,1) \n" - "lea 0x20(%2),%2 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(alpha), // %2 - "+r"(dst), // %3 - "+rm"(width) // %4 - :: "memory", "cc", "eax", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsllw $0x8,%%ymm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm6 \n" + "vbroadcastss %%xmm6,%%ymm6 \n" + "mov $0x807f807f,%%eax \n" + "vmovd %%eax,%%xmm7 \n" + "vbroadcastss %%xmm7,%%ymm7 \n" + "sub %2,%0 \n" + "sub %2,%1 \n" + "sub %2,%3 \n" + + // 32 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%2),%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpxor %%ymm5,%%ymm3,%%ymm3 \n" + "vpxor %%ymm5,%%ymm0,%%ymm0 \n" + "vmovdqu (%0,%2,1),%%ymm1 \n" + "vmovdqu (%1,%2,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" + "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" + "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%3,%2,1) \n" + "lea 0x20(%2),%2 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(alpha), // %2 + "+r"(dst), // %3 + "+rm"(width) // %4 + ::"memory", + "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha -static uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u -}; -static uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - // 4 pixel loop. 
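// Scalar sketch of what ARGBAttenuateRow computes (multiply the color channels
// by alpha; the exact rounding below is an assumption, not taken from the patch):
static void ARGBAttenuateRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                      int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t a = src[x * 4 + 3];
    dst[x * 4 + 0] = (uint8_t)((src[x * 4 + 0] * a + 255) >> 8);
    dst[x * 4 + 1] = (uint8_t)((src[x * 4 + 1] * a + 255) >> 8);
    dst[x * 4 + 2] = (uint8_t)((src[x * 4 + 2] * a + 255) >> 8);
    dst[x * 4 + 3] = (uint8_t)a;  // alpha passes through (pand xmm3 / por)
  }
}
// The SIMD form avoids a shift chain: channel and alpha are each duplicated
// into both bytes of a word (c*257 and a*257) by punpcklbw/pshufb, and pmulhuw
// keeps the high 16 bits, i.e. (c*257 * a*257) >> 16, roughly (c * a) / 255.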
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" - "sub %0,%1 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 8 pixel loop. 
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBATTENUATEROW_AVX2 #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "movlhps %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movzb 0x03(%0),%3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x07(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "movd 0x00(%4,%3,4),%%xmm3 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "movlhps %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBUNATTENUATEROW_SSE2 #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. 
static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // Unattenuate 8 pixels at a time. -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, +void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, int width) { uintptr_t alpha; - asm volatile ( - "sub %0,%1 \n" - "vbroadcastf128 %5,%%ymm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - // replace VPGATHER - "movzb " MEMACCESS2(0x03,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x07,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x0b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x0f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "movzb " MEMACCESS2(0x13,0) ",%3 \n" - "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm0) // vmovd 0x0(%4,%3,4),%%xmm0 - "movzb " MEMACCESS2(0x17,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm1) // vmovd 0x0(%4,%3,4),%%xmm1 - "movzb " MEMACCESS2(0x1b,0) ",%3 \n" - "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm2) // vmovd 0x0(%4,%3,4),%%xmm2 - "movzb " MEMACCESS2(0x1f,0) ",%3 \n" - MEMOPREG(vmovd,0x00,4,3,4,xmm3) // vmovd 0x0(%4,%3,4),%%xmm3 - "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" - "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" - "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" - "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" - // end of VPGATHER - - "vmovdqu " MEMACCESS(0) ",%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" - "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" - "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" - "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1) - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width), // %2 - "=&r"(alpha) // %3 - : "r"(fixed_invtbl8), // %4 - "m"(kUnattenShuffleAlpha_AVX2) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "sub %0,%1 \n" + "vbroadcastf128 %5,%%ymm5 \n" + + // 8 pixel loop. 
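// Scalar sketch of the inverse of attenuate (illustrative; the SIMD kernels
// get the division from fixed_invtbl8, a fixed-point reciprocal-of-alpha table
// defined elsewhere in libyuv, hence one movzb/movd lookup per pixel -- the
// "replace VPGATHER" comment marks where a vpgatherdd would otherwise sit):
static void ARGBUnattenuateRow_C_sketch(const uint8_t* src, uint8_t* dst,
                                        int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t a = src[x * 4 + 3];
    for (int c = 0; c < 3; ++c) {
      uint32_t v = a ? src[x * 4 + c] * 255u / a : src[x * 4 + c];
      dst[x * 4 + c] = (uint8_t)(v > 255 ? 255 : v);  // packuswb saturates
    }
    dst[x * 4 + 3] = (uint8_t)a;
  }
}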
+ LABELALIGN + "1: \n" + // replace VPGATHER + "movzb 0x03(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x07(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x0b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm6 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x0f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "movzb 0x13(%0),%3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm7 \n" + "vmovd 0x00(%4,%3,4),%%xmm0 \n" + "movzb 0x17(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm1 \n" + "movzb 0x1b(%0),%3 \n" + "vpunpckldq %%xmm1,%%xmm0,%%xmm0 \n" + "vmovd 0x00(%4,%3,4),%%xmm2 \n" + "movzb 0x1f(%0),%3 \n" + "vmovd 0x00(%4,%3,4),%%xmm3 \n" + "vpunpckldq %%xmm3,%%xmm2,%%xmm2 \n" + "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3 \n" + "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0 \n" + "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3 \n" + // end of VPGATHER + + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpunpcklwd %%ymm3,%%ymm3,%%ymm2 \n" + "vpunpckhwd %%ymm3,%%ymm3,%%ymm3 \n" + "vpshufb %%ymm5,%%ymm2,%%ymm2 \n" + "vpshufb %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width), // %2 + "=&r"(alpha) // %3 + : "r"(fixed_invtbl8), // %4 + "m"(kUnattenShuffleAlpha_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBUNATTENUATEROW_AVX2 #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrld $0x18,%%xmm2 \n" - "psrld $0x18,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpcklbw %%xmm2,%%xmm3 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kARGBToYJ), // %3 - "m"(kAddYJ64) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ARGBGrayRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + // 8 pixel loop. 
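// ARGBGrayRow maps each pixel to a BT.601-style luma (the kARGBToYJ weights
// sum to 128, hence the +64 round from kAddYJ64 and the >> 7) and writes it to
// B, G and R while keeping alpha. Scalar sketch with symbolic weights YB/YG/YR
// standing in for the kARGBToYJ entries (illustrative):
static void ARGBGrayRow_C_sketch(const uint8_t* src, uint8_t* dst, int width,
                                 int YB, int YG, int YR) {
  for (int x = 0; x < width; ++x) {
    uint8_t y = (uint8_t)((src[x * 4 + 0] * YB + src[x * 4 + 1] * YG +
                           src[x * 4 + 2] * YR + 64) >> 7);
    dst[x * 4 + 0] = dst[x * 4 + 1] = dst[x * 4 + 2] = y;
    dst[x * 4 + 3] = src[x * 4 + 3];
  }
}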
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu 0x10(%0),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "psrld $0x18,%%xmm2 \n" + "psrld $0x18,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpcklbw %%xmm2,%%xmm3 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBGRAYROW_SSSE3 @@ -3922,412 +4942,415 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone -static vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { - asm volatile ( - "movdqa %2,%%xmm2 \n" - "movdqa %3,%%xmm3 \n" - "movdqa %4,%%xmm4 \n" - - // 8 pixel loop. 
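// The sepia transform spelled out by the kARGBToSepia constants above, as
// scalar C (sketch; in the SIMD version packuswb provides the clamp):
static void ARGBSepiaRow_C_sketch(uint8_t* dst_argb, int width) {
  for (int x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    dst_argb[x * 4 + 0] = (uint8_t)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8_t)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8_t)(sr > 255 ? 255 : sr);
    // alpha (byte 3) is carried through unchanged
  }
}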
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm6 \n" - "phaddw %%xmm6,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm5 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm5 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm5 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "phaddw %%xmm1,%%xmm5 \n" - "psrlw $0x7,%%xmm5 \n" - "packuswb %%xmm5,%%xmm5 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "psrld $0x18,%%xmm6 \n" - "psrld $0x18,%%xmm1 \n" - "packuswb %%xmm1,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm5 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "punpckhwd %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "m"(kARGBToSepiaB), // %2 - "m"(kARGBToSepiaG), // %3 - "m"(kARGBToSepiaR) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); +void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { + asm volatile( + "movdqa %2,%%xmm2 \n" + "movdqa %3,%%xmm3 \n" + "movdqa %4,%%xmm4 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm6 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm6 \n" + "phaddw %%xmm6,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm5 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqu (%0),%%xmm5 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm5 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "phaddw %%xmm1,%%xmm5 \n" + "psrlw $0x7,%%xmm5 \n" + "packuswb %%xmm5,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrld $0x18,%%xmm6 \n" + "psrld $0x18,%%xmm1 \n" + "packuswb %%xmm1,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm5 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "punpckhwd %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "m"(kARGBToSepiaB), // %2 + "m"(kARGBToSepiaG), // %3 + "m"(kARGBToSepiaR) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBSEPIAROW_SSSE3 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 // Tranform 8 ARGB pixels (32 bytes) with color matrix. // Same as Sepia except matrix is provided. -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - "pshufd $0x00,%%xmm5,%%xmm2 \n" - "pshufd $0x55,%%xmm5,%%xmm3 \n" - "pshufd $0xaa,%%xmm5,%%xmm4 \n" - "pshufd $0xff,%%xmm5,%%xmm5 \n" - - // 8 pixel loop. 
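
// ----------------------------------------------------------------------
// [Editor's note] Per-pixel sketch of the color-matrix transform -- not
// part of the patch. Each pshufd above broadcasts one 4-byte row of
// matrix_argb; pmaddubsw/phaddsw take the signed dot product with the
// BGRA bytes, psraw $0x6 rescales and packuswb saturates to 0..255.
#include <stdint.h>
static void ARGBColorMatrixPixel_Sketch(const uint8_t bgra[4],
                                        const int8_t* matrix_argb,
                                        uint8_t out[4]) {
  for (int ch = 0; ch < 4; ++ch) {
    const int8_t* m = matrix_argb + ch * 4;  /* one row per out channel */
    int v = (bgra[0] * m[0] + bgra[1] * m[1] + bgra[2] * m[2] +
             bgra[3] * m[3]) >> 6;
    out[ch] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
// ----------------------------------------------------------------------
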
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "pmaddubsw %%xmm2,%%xmm7 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "pmaddubsw %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm3,%%xmm1 \n" - "phaddsw %%xmm7,%%xmm0 \n" - "phaddsw %%xmm1,%%xmm6 \n" - "psraw $0x6,%%xmm0 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm1 \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm7 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm7 \n" - "phaddsw %%xmm7,%%xmm6 \n" - "psraw $0x6,%%xmm1 \n" - "psraw $0x6,%%xmm6 \n" - "packuswb %%xmm1,%%xmm1 \n" - "packuswb %%xmm6,%%xmm6 \n" - "punpcklbw %%xmm6,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "punpcklwd %%xmm1,%%xmm0 \n" - "punpckhwd %%xmm1,%%xmm6 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "movdqu (%3),%%xmm5 \n" + "pshufd $0x00,%%xmm5,%%xmm2 \n" + "pshufd $0x55,%%xmm5,%%xmm3 \n" + "pshufd $0xaa,%%xmm5,%%xmm4 \n" + "pshufd $0xff,%%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "pmaddubsw %%xmm2,%%xmm7 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "phaddsw %%xmm7,%%xmm0 \n" + "phaddsw %%xmm1,%%xmm6 \n" + "psraw $0x6,%%xmm0 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm0,%%xmm0 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm1 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x10(%0),%%xmm7 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm7 \n" + "phaddsw %%xmm7,%%xmm6 \n" + "psraw $0x6,%%xmm1 \n" + "psraw $0x6,%%xmm6 \n" + "packuswb %%xmm1,%%xmm1 \n" + "packuswb %%xmm6,%%xmm6 \n" + "punpcklbw %%xmm6,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "punpcklwd %%xmm1,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm6 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm6,0x10(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). 
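
// ----------------------------------------------------------------------
// [Editor's note] Scalar form of the quantize kernel below -- not part of
// the patch. pmulhuw yields (v * scale) >> 16, pmullw/paddw then snap the
// value onto the interval grid, and the original alpha is re-applied
// through the 0xff000000 mask built with pcmpeqb/pslld.
#include <stdint.h>
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 3; ++ch) {  /* B, G, R; alpha is preserved */
      int v = dst_argb[ch];
      dst_argb[ch] =
          (uint8_t)((v * scale >> 16) * interval_size + interval_offset);
    }
    dst_argb += 4;
  }
}
// ----------------------------------------------------------------------
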
-void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "movd %2,%%xmm2 \n" - "movd %3,%%xmm3 \n" - "movd %4,%%xmm4 \n" - "pshuflw $0x40,%%xmm2,%%xmm2 \n" - "pshufd $0x44,%%xmm2,%%xmm2 \n" - "pshuflw $0x40,%%xmm3,%%xmm3 \n" - "pshufd $0x44,%%xmm3,%%xmm3 \n" - "pshuflw $0x40,%%xmm4,%%xmm4 \n" - "pshufd $0x44,%%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "pslld $0x18,%%xmm6 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm1 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "pmullw %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS(0) ",%%xmm7 \n" - "pmullw %%xmm3,%%xmm1 \n" - "pand %%xmm6,%%xmm7 \n" - "paddw %%xmm4,%%xmm0 \n" - "paddw %%xmm4,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm7,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "sub $0x4,%1 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "movd %2,%%xmm2 \n" + "movd %3,%%xmm3 \n" + "movd %4,%%xmm4 \n" + "pshuflw $0x40,%%xmm2,%%xmm2 \n" + "pshufd $0x44,%%xmm2,%%xmm2 \n" + "pshuflw $0x40,%%xmm3,%%xmm3 \n" + "pshufd $0x44,%%xmm3,%%xmm3 \n" + "pshuflw $0x40,%%xmm4,%%xmm4 \n" + "pshufd $0x44,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "pslld $0x18,%%xmm6 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpckhbw %%xmm5,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "pmullw %%xmm3,%%xmm0 \n" + "movdqu (%0),%%xmm7 \n" + "pmullw %%xmm3,%%xmm1 \n" + "pand %%xmm6,%%xmm7 \n" + "paddw %%xmm4,%%xmm0 \n" + "paddw %%xmm4,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm7,%%xmm0 \n" + "movdqu %%xmm0,(%0) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%1 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBQUANTIZEROW_SSE2 #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "movd %3,%%xmm2 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" - - // 4 pixel loop. 
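
// ----------------------------------------------------------------------
// [Editor's note] The shade multiply, one channel at a time -- not part
// of the patch, and derived from the asm: punpcklbw of a register with
// itself turns a byte x into the word x*257 (x<<8|x), and pmulhuw plus
// psrlw $0x8 keep bits 24..31 of the product, so 255 * 255 still maps
// back to 255 exactly. All four channels, alpha included, are scaled by
// the corresponding byte of 'value'.
#include <stdint.h>
static void ARGBShadeRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width, uint32_t value) {
  for (int i = 0; i < width; ++i) {
    for (int ch = 0; ch < 4; ++ch) {
      uint32_t p = src_argb[ch] * 257u;            /* punpcklbw x,x */
      uint32_t v = ((value >> (ch * 8)) & 0xff) * 257u;
      dst_argb[ch] = (uint8_t)((p * v) >> 24);     /* pmulhuw + psrlw 8 */
    }
    src_argb += 4;
    dst_argb += 4;
  }
}
// ----------------------------------------------------------------------
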
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2" - ); +void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "movd %3,%%xmm2 \n" + "punpcklbw %%xmm2,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm2 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm2 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_ARGBMULTIPLYROW_SSE2 #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - // 4 pixel loop. 
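
// ----------------------------------------------------------------------
// [Editor's note] The multiply blend computed by these loops -- not part
// of the patch. One operand is widened to x*257 (punpcklbw with itself),
// the other zero-extended against xmm5/ymm5 = 0, and pmulhuw keeps the
// top 16 bits of the product, giving roughly (s0 * s1) / 255 per channel.
#include <stdint.h>
static void ARGBMultiplyRow_Sketch(const uint8_t* src_argb0,
                                   const uint8_t* src_argb1,
                                   uint8_t* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {  /* 4 bytes per ARGB pixel */
    uint32_t p = src_argb0[i] * 257u;    /* x<<8 | x */
    uint32_t q = src_argb1[i];
    dst_argb[i] = (uint8_t)((p * q) >> 16);
  }
}
// ----------------------------------------------------------------------
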
- LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vmovdqu " MEMACCESS(1) ",%%ymm3 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" +void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc" #if defined(__AVX2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + , + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBADDROW_SSE2 #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. 
- LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpaddusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); +void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpaddusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBADDROW_AVX2 #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels, 4 pixels at a time. -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqu " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "psubusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1" - ); +void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm1 \n" + "lea 0x10(%1),%1 \n" + "psubusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_ARGBSUBTRACTROW_SSE2 #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 4 pixel loop. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "vpsubusb " MEMACCESS(1) ",%%ymm0,%%ymm0 \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "vmovdqu %%ymm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x20,2) ",%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0" - ); +void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 4 pixel loop. 
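
// ----------------------------------------------------------------------
// [Editor's note] paddusb/psubusb in scalar form -- not part of the
// patch. The add and subtract blends in this region saturate per byte
// instead of wrapping, which is the whole kernel once the loads and
// stores are stripped away.
#include <stdint.h>
static uint8_t AddSat_Sketch(uint8_t a, uint8_t b) {  /* paddusb */
  int v = a + b;
  return (uint8_t)(v > 255 ? 255 : v);
}
static uint8_t SubSat_Sketch(uint8_t a, uint8_t b) {  /* psubusb */
  int v = a - b;
  return (uint8_t)(v < 0 ? 0 : v);
}
// ----------------------------------------------------------------------
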
+ LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea 0x20(%0),%0 \n" + "vpsubusb (%1),%%ymm0,%%ymm0 \n" + "lea 0x20(%1),%1 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0"); } #endif // HAS_ARGBSUBTRACTROW_AVX2 @@ -4336,52 +5359,53 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "sub %0,%3 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 - MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%4 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "sub %0,%3 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x2(%0),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "movq 0x02(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x00(%0,%2,1),%%xmm2 \n" + "movq 0x02(%0,%2,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%3,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELXROW_SSE2 @@ -4390,50 +5414,50 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "sub %0,%1 \n" - "sub %0,%2 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 8 pixel loop. 
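
// ----------------------------------------------------------------------
// [Editor's note] Scalar form of the two Sobel passes in this region --
// not part of the patch. Each applies the 3x3 kernel shown in the
// comments, takes the absolute value (the pxor/psubw/pmaxsw trio) and
// saturates to 255 (packuswb).
#include <stdint.h>
#include <stdlib.h>
static void SobelXRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                             const uint8_t* y2, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
            (y2[i] - y2[i + 2]);
    s = abs(s);
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
static void SobelYRow_Sketch(const uint8_t* y0, const uint8_t* y1,
                             uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    int s = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
            (y0[i + 2] - y1[i + 2]);
    s = abs(s);
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
// ----------------------------------------------------------------------
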
- LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "psubw %%xmm1,%%xmm0 \n" - "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" - MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "psubw %%xmm2,%%xmm1 \n" - "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" - MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 - "punpcklbw %%xmm5,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "psubw %%xmm3,%%xmm2 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "paddw %%xmm1,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "psubw %%xmm0,%%xmm1 \n" - "pmaxsw %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "sub %0,%1 \n" + "sub %0,%2 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movq 0x00(%0,%1,1),%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "psubw %%xmm1,%%xmm0 \n" + "movq 0x1(%0),%%xmm1 \n" + "movq 0x01(%0,%1,1),%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "psubw %%xmm2,%%xmm1 \n" + "movq 0x2(%0),%%xmm2 \n" + "movq 0x02(%0,%1,1),%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "psubw %%xmm3,%%xmm2 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "paddw %%xmm1,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "psubw %%xmm0,%%xmm1 \n" + "pmaxsw %%xmm1,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,0x00(%0,%2,1) \n" + "lea 0x8(%0),%0 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELYROW_SSE2 @@ -4443,79 +5467,79 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - // 8 pixel loop. 
- LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm0,%%xmm2 \n" - "punpckhbw %%xmm0,%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "punpcklwd %%xmm2,%%xmm1 \n" - "punpckhwd %%xmm2,%%xmm2 \n" - "por %%xmm5,%%xmm1 \n" - "por %%xmm5,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklwd %%xmm0,%%xmm3 \n" - "punpckhwd %%xmm0,%%xmm0 \n" - "por %%xmm5,%%xmm3 \n" - "por %%xmm5,%%xmm0 \n" - "movdqu %%xmm1," MEMACCESS(2) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm3," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm0," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm0,%%xmm2 \n" + "punpckhbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm1 \n" + "punpckhwd %%xmm2,%%xmm2 \n" + "por %%xmm5,%%xmm1 \n" + "por %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklwd %%xmm0,%%xmm3 \n" + "punpckhwd %%xmm0,%%xmm0 \n" + "por %%xmm5,%%xmm3 \n" + "por %%xmm5,%%xmm0 \n" + "movdqu %%xmm1,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "movdqu %%xmm3,0x20(%2) \n" + "movdqu %%xmm0,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SOBELROW_SSE2 #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - "pslld $0x18,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); +void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + + // 8 pixel loop. 
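
// ----------------------------------------------------------------------
// [Editor's note] Both Sobel merge variants (SobelRow above and the
// SobelToPlaneRow being rewritten here) reduce to a per-byte saturating
// add of the X and Y magnitudes; the ARGB variant additionally replicates
// the result into B, G and R and forces alpha to 0xff via the por with
// the 0xff000000 mask. Sketch of the plane variant -- not in the patch:
#include <stdint.h>
static void SobelToPlaneRow_Sketch(const uint8_t* src_sobelx,
                                   const uint8_t* src_sobely,
                                   uint8_t* dst_y, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];  /* paddusb */
    dst_y[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}
// ----------------------------------------------------------------------
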
+ LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1"); } #endif // HAS_SOBELTOPLANEROW_SSE2 @@ -4525,1004 +5549,1123 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "sub %0,%1 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" - - // 8 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "paddusb %%xmm1,%%xmm2 \n" - "movdqa %%xmm0,%%xmm3 \n" - "punpcklbw %%xmm5,%%xmm3 \n" - "punpckhbw %%xmm5,%%xmm0 \n" - "movdqa %%xmm1,%%xmm4 \n" - "punpcklbw %%xmm2,%%xmm4 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm4,%%xmm6 \n" - "punpcklwd %%xmm3,%%xmm6 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "movdqa %%xmm1,%%xmm7 \n" - "punpcklwd %%xmm0,%%xmm7 \n" - "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm6," MEMACCESS(2) " \n" - "movdqu %%xmm4," MEMACCESS2(0x10,2) " \n" - "movdqu %%xmm7," MEMACCESS2(0x20,2) " \n" - "movdqu %%xmm1," MEMACCESS2(0x30,2) " \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); +void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "sub %0,%1 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + // 8 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "paddusb %%xmm1,%%xmm2 \n" + "movdqa %%xmm0,%%xmm3 \n" + "punpcklbw %%xmm5,%%xmm3 \n" + "punpckhbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm1,%%xmm4 \n" + "punpcklbw %%xmm2,%%xmm4 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm4,%%xmm6 \n" + "punpcklwd %%xmm3,%%xmm6 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "movdqa %%xmm1,%%xmm7 \n" + "punpcklwd %%xmm0,%%xmm7 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm6,(%2) \n" + "movdqu %%xmm4,0x10(%2) \n" + "movdqu %%xmm7,0x20(%2) \n" + "movdqu %%xmm1,0x30(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_SOBELXYROW_SSE2 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. 
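
// ----------------------------------------------------------------------
// [Editor's note] What the function below produces, per 4-byte pixel: a
// running sum along the row plus the matching entry from the row above,
// i.e. one row of a summed-area table, with one 32-bit accumulator per
// channel. Scalar sketch -- not part of the patch:
#include <stdint.h>
static void ComputeCumulativeSumRow_Sketch(const uint8_t* row,
                                           int32_t* cumsum,
                                           const int32_t* previous_cumsum,
                                           int width) {
  int32_t sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {
      sum[ch] += row[x * 4 + ch];
      cumsum[x * 4 + ch] = sum[ch] + previous_cumsum[x * 4 + ch];
    }
  }
}
// ----------------------------------------------------------------------
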
-void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { - asm volatile ( - "pxor %%xmm0,%%xmm0 \n" - "pxor %%xmm1,%%xmm1 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "test $0xf,%1 \n" - "jne 49f \n" - - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm2,%%xmm4 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "punpckhwd %%xmm1,%%xmm3 \n" - "punpckhbw %%xmm1,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "punpcklwd %%xmm1,%%xmm4 \n" - "punpckhwd %%xmm1,%%xmm5 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,2) ",%%xmm3 \n" - "paddd %%xmm0,%%xmm3 \n" - "paddd %%xmm4,%%xmm0 \n" - "movdqu " MEMACCESS2(0x20,2) ",%%xmm4 \n" - "paddd %%xmm0,%%xmm4 \n" - "paddd %%xmm5,%%xmm0 \n" - "movdqu " MEMACCESS2(0x30,2) ",%%xmm5 \n" - "lea " MEMLEA(0x40,2) ",%2 \n" - "paddd %%xmm0,%%xmm5 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "movdqu %%xmm4," MEMACCESS2(0x20,1) " \n" - "movdqu %%xmm5," MEMACCESS2(0x30,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movd " MEMACCESS(0) ",%%xmm2 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "punpcklwd %%xmm1,%%xmm2 \n" - "paddd %%xmm2,%%xmm0 \n" - "movdqu " MEMACCESS(2) ",%%xmm2 \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "paddd %%xmm0,%%xmm2 \n" - "movdqu %%xmm2," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - : "+r"(row), // %0 - "+r"(cumsum), // %1 - "+r"(previous_cumsum), // %2 - "+r"(width) // %3 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { + asm volatile( + "pxor %%xmm0,%%xmm0 \n" + "pxor %%xmm1,%%xmm1 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "test $0xf,%1 \n" + "jne 49f \n" + + // 4 pixel loop. + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm2,%%xmm4 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "punpckhwd %%xmm1,%%xmm3 \n" + "punpckhbw %%xmm1,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "punpcklwd %%xmm1,%%xmm4 \n" + "punpckhwd %%xmm1,%%xmm5 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm0 \n" + "movdqu 0x10(%2),%%xmm3 \n" + "paddd %%xmm0,%%xmm3 \n" + "paddd %%xmm4,%%xmm0 \n" + "movdqu 0x20(%2),%%xmm4 \n" + "paddd %%xmm0,%%xmm4 \n" + "paddd %%xmm5,%%xmm0 \n" + "movdqu 0x30(%2),%%xmm5 \n" + "lea 0x40(%2),%2 \n" + "paddd %%xmm0,%%xmm5 \n" + "movdqu %%xmm2,(%1) \n" + "movdqu %%xmm3,0x10(%1) \n" + "movdqu %%xmm4,0x20(%1) \n" + "movdqu %%xmm5,0x30(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop. 
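
// ----------------------------------------------------------------------
// [Editor's note] The box average that CumulativeSumToAverageRow below
// reads out of the summed-area table: four corner fetches and a multiply
// by the reciprocal of the box area (rcpss %xmm5,%xmm4). Reading the
// addressing modes, the 'width' operand appears to be an offset in int32
// elements (four per pixel) -- that interpretation is an assumption of
// this sketch, which is not part of the patch. cvtps2dq rounds to
// nearest; the +0.5f below approximates that.
#include <stdint.h>
static void CumulativeSumToAverage_Sketch(const int32_t* topleft,
                                          const int32_t* botleft,
                                          int width /* int32 elements */,
                                          int area, uint8_t* dst,
                                          int count) {
  float inv_area = 1.0f / (float)area;
  for (int i = 0; i < count * 4; ++i) {  /* 4 channels per pixel */
    int32_t sum = topleft[i] - topleft[i + width] - botleft[i] +
                  botleft[i + width];
    dst[i] = (uint8_t)((float)sum * inv_area + 0.5f);
  }
}
// ----------------------------------------------------------------------
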
+ LABELALIGN + "10: \n" + "movd (%0),%%xmm2 \n" + "lea 0x4(%0),%0 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklwd %%xmm1,%%xmm2 \n" + "paddd %%xmm2,%%xmm0 \n" + "movdqu (%2),%%xmm2 \n" + "lea 0x10(%2),%2 \n" + "paddd %%xmm0,%%xmm2 \n" + "movdqu %%xmm2,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + + "19: \n" + : "+r"(row), // %0 + "+r"(cumsum), // %1 + "+r"(previous_cumsum), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, int count) { - asm volatile ( - "movd %5,%%xmm5 \n" - "cvtdq2ps %%xmm5,%%xmm5 \n" - "rcpss %%xmm5,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "sub $0x4,%3 \n" - "jl 49f \n" - "cmpl $0x80,%5 \n" - "ja 40f \n" - - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrld $0x10,%%xmm6 \n" - "cvtdq2ps %%xmm6,%%xmm6 \n" - "addps %%xmm6,%%xmm5 \n" - "mulps %%xmm4,%%xmm5 \n" - "cvtps2dq %%xmm5,%%xmm5 \n" - "packssdw %%xmm5,%%xmm5 \n" - - // 4 pixel small loop \n" - LABELALIGN - "4: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "pmulhuw %%xmm5,%%xmm0 \n" - "pmulhuw %%xmm5,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 4b \n" - "jmp 49f \n" - - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" - "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 - MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 - MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" - "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 - MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 - MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 - "lea " MEMLEA(0x40,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm1,%%xmm1 \n" - "mulps %%xmm4,%%xmm0 
\n" - "mulps %%xmm4,%%xmm1 \n" - "cvtdq2ps %%xmm2,%%xmm2 \n" - "cvtdq2ps %%xmm3,%%xmm3 \n" - "mulps %%xmm4,%%xmm2 \n" - "mulps %%xmm4,%%xmm3 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "cvtps2dq %%xmm1,%%xmm1 \n" - "cvtps2dq %%xmm2,%%xmm2 \n" - "cvtps2dq %%xmm3,%%xmm3 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packssdw %%xmm3,%%xmm2 \n" - "packuswb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%3 \n" - "jl 19f \n" - - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 - "lea " MEMLEA(0x10,0) ",%0 \n" - "psubd " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 - "lea " MEMLEA(0x10,1) ",%1 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "mulps %%xmm4,%%xmm0 \n" - "cvtps2dq %%xmm0,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - "19: \n" - : "+r"(topleft), // %0 - "+r"(botleft), // %1 - "+r"(dst), // %2 - "+rm"(count) // %3 - : "r"((intptr_t)(width)), // %4 - "rm"(area) // %5 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + asm volatile( + "movd %5,%%xmm5 \n" + "cvtdq2ps %%xmm5,%%xmm5 \n" + "rcpss %%xmm5,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "sub $0x4,%3 \n" + "jl 49f \n" + "cmpl $0x80,%5 \n" + "ja 40f \n" + + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrld $0x10,%%xmm6 \n" + "cvtdq2ps %%xmm6,%%xmm6 \n" + "addps %%xmm6,%%xmm5 \n" + "mulps %%xmm4,%%xmm5 \n" + "cvtps2dq %%xmm5,%%xmm5 \n" + "packssdw %%xmm5,%%xmm5 \n" + + // 4 pixel small loop. + LABELALIGN + "4: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "packssdw %%xmm1,%%xmm0 \n" + "packssdw %%xmm3,%%xmm2 \n" + "pmulhuw %%xmm5,%%xmm0 \n" + "pmulhuw %%xmm5,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 4b \n" + "jmp 49f \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "psubd 0x10(%0,%4,4),%%xmm1 \n" + "psubd 0x20(%0,%4,4),%%xmm2 \n" + "psubd 0x30(%0,%4,4),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "psubd 0x10(%1),%%xmm1 \n" + "psubd 0x20(%1),%%xmm2 \n" + "psubd 0x30(%1),%%xmm3 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "paddd 0x10(%1,%4,4),%%xmm1 \n" + "paddd 0x20(%1,%4,4),%%xmm2 \n" + "paddd 0x30(%1,%4,4),%%xmm3 \n" + "lea 0x40(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm1,%%xmm1 \n" + "mulps %%xmm4,%%xmm0 \n" + "mulps %%xmm4,%%xmm1 \n" + "cvtdq2ps %%xmm2,%%xmm2 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "cvtps2dq %%xmm1,%%xmm1 \n" + "cvtps2dq %%xmm2,%%xmm2 \n" + "cvtps2dq %%xmm3,%%xmm3 \n" + "packssdw %%xmm1,%%xmm0 \n" 
+ "packssdw %%xmm3,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%3 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "movdqu (%0),%%xmm0 \n" + "psubd 0x00(%0,%4,4),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "psubd (%1),%%xmm0 \n" + "paddd 0x00(%1,%4,4),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "mulps %%xmm4,%%xmm0 \n" + "cvtps2dq %%xmm0,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x4(%2),%2 \n" + "sub $0x1,%3 \n" + "jge 10b \n" + "19: \n" + : "+r"(topleft), // %0 + "+r"(botleft), // %1 + "+r"(dst), // %2 + "+rm"(count) // %3 + : "r"((intptr_t)(width)), // %4 + "rm"(area) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* src_dudv, int width) { +void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* src_dudv, + int width) { intptr_t src_argb_stride_temp = src_argb_stride; intptr_t temp; - asm volatile ( - "movq " MEMACCESS(3) ",%%xmm2 \n" - "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" - "shl $0x10,%1 \n" - "add $0x4,%1 \n" - "movd %1,%%xmm5 \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - "pshufd $0x44,%%xmm7,%%xmm7 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "movdqa %%xmm2,%%xmm0 \n" - "addps %%xmm7,%%xmm0 \n" - "movlhps %%xmm0,%%xmm2 \n" - "movdqa %%xmm7,%%xmm4 \n" - "addps %%xmm4,%%xmm4 \n" - "movdqa %%xmm2,%%xmm3 \n" - "addps %%xmm4,%%xmm3 \n" - "addps %%xmm4,%%xmm4 \n" - - // 4 pixel loop \n" - LABELALIGN - "40: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 - "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 - "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts - "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm1 \n" - "addps %%xmm4,%%xmm2 \n" - "movq %%xmm1," MEMACCESS(2) " \n" - "movd %%xmm0,%k1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - "movd %%xmm0,%k5 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 - "punpckldq %%xmm6,%%xmm0 \n" - "addps %%xmm4,%%xmm3 \n" - "movq %%xmm0," MEMACCESS2(0x08,2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "add $0x3,%4 \n" - "jl 19f \n" - - // 1 pixel loop \n" - LABELALIGN - "10: \n" - "cvttps2dq %%xmm2,%%xmm0 \n" - "packssdw %%xmm0,%%xmm0 \n" - "pmaddwd %%xmm5,%%xmm0 \n" - "addps %%xmm7,%%xmm2 \n" - "movd %%xmm0,%k1 \n" - MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x04,2) ",%2 \n" - "sub $0x1,%4 \n" - "jge 10b \n" - "19: \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_stride_temp), // %1 - "+r"(dst_argb), // %2 - "+r"(src_dudv), // %3 - "+rm"(width), // %4 - "=&r"(temp) // %5 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "movq (%3),%%xmm2 \n" + "movq 0x08(%3),%%xmm7 \n" + "shl $0x10,%1 \n" + "add $0x4,%1 \n" + "movd 
%1,%%xmm5 \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + "pshufd $0x44,%%xmm7,%%xmm7 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "movdqa %%xmm2,%%xmm0 \n" + "addps %%xmm7,%%xmm0 \n" + "movlhps %%xmm0,%%xmm2 \n" + "movdqa %%xmm7,%%xmm4 \n" + "addps %%xmm4,%%xmm4 \n" + "movdqa %%xmm2,%%xmm3 \n" + "addps %%xmm4,%%xmm3 \n" + "addps %%xmm4,%%xmm4 \n" + + // 4 pixel loop + LABELALIGN + "40: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" // x,y float->int first 2 + "cvttps2dq %%xmm3,%%xmm1 \n" // x,y float->int next 2 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts + "pmaddwd %%xmm5,%%xmm0 \n" // off = x*4 + y*stride + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm1 \n" + "addps %%xmm4,%%xmm2 \n" + "movq %%xmm1,(%2) \n" + "movd %%xmm0,%k1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + "movd %%xmm0,%k5 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd 0x00(%0,%5,1),%%xmm6 \n" + "punpckldq %%xmm6,%%xmm0 \n" + "addps %%xmm4,%%xmm3 \n" + "movq %%xmm0,0x08(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "add $0x3,%4 \n" + "jl 19f \n" + + // 1 pixel loop + LABELALIGN + "10: \n" + "cvttps2dq %%xmm2,%%xmm0 \n" + "packssdw %%xmm0,%%xmm0 \n" + "pmaddwd %%xmm5,%%xmm0 \n" + "addps %%xmm7,%%xmm2 \n" + "movd %%xmm0,%k1 \n" + "movd 0x00(%0,%1,1),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "lea 0x04(%2),%2 \n" + "sub $0x1,%4 \n" + "jge 10b \n" + "19: \n" + : "+r"(src_argb), // %0 + "+r"(src_argb_stride_temp), // %1 + "+r"(dst_argb), // %2 + "+r"(src_dudv), // %3 + "+rm"(width), // %4 + "=&r"(temp) // %5 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBAFFINEROW_SSE2 #ifdef HAS_INTERPOLATEROW_SSSE3 // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { - asm volatile ( - "sub %1,%0 \n" - "cmp $0x0,%3 \n" - "je 100f \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "movd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "movd %3,%%xmm5 \n" - "punpcklbw %%xmm0,%%xmm5 \n" - "punpcklwd %%xmm5,%%xmm5 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" - "mov $0x80808080,%%eax \n" - "movd %%eax,%%xmm4 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm2) - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "psubb %%xmm4,%%xmm0 \n" - "psubb %%xmm4,%%xmm1 \n" - "movdqa %%xmm5,%%xmm2 \n" - "movdqa %%xmm5,%%xmm3 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "pmaddubsw %%xmm1,%%xmm3 \n" - "paddw %%xmm4,%%xmm2 \n" - "paddw %%xmm4,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - MEMOPMEM(movdqu,xmm2,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. - LABELALIGN - "50: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,1,4,1,xmm1) - "pavgb %%xmm1,%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
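
// ----------------------------------------------------------------------
// [Editor's note] The bilinear row blend these interpolate kernels
// implement: dst = (src * (256 - f) + src_next * f + 128) >> 8, with the
// f == 0 (plain copy) and f == 128 (pavgb average) paths special-cased.
// The 0x80808080 constant exists only to keep pmaddubsw's signed operand
// in range and cancels out of the arithmetic. Scalar sketch -- not part
// of the patch; 'width' counts bytes, as in the SIMD loops.
#include <stddef.h>
#include <stdint.h>
static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {
  int f1 = source_y_fraction;  /* 0..255 */
  int f0 = 256 - f1;
  for (int i = 0; i < width; ++i) {
    dst[i] = (uint8_t)((src[i] * f0 + src[i + src_stride] * f1 + 128) >> 8);
  }
}
// ----------------------------------------------------------------------
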
- LABELALIGN - "100: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - MEMOPMEM(movdqu,xmm0,0x00,1,0,1) - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "sub %1,%0 \n" + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "movd %3,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" + "punpcklwd %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "mov $0x80808080,%%eax \n" + "movd %%eax,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "psubb %%xmm4,%%xmm0 \n" + "psubb %%xmm4,%%xmm1 \n" + "movdqa %%xmm5,%%xmm2 \n" + "movdqa %%xmm5,%%xmm3 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "pmaddubsw %%xmm1,%%xmm3 \n" + "paddw %%xmm4,%%xmm2 \n" + "paddw %%xmm4,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu 0x00(%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "movdqu (%1),%%xmm0 \n" + "movdqu %%xmm0,0x00(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+rm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_SSSE3 #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, +void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, int source_y_fraction) { - asm volatile ( - "cmp $0x0,%3 \n" - "je 100f \n" - "sub %1,%0 \n" - "cmp $0x80,%3 \n" - "je 50f \n" - - "vmovd %3,%%xmm0 \n" - "neg %3 \n" - "add $0x100,%3 \n" - "vmovd %3,%%xmm5 \n" - "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" - "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" - "vbroadcastss %%xmm5,%%ymm5 \n" - "mov $0x80808080,%%eax \n" - "vmovd %%eax,%%xmm4 \n" - "vbroadcastss %%xmm4,%%ymm4 \n" - - // General purpose row blend. - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - MEMOPREG(vmovdqu,0x00,1,4,1,ymm2) - "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" - "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" - "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" - "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" - "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "jmp 99f \n" - - // Blend 50 / 50. 
- LABELALIGN - "50: \n" - "vmovdqu " MEMACCESS(1) ",%%ymm0 \n" - VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0) // vpavgb (%1,%4,1),%%ymm0,%%ymm0 - MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1) - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 50b \n" - "jmp 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - LABELALIGN - "100: \n" - "rep movsb " MEMMOVESTRING(1,0) " \n" - "jmp 999f \n" - - "99: \n" - "vzeroupper \n" - "999: \n" - : "+D"(dst_ptr), // %0 - "+S"(src_ptr), // %1 - "+cm"(dst_width), // %2 - "+r"(source_y_fraction) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", "eax", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm4", "xmm5" - ); + asm volatile( + "cmp $0x0,%3 \n" + "je 100f \n" + "sub %1,%0 \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n" + "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n" + "vbroadcastss %%xmm5,%%ymm5 \n" + "mov $0x80808080,%%eax \n" + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + + // General purpose row blend. + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu 0x00(%1,%4,1),%%ymm2 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsubb %%ymm4,%%ymm1,%%ymm1 \n" + "vpsubb %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n" + "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n" + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "jmp 99f \n" + + // Blend 50 / 50. + LABELALIGN + "50: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgb 0x00(%1,%4,1),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + LABELALIGN + "100: \n" + "rep movsb \n" + "jmp 999f \n" + + "99: \n" + "vzeroupper \n" + "999: \n" + : "+D"(dst_ptr), // %0 + "+S"(src_ptr), // %1 + "+cm"(dst_width), // %2 + "+r"(source_y_fraction) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"); } #endif // HAS_INTERPOLATEROW_AVX2 #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
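
// ----------------------------------------------------------------------
// [Editor's note] What the shuffle kernels below do per pixel: 'shuffler'
// is a 16-byte pshufb mask whose entries repeat the same 4-byte pattern,
// so the first four entries pick the source byte for each destination
// byte (e.g. {3,2,1,0,...} reverses the channel order). Scalar sketch --
// not part of the patch:
#include <stdint.h>
static void ARGBShuffleRow_Sketch(const uint8_t* src_argb,
                                  uint8_t* dst_argb,
                                  const uint8_t* shuffler, int width) {
  for (int i = 0; i < width; ++i) {
    for (int j = 0; j < 4; ++j) {
      dst_argb[i * 4 + j] = src_argb[i * 4 + (shuffler[j] & 3)];
    }
  }
}
// ----------------------------------------------------------------------
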
-void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - "movdqu " MEMACCESS(3) ",%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "movdqu (%3),%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_SSSE3 #ifdef HAS_ARGBSHUFFLEROW_AVX2 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + + "vbroadcastf128 (%3),%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX2 -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - uintptr_t pixel_temp; - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - "mov " MEMACCESS(4) ",%k2 \n" - "cmp $0x3000102,%k2 \n" - "je 3012f \n" - "cmp $0x10203,%k2 \n" - "je 123f \n" - "cmp $0x30201,%k2 \n" - "je 321f \n" - "cmp $0x2010003,%k2 \n" - "je 2103f \n" - - LABELALIGN - "1: \n" - "movzb " MEMACCESS(4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS(1) " \n" - "movzb " MEMACCESS2(0x1,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x1,1) " \n" - "movzb " MEMACCESS2(0x2,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x2,1) " \n" - "movzb " MEMACCESS2(0x3,4) ",%2 \n" - MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 - "mov %b2," MEMACCESS2(0x3,1) " \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "sub $0x1,%3 \n" - "jg 1b \n" - "jmp 99f \n" - - LABELALIGN - "123: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x1b,%%xmm0,%%xmm0 \n" - "pshuflw $0x1b,%%xmm0,%%xmm0 \n" - "pshufhw $0x1b,%%xmm1,%%xmm1 \n" - "pshuflw $0x1b,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 123b \n" - "jmp 99f \n" - - LABELALIGN - "321: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x39,%%xmm0,%%xmm0 \n" - "pshuflw $0x39,%%xmm0,%%xmm0 \n" - "pshufhw $0x39,%%xmm1,%%xmm1 \n" - "pshuflw $0x39,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 321b \n" - "jmp 99f \n" - - LABELALIGN - "2103: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0x93,%%xmm0,%%xmm0 \n" - "pshuflw $0x93,%%xmm0,%%xmm0 \n" - "pshufhw $0x93,%%xmm1,%%xmm1 \n" - "pshuflw $0x93,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 2103b \n" - "jmp 99f \n" - - LABELALIGN - "3012: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpckhbw %%xmm5,%%xmm1 \n" - "pshufhw $0xc6,%%xmm0,%%xmm0 \n" - "pshuflw $0xc6,%%xmm0,%%xmm0 \n" - "pshufhw $0xc6,%%xmm1,%%xmm1 \n" - "pshuflw $0xc6,%%xmm1,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%3 \n" - "jg 3012b \n" - - "99: \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "=&d"(pixel_temp), // %2 - "+r"(width) // %3 - : "r"(shuffler) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm5" - ); -} -#endif // HAS_ARGBSHUFFLEROW_SSE2 - #ifdef HAS_I422TOYUY2ROW_SSE2 -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) 
",%%xmm0 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(3) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); +void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOYUY2ROW_SSE2 #ifdef HAS_I422TOUYVYROW_SSE2 -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movq " MEMACCESS(1) ",%%xmm2 \n" - MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 - "lea " MEMLEA(0x8,1) ",%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1," MEMACCESS(3) " \n" - "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" - "lea " MEMLEA(0x20,3) ",%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); +void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); } #endif // HAS_I422TOUYVYROW_SSE2 +#ifdef HAS_I422TOYUY2ROW_AVX2 +void I422ToYUY2Row_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), 
// %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOYUY2ROW_AVX2 + +#ifdef HAS_I422TOUYVYROW_AVX2 +void I422ToUYVYRow_AVX2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+rm"(width) // %4 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_I422TOUYVYROW_AVX2 + #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width) { - asm volatile ( - "pxor %%xmm3,%%xmm3 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" - "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" - "addps " MEMACCESS(3) ",%%xmm0 \n" - "addps " MEMACCESS(3) ",%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" - "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" - "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc" - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + asm volatile( + + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. 
+ LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, +void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, int width) { - asm volatile ( - "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n" - "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" - "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" - "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" - - // 2 pixel loop. - LABELALIGN - "1: \n" - "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels - "lea " MEMLEA(0x8,0) ",%0 \n" - "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats - "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X - "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X - "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X - "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X - "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X - "vcvttps2dq %%ymm0,%%ymm0 \n" - "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" - "vmovq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(poly) // %3 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + asm volatile( + "vbroadcastf128 (%3),%%ymm4 \n" + "vbroadcastf128 0x10(%3),%%ymm5 \n" + "vbroadcastf128 0x20(%3),%%ymm6 \n" + "vbroadcastf128 0x30(%3),%%ymm7 \n" + + // 2 pixel loop. 
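Both polynomial kernels (the SSE2 loop above and the AVX2 loop that follows) evaluate the same cubic per channel byte: poly points at 16 floats laid out as C0[4], C1[4], C2[4], C3[4], one coefficient set per channel. A hedged scalar sketch of that evaluation; the SIMD code truncates with cvttps2dq and saturates via packuswb, which the clamp below approximates:

#include <stdint.h>

// result = C0 + C1*X + C2*X^2 + C3*X^3 per channel, clamped to [0, 255].
static void ARGBPolynomialRow_sketch(const uint8_t* src_argb,
                                     uint8_t* dst_argb,
                                     const float* poly,
                                     int width) {
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < 4; ++c) {
      float v = (float)src_argb[x * 4 + c];
      float r = poly[c] + poly[c + 4] * v + poly[c + 8] * v * v +
                poly[c + 12] * v * v * v;
      dst_argb[x * 4 + c] =
          (uint8_t)(r < 0.f ? 0.f : (r > 255.f ? 255.f : r));
    }
  }
}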
+ LABELALIGN + "1: \n" + "vpmovzxbd (%0),%%ymm0 \n" // 2 ARGB pixels + "lea 0x8(%0),%0 \n" + "vcvtdq2ps %%ymm0,%%ymm0 \n" // X 8 floats + "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X + "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X + "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X + "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X + "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * + // X + "vcvttps2dq %%ymm0,%%ymm0 \n" + "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n" + "vmovq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kScaleBias = 1.9259299444e-34f; +void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "movd %3,%%xmm4 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm2 \n" // 8 shorts + "add $0x10,%0 \n" + "movdqa %%xmm2,%%xmm3 \n" + "punpcklwd %%xmm5,%%xmm2 \n" // 8 ints in xmm2/1 + "cvtdq2ps %%xmm2,%%xmm2 \n" // 8 floats + "punpckhwd %%xmm5,%%xmm3 \n" + "cvtdq2ps %%xmm3,%%xmm3 \n" + "mulps %%xmm4,%%xmm2 \n" + "mulps %%xmm4,%%xmm3 \n" + "psrld $0xd,%%xmm2 \n" + "psrld $0xd,%%xmm3 \n" + "packssdw %%xmm3,%%xmm2 \n" + "movdqu %%xmm2,-0x10(%0,%1,1) \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(scale) // %3 + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + scale *= kScaleBias; + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "sub %0,%1 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // 16 shorts + "add $0x20,%0 \n" + "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n" // mutates + "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vpsrld $0xd,%%ymm3,%%ymm3 \n" + "vpsrld $0xd,%%ymm2,%%ymm2 \n" + "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // unmutates + "vmovdqu %%ymm2,-0x20(%0,%1,1) \n" + "sub $0x10,%2 \n" + "jg 1b \n" + + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "vbroadcastss %3, %%ymm4 \n" + "sub %0,%1 \n" + + // 16 pixel loop. 
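A note on the constant behind the two HalfFloatRow kernels above: kScaleBias is exactly 2^-112, the power of two that bridges the float exponent bias (127) and the half-float bias (15). After multiplying by scale * 2^-112, the float's raw bits shifted right by 13 (23 minus 10 mantissa bits) are the IEEE half-float encoding for values in half's normal range, so psrld/vpsrld finishes the conversion. The F16C loop below sidesteps the trick entirely with vcvtps2ph. A scalar sketch of the bit manipulation, not part of the patch:

#include <stdint.h>
#include <string.h>

static uint16_t FloatToHalf_sketch(float value, float scale) {
  float f = value * (scale * 1.9259299444e-34f);  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));  // safe type pun
  return (uint16_t)(bits >> 13);    // drop 13 low mantissa bits
}

// Example: FloatToHalf_sketch(1.0f, 1.0f) == 0x3c00, the half-float 1.0.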
+ LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vmulps %%ymm2,%%ymm4,%%ymm2 \n" + "vmulps %%ymm3,%%ymm4,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 +#if defined(__x86_64__) + : "x"(scale) // %3 +#else + : "m"(scale) // %3 +#endif + : "memory", "cc", "xmm2", "xmm3", "xmm4"); +} +#endif // HAS_HALFFLOATROW_F16C + +#ifdef HAS_HALFFLOATROW_F16C +void HalfFloat1Row_F16C(const uint16_t* src, uint16_t* dst, float, int width) { + asm volatile( + "sub %0,%1 \n" + // 16 pixel loop. + LABELALIGN + "1: \n" + "vpmovzxwd (%0),%%ymm2 \n" // 16 shorts -> 16 ints + "vpmovzxwd 0x10(%0),%%ymm3 \n" + "vcvtdq2ps %%ymm2,%%ymm2 \n" + "vcvtdq2ps %%ymm3,%%ymm3 \n" + "vcvtps2ph $3, %%ymm2, %%xmm2 \n" + "vcvtps2ph $3, %%ymm3, %%xmm3 \n" + "vmovdqu %%xmm2,0x00(%0,%1,1) \n" + "vmovdqu %%xmm3,0x10(%0,%1,1) \n" + "add $0x20,%0 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm2", "xmm3"); +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, +void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. - LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "movzb " MEMACCESS2(-0x1,0) ",%1 \n" - MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x1,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "movzb -0x1(%0),%1 \n" + "movzb 0x03(%3,%1,4),%1 \n" + "mov %b1,-0x1(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_ARGBCOLORTABLEROW_X86 #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { uintptr_t pixel_temp; - asm volatile ( - // 1 pixel loop. 
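Both color-table kernels (the ARGB version rewritten above and the RGB version this hunk rewrites next) perform an independent per-channel lookup: the table is 256 entries with a 4-byte stride, so channel c of value v maps through table_argb[v * 4 + c], and the RGB variant simply leaves alpha alone. A scalar sketch, assuming nothing beyond the signatures in the patch:

#include <stdint.h>

static void ColorTableRow_sketch(uint8_t* dst_argb,
                                 const uint8_t* table_argb,
                                 int width,
                                 int channels) {  // 4 for ARGB, 3 for RGB
  int x, c;
  for (x = 0; x < width; ++x) {
    for (c = 0; c < channels; ++c) {
      dst_argb[c] = table_argb[dst_argb[c] * 4 + c];
    }
    dst_argb += 4;  // pixels stay 4 bytes apart in both variants
  }
}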
- LABELALIGN - "1: \n" - "movzb " MEMACCESS(0) ",%1 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x4,0) " \n" - "movzb " MEMACCESS2(-0x3,0) ",%1 \n" - MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x3,0) " \n" - "movzb " MEMACCESS2(-0x2,0) ",%1 \n" - MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1 - "mov %b1," MEMACCESS2(-0x2,0) " \n" - "dec %2 \n" - "jg 1b \n" - : "+r"(dst_argb), // %0 - "=&d"(pixel_temp), // %1 - "+r"(width) // %2 - : "r"(table_argb) // %3 - : "memory", "cc"); + asm volatile( + // 1 pixel loop. + LABELALIGN + "1: \n" + "movzb (%0),%1 \n" + "lea 0x4(%0),%0 \n" + "movzb 0x00(%3,%1,4),%1 \n" + "mov %b1,-0x4(%0) \n" + "movzb -0x3(%0),%1 \n" + "movzb 0x01(%3,%1,4),%1 \n" + "mov %b1,-0x3(%0) \n" + "movzb -0x2(%0),%1 \n" + "movzb 0x02(%3,%1,4),%1 \n" + "mov %b1,-0x2(%0) \n" + "dec %2 \n" + "jg 1b \n" + : "+r"(dst_argb), // %0 + "=&d"(pixel_temp), // %1 + "+r"(width) // %2 + : "r"(table_argb) // %3 + : "memory", "cc"); } #endif // HAS_RGBCOLORTABLEROW_X86 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, +void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, int width, - const uint8* luma, uint32 lumacoeff) { + const uint8_t* luma, + uint32_t lumacoeff) { uintptr_t pixel_temp; uintptr_t table_temp; - asm volatile ( - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" - "psllw $0x8,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - // 4 pixel loop. - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(2) ",%%xmm0 \n" - "pmaddubsw %%xmm3,%%xmm0 \n" - "phaddw %%xmm0,%%xmm0 \n" - "pand %%xmm4,%%xmm0 \n" - "punpcklwd %%xmm5,%%xmm0 \n" - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb " MEMACCESS(2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS(3) " \n" - "movzb " MEMACCESS2(0x1,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x1,3) " \n" - "movzb " MEMACCESS2(0x2,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x2,3) " \n" - "movzb " MEMACCESS2(0x3,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x3,3) " \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb " MEMACCESS2(0x4,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x4,3) " \n" - "movzb " MEMACCESS2(0x5,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x5,3) " \n" - "movzb " MEMACCESS2(0x6,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x6,3) " \n" - "movzb " MEMACCESS2(0x7,2) ",%0 \n" - "mov %b0," MEMACCESS2(0x7,3) " \n" - - "movd %%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - "pshufd $0x39,%%xmm0,%%xmm0 \n" - - "movzb " MEMACCESS2(0x8,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x8,3) " \n" - "movzb " MEMACCESS2(0x9,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0x9,3) " \n" - "movzb " MEMACCESS2(0xa,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xa,3) " \n" - "movzb " MEMACCESS2(0xb,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xb,3) " \n" - - "movd 
%%xmm0,%k1 \n" // 32 bit offset - "add %5,%1 \n" - - "movzb " MEMACCESS2(0xc,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xc,3) " \n" - "movzb " MEMACCESS2(0xd,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xd,3) " \n" - "movzb " MEMACCESS2(0xe,2) ",%0 \n" - MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0 - "mov %b0," MEMACCESS2(0xe,3) " \n" - "movzb " MEMACCESS2(0xf,2) ",%0 \n" - "mov %b0," MEMACCESS2(0xf,3) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "lea " MEMLEA(0x10,3) ",%3 \n" - "sub $0x4,%4 \n" - "jg 1b \n" - : "=&d"(pixel_temp), // %0 - "=&a"(table_temp), // %1 - "+r"(src_argb), // %2 - "+r"(dst_argb), // %3 - "+rm"(width) // %4 - : "r"(luma), // %5 - "rm"(lumacoeff) // %6 - : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%2),%%xmm0 \n" + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb 0x00(%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "sub $0x4,%4 \n" + "jg 1b \n" + : "=&d"(pixel_temp), // %0 + "=&a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 diff --git a/libs/libvpx/third_party/libyuv/source/row_mips.cc b/libs/libvpx/third_party/libyuv/source/row_mips.cc deleted file mode 100644 index 285f0b5adc2..00000000000 --- a/libs/libvpx/third_party/libyuv/source/row_mips.cc +++ /dev/null @@ -1,782 +0,0 @@ -/* - * Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. 
All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// The following are available on Mips platforms: -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -#ifdef HAS_COPYROW_MIPS -void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { - __asm__ __volatile__ ( - ".set noreorder \n" - ".set noat \n" - "slti $at, %[count], 8 \n" - "bne $at ,$zero, $last8 \n" - "xor $t8, %[src], %[dst] \n" - "andi $t8, $t8, 0x3 \n" - - "bne $t8, $zero, unaligned \n" - "negu $a3, %[dst] \n" - // make dst/src aligned - "andi $a3, $a3, 0x3 \n" - "beq $a3, $zero, $chk16w \n" - // word-aligned now count is the remining bytes count - "subu %[count], %[count], $a3 \n" - - "lwr $t8, 0(%[src]) \n" - "addu %[src], %[src], $a3 \n" - "swr $t8, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - - // Now the dst/src are mutually word-aligned with word-aligned addresses - "$chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? - // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, chk8w \n" - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" - // t0 is the "past the end" address - - // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be past - // the "t0-32" address - // This means: for x=128 the last "safe" a1 address is "t0-160" - // Alternatively, for x=64 the last "safe" a1 address is "t0-96" - // we will use "pref 30,128(a1)", so "t0-160" is the limit - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line of src - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $loop16w \n" - "nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$loop16w: \n" - "pref 0, 96(%[src]) \n" - "lw $t0, 0(%[src]) \n" - "bgtz $v1, $skip_pref30_96 \n" // skip - "lw $t1, 4(%[src]) \n" - "pref 30, 96(%[dst]) \n" // continue - "$skip_pref30_96: \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lw $t0, 32(%[src]) \n" - "bgtz $v1, $skip_pref30_128 \n" // skip pref 30,128(a1) - "lw $t1, 36(%[src]) \n" - "pref 30, 128(%[dst]) \n" // set dest, addr 128 - "$skip_pref30_128: \n" - "lw $t2, 40(%[src]) \n" - "lw $t3, 44(%[src]) \n" - "lw $t4, 48(%[src]) \n" - "lw $t5, 52(%[src]) \n" - "lw $t6, 56(%[src]) \n" - "lw $t7, 60(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst], %[dst], 64 \n" // adding 64 to dest - "sgtu $v1, %[dst], $t9 \n" - "bne %[dst], $a3, $loop16w \n" 
- " addiu %[src], %[src], 64 \n" // adding 64 to src - "move %[count], $t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? - // the t8 is the reminder count past 32-bytes - "beq %[count], $t8, chk1w \n" - // count=t8,no 32-byte chunk - " nop \n" - - "lw $t0, 0(%[src]) \n" - "lw $t1, 4(%[src]) \n" - "lw $t2, 8(%[src]) \n" - "lw $t3, 12(%[src]) \n" - "lw $t4, 16(%[src]) \n" - "lw $t5, 20(%[src]) \n" - "lw $t6, 24(%[src]) \n" - "lw $t7, 28(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, $last8 \n" - " subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - // copying in words (4-byte chunks) - "$wordCopy_loop: \n" - "lw $t3, 0(%[src]) \n" - // the first t3 may be equal t0 ... optimize? - "addiu %[src], %[src],4 \n" - "addiu %[dst], %[dst],4 \n" - "bne %[dst], $a3,$wordCopy_loop \n" - " sw $t3, -4(%[dst]) \n" - - // For the last (<8) bytes - "$last8: \n" - "blez %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 -last dst address - "$last8loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst], $a3, $last8loop \n" - " sb $v1, -1(%[dst]) \n" - - "leave: \n" - " j $ra \n" - " nop \n" - - // - // UNALIGNED case - // - - "unaligned: \n" - // got here with a3="negu a1" - "andi $a3, $a3, 0x3 \n" // a1 is word aligned? - "beqz $a3, $ua_chk16w \n" - " subu %[count], %[count], $a3 \n" - // bytes left after initial a3 bytes - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addu %[src], %[src], $a3 \n" // a3 may be 1, 2 or 3 - "swr $v1, 0(%[dst]) \n" - "addu %[dst], %[dst], $a3 \n" - // below the dst will be word aligned (NOTE1) - "$ua_chk16w: \n" - "andi $t8, %[count], 0x3f \n" // whole 64-B chunks? 
- // t8 is the byte count after 64-byte chunks - "beq %[count], $t8, ua_chk8w \n" - // if a2==t8, no 64-byte chunks - // There will be at most 1 32-byte chunk after it - "subu $a3, %[count], $t8 \n" // the reminder - // Here a3 counts bytes in 16w chunks - "addu $a3, %[dst], $a3 \n" - // Now a3 is the final dst after 64-byte chunks - "addu $t0, %[dst], %[count] \n" // t0 "past the end" - "subu $t9, $t0, 160 \n" - // t9 is the "last safe pref 30,128(a1)" address - "pref 0, 0(%[src]) \n" // first line of src - "pref 0, 32(%[src]) \n" // second line addr 32 - "pref 0, 64(%[src]) \n" - "pref 30, 32(%[dst]) \n" - // safe, as we have at least 64 bytes ahead - // In case the a1 > t9 don't use "pref 30" at all - "sgtu $v1, %[dst], $t9 \n" - "bgtz $v1, $ua_loop16w \n" - // skip "pref 30,64(a1)" for too short arrays - " nop \n" - // otherwise, start with using pref30 - "pref 30, 64(%[dst]) \n" - "$ua_loop16w: \n" - "pref 0, 96(%[src]) \n" - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "bgtz $v1, $ua_skip_pref30_96 \n" - " lwl $t1, 7(%[src]) \n" - "pref 30, 96(%[dst]) \n" - // continue setting up the dest, addr 96 - "$ua_skip_pref30_96: \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "pref 0, 128(%[src]) \n" - // bring the next lines of src, addr 128 - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "lwr $t0, 32(%[src]) \n" - "lwl $t0, 35(%[src]) \n" - "lwr $t1, 36(%[src]) \n" - "bgtz $v1, ua_skip_pref30_128 \n" - " lwl $t1, 39(%[src]) \n" - "pref 30, 128(%[dst]) \n" - // continue setting up the dest, addr 128 - "ua_skip_pref30_128: \n" - - "lwr $t2, 40(%[src]) \n" - "lwl $t2, 43(%[src]) \n" - "lwr $t3, 44(%[src]) \n" - "lwl $t3, 47(%[src]) \n" - "lwr $t4, 48(%[src]) \n" - "lwl $t4, 51(%[src]) \n" - "lwr $t5, 52(%[src]) \n" - "lwl $t5, 55(%[src]) \n" - "lwr $t6, 56(%[src]) \n" - "lwl $t6, 59(%[src]) \n" - "lwr $t7, 60(%[src]) \n" - "lwl $t7, 63(%[src]) \n" - "pref 0, 160(%[src]) \n" - // bring the next lines of src, addr 160 - "sw $t0, 32(%[dst]) \n" - "sw $t1, 36(%[dst]) \n" - "sw $t2, 40(%[dst]) \n" - "sw $t3, 44(%[dst]) \n" - "sw $t4, 48(%[dst]) \n" - "sw $t5, 52(%[dst]) \n" - "sw $t6, 56(%[dst]) \n" - "sw $t7, 60(%[dst]) \n" - - "addiu %[dst],%[dst],64 \n" // adding 64 to dest - "sgtu $v1,%[dst],$t9 \n" - "bne %[dst],$a3,$ua_loop16w \n" - " addiu %[src],%[src],64 \n" // adding 64 to src - "move %[count],$t8 \n" - - // Here we have src and dest word-aligned but less than 64-bytes to go - - "ua_chk8w: \n" - "pref 0, 0x0(%[src]) \n" - "andi $t8, %[count], 0x1f \n" // 32-byte chunk? 
- // the t8 is the reminder count - "beq %[count], $t8, $ua_chk1w \n" - // when count==t8, no 32-byte chunk - - "lwr $t0, 0(%[src]) \n" - "lwl $t0, 3(%[src]) \n" - "lwr $t1, 4(%[src]) \n" - "lwl $t1, 7(%[src]) \n" - "lwr $t2, 8(%[src]) \n" - "lwl $t2, 11(%[src]) \n" - "lwr $t3, 12(%[src]) \n" - "lwl $t3, 15(%[src]) \n" - "lwr $t4, 16(%[src]) \n" - "lwl $t4, 19(%[src]) \n" - "lwr $t5, 20(%[src]) \n" - "lwl $t5, 23(%[src]) \n" - "lwr $t6, 24(%[src]) \n" - "lwl $t6, 27(%[src]) \n" - "lwr $t7, 28(%[src]) \n" - "lwl $t7, 31(%[src]) \n" - "addiu %[src], %[src], 32 \n" - - "sw $t0, 0(%[dst]) \n" - "sw $t1, 4(%[dst]) \n" - "sw $t2, 8(%[dst]) \n" - "sw $t3, 12(%[dst]) \n" - "sw $t4, 16(%[dst]) \n" - "sw $t5, 20(%[dst]) \n" - "sw $t6, 24(%[dst]) \n" - "sw $t7, 28(%[dst]) \n" - "addiu %[dst], %[dst], 32 \n" - - "$ua_chk1w: \n" - "andi %[count], $t8, 0x3 \n" - // now count is the reminder past 1w chunks - "beq %[count], $t8, ua_smallCopy \n" - "subu $a3, $t8, %[count] \n" - // a3 is count of bytes in 1w chunks - "addu $a3, %[dst], $a3 \n" - // now a3 is the dst address past the 1w chunks - - // copying in words (4-byte chunks) - "$ua_wordCopy_loop: \n" - "lwr $v1, 0(%[src]) \n" - "lwl $v1, 3(%[src]) \n" - "addiu %[src], %[src], 4 \n" - "addiu %[dst], %[dst], 4 \n" - // note: dst=a1 is word aligned here, see NOTE1 - "bne %[dst], $a3, $ua_wordCopy_loop \n" - " sw $v1,-4(%[dst]) \n" - - // Now less than 4 bytes (value in count) left to copy - "ua_smallCopy: \n" - "beqz %[count], leave \n" - " addu $a3, %[dst], %[count] \n" // a3 = last dst address - "$ua_smallCopy_loop: \n" - "lb $v1, 0(%[src]) \n" - "addiu %[src], %[src], 1 \n" - "addiu %[dst], %[dst], 1 \n" - "bne %[dst],$a3,$ua_smallCopy_loop \n" - " sb $v1, -1(%[dst]) \n" - - "j $ra \n" - " nop \n" - ".set at \n" - ".set reorder \n" - : [dst] "+r" (dst), [src] "+r" (src) - : [count] "r" (count) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", - "t8", "t9", "a3", "v1", "at" - ); -} -#endif // HAS_COPYROW_MIPS - -// DSPR2 functions -#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \ - (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) && (__mips_isa_rev < 6) - -void SplitUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "srl $t4, %[width], 4 \n" // multiplies of 16 - "blez $t4, 2f \n" - " andi %[width], %[width], 0xf \n" // residual - - "1: \n" - "addiu $t4, $t4, -1 \n" - "lw $t0, 0(%[src_uv]) \n" // V1 | U1 | V0 | U0 - "lw $t1, 4(%[src_uv]) \n" // V3 | U3 | V2 | U2 - "lw $t2, 8(%[src_uv]) \n" // V5 | U5 | V4 | U4 - "lw $t3, 12(%[src_uv]) \n" // V7 | U7 | V6 | U6 - "lw $t5, 16(%[src_uv]) \n" // V9 | U9 | V8 | U8 - "lw $t6, 20(%[src_uv]) \n" // V11 | U11 | V10 | U10 - "lw $t7, 24(%[src_uv]) \n" // V13 | U13 | V12 | U12 - "lw $t8, 28(%[src_uv]) \n" // V15 | U15 | V14 | U14 - "addiu %[src_uv], %[src_uv], 32 \n" - "precrq.qb.ph $t9, $t1, $t0 \n" // V3 | V2 | V1 | V0 - "precr.qb.ph $t0, $t1, $t0 \n" // U3 | U2 | U1 | U0 - "precrq.qb.ph $t1, $t3, $t2 \n" // V7 | V6 | V5 | V4 - "precr.qb.ph $t2, $t3, $t2 \n" // U7 | U6 | U5 | U4 - "precrq.qb.ph $t3, $t6, $t5 \n" // V11 | V10 | V9 | V8 - "precr.qb.ph $t5, $t6, $t5 \n" // U11 | U10 | U9 | U8 - "precrq.qb.ph $t6, $t8, $t7 \n" // V15 | V14 | V13 | V12 - "precr.qb.ph $t7, $t8, $t7 \n" // U15 | U14 | U13 | U12 - "sw $t9, 0(%[dst_v]) \n" - "sw $t0, 0(%[dst_u]) \n" - "sw $t1, 4(%[dst_v]) \n" - "sw $t2, 4(%[dst_u]) \n" - "sw $t3, 8(%[dst_v]) \n" - "sw $t5, 8(%[dst_u]) \n" - "sw $t6, 12(%[dst_v]) \n" - "sw 
$t7, 12(%[dst_u]) \n" - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz $t4, 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - - "beqz %[width], 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, 0(%[src_uv]) \n" - "lbu $t1, 1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], 2 \n" - "addiu %[width], %[width], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[width], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [width] "+r" (width), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6", "t7", "t8", "t9" - ); -} - -void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t4, %[width], 4 \n" // multiplies of 16 - "andi $t5, %[width], 0xf \n" - "blez $t4, 2f \n" - " addu %[src], %[src], %[width] \n" // src += width - - "1: \n" - "lw $t0, -16(%[src]) \n" // |3|2|1|0| - "lw $t1, -12(%[src]) \n" // |7|6|5|4| - "lw $t2, -8(%[src]) \n" // |11|10|9|8| - "lw $t3, -4(%[src]) \n" // |15|14|13|12| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t1, $t1 \n" // |6|7|4|5| - "wsbh $t2, $t2 \n" // |10|11|8|9| - "wsbh $t3, $t3 \n" // |14|15|12|13| - "rotr $t0, $t0, 16 \n" // |0|1|2|3| - "rotr $t1, $t1, 16 \n" // |4|5|6|7| - "rotr $t2, $t2, 16 \n" // |8|9|10|11| - "rotr $t3, $t3, 16 \n" // |12|13|14|15| - "addiu %[src], %[src], -16 \n" - "addiu $t4, $t4, -1 \n" - "sw $t3, 0(%[dst]) \n" // |15|14|13|12| - "sw $t2, 4(%[dst]) \n" // |11|10|9|8| - "sw $t1, 8(%[dst]) \n" // |7|6|5|4| - "sw $t0, 12(%[dst]) \n" // |3|2|1|0| - "bgtz $t4, 1b \n" - " addiu %[dst], %[dst], 16 \n" - "beqz $t5, 3f \n" - " nop \n" - - "2: \n" - "lbu $t0, -1(%[src]) \n" - "addiu $t5, $t5, -1 \n" - "addiu %[src], %[src], -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgez $t5, 2b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src] "+r" (src), [dst] "+r" (dst) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", "t5" - ); -} - -void MirrorUVRow_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - int x; - int y; - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "addu $t4, %[width], %[width] \n" - "srl %[x], %[width], 4 \n" - "andi %[y], %[width], 0xf \n" - "blez %[x], 2f \n" - " addu %[src_uv], %[src_uv], $t4 \n" - - "1: \n" - "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| - "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| - "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| - "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| - "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| - "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| - "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| - "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| - - "rotr $t0, $t0, 16 \n" // |1|0|3|2| - "rotr $t1, $t1, 16 \n" // |5|4|7|6| - "rotr $t2, $t2, 16 \n" // |9|8|11|10| - "rotr $t3, $t3, 16 \n" // |13|12|15|14| - "rotr $t4, $t4, 16 \n" // |17|16|19|18| - "rotr $t6, $t6, 16 \n" // |21|20|23|22| - "rotr $t7, $t7, 16 \n" // |25|24|27|26| - "rotr $t8, $t8, 16 \n" // |29|28|31|30| - "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| - "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| - "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| - "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| - "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| - "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| - "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| - "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| - "addiu %[src_uv], %[src_uv], -32 \n" - "addiu %[x], %[x], -1 \n" - "swr $t4, 0(%[dst_u]) \n" - 
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| - "swr $t6, 0(%[dst_v]) \n" - "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| - "swr $t2, 4(%[dst_u]) \n" - "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| - "swr $t3, 4(%[dst_v]) \n" - "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| - "swr $t0, 8(%[dst_u]) \n" - "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| - "swr $t1, 8(%[dst_v]) \n" - "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| - "swr $t9, 12(%[dst_u]) \n" - "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| - "swr $t5, 12(%[dst_v]) \n" - "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| - "addiu %[dst_v], %[dst_v], 16 \n" - "bgtz %[x], 1b \n" - " addiu %[dst_u], %[dst_u], 16 \n" - "beqz %[y], 3f \n" - " nop \n" - "b 2f \n" - " nop \n" - - "2: \n" - "lbu $t0, -2(%[src_uv]) \n" - "lbu $t1, -1(%[src_uv]) \n" - "addiu %[src_uv], %[src_uv], -2 \n" - "addiu %[y], %[y], -1 \n" - "sb $t0, 0(%[dst_u]) \n" - "sb $t1, 0(%[dst_v]) \n" - "addiu %[dst_u], %[dst_u], 1 \n" - "bgtz %[y], 2b \n" - " addiu %[dst_v], %[dst_v], 1 \n" - - "3: \n" - ".set pop \n" - : [src_uv] "+r" (src_uv), - [dst_u] "+r" (dst_u), - [dst_v] "+r" (dst_v), - [x] "=&r" (x), - [y] "=&r" (y) - : [width] "r" (width) - : "t0", "t1", "t2", "t3", "t4", - "t5", "t7", "t8", "t9" - ); -} - -// Convert (4 Y and 2 VU) I422 and arrange RGB values into -// t5 = | 0 | B0 | 0 | b0 | -// t4 = | 0 | B1 | 0 | b1 | -// t9 = | 0 | G0 | 0 | g0 | -// t8 = | 0 | G1 | 0 | g1 | -// t2 = | 0 | R0 | 0 | r0 | -// t1 = | 0 | R1 | 0 | r1 | -#define YUVTORGB \ - "lw $t0, 0(%[y_buf]) \n" \ - "lhu $t1, 0(%[u_buf]) \n" \ - "lhu $t2, 0(%[v_buf]) \n" \ - "preceu.ph.qbr $t1, $t1 \n" \ - "preceu.ph.qbr $t2, $t2 \n" \ - "preceu.ph.qbra $t3, $t0 \n" \ - "preceu.ph.qbla $t0, $t0 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t3, $t3, $s4 \n" \ - "subu.ph $t0, $t0, $s4 \n" \ - "mul.ph $t3, $t3, $s0 \n" \ - "mul.ph $t0, $t0, $s0 \n" \ - "shll.ph $t4, $t1, 0x7 \n" \ - "subu.ph $t4, $t4, $t1 \n" \ - "mul.ph $t6, $t1, $s1 \n" \ - "mul.ph $t1, $t2, $s2 \n" \ - "addq_s.ph $t5, $t4, $t3 \n" \ - "addq_s.ph $t4, $t4, $t0 \n" \ - "shra.ph $t5, $t5, 6 \n" \ - "shra.ph $t4, $t4, 6 \n" \ - "addiu %[u_buf], 2 \n" \ - "addiu %[v_buf], 2 \n" \ - "addu.ph $t6, $t6, $t1 \n" \ - "mul.ph $t1, $t2, $s3 \n" \ - "addu.ph $t9, $t6, $t3 \n" \ - "addu.ph $t8, $t6, $t0 \n" \ - "shra.ph $t9, $t9, 6 \n" \ - "shra.ph $t8, $t8, 6 \n" \ - "addu.ph $t2, $t1, $t3 \n" \ - "addu.ph $t1, $t1, $t0 \n" \ - "shra.ph $t2, $t2, 6 \n" \ - "shra.ph $t1, $t1, 6 \n" \ - "subu.ph $t5, $t5, $s5 \n" \ - "subu.ph $t4, $t4, $s5 \n" \ - "subu.ph $t9, $t9, $s5 \n" \ - "subu.ph $t8, $t8, $s5 \n" \ - "subu.ph $t2, $t2, $s5 \n" \ - "subu.ph $t1, $t1, $s5 \n" \ - "shll_s.ph $t5, $t5, 8 \n" \ - "shll_s.ph $t4, $t4, 8 \n" \ - "shll_s.ph $t9, $t9, 8 \n" \ - "shll_s.ph $t8, $t8, 8 \n" \ - "shll_s.ph $t2, $t2, 8 \n" \ - "shll_s.ph $t1, $t1, 8 \n" \ - "shra.ph $t5, $t5, 8 \n" \ - "shra.ph $t4, $t4, 8 \n" \ - "shra.ph $t9, $t9, 8 \n" \ - "shra.ph $t8, $t8, 8 \n" \ - "shra.ph $t2, $t2, 8 \n" \ - "shra.ph $t1, $t1, 8 \n" \ - "addu.ph $t5, $t5, $s5 \n" \ - "addu.ph $t4, $t4, $s5 \n" \ - "addu.ph $t9, $t9, $s5 \n" \ - "addu.ph $t8, $t8, $s5 \n" \ - "addu.ph $t2, $t2, $s5 \n" \ - "addu.ph $t1, $t1, $s5 \n" - -// TODO(fbarchard): accept yuv conversion constants. 
-void I422ToARGBRow_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - const struct YuvConstants* yuvconstants, - int width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "beqz %[width], 2f \n" - " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| - "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| - "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| - "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| - "repl.ph $s4, 16 \n" // |0|16|0|16| - "repl.ph $s5, 128 \n" // |128|128| // clipping - "lui $s6, 0xff00 \n" - "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| - - "1: \n" - YUVTORGB -// Arranging into argb format - "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| - "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| - "addiu %[width], -4 \n" - "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| - "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| - "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| - - "addiu %[y_buf], 4 \n" - "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| - "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| - "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| - "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| - "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| - "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| - "sll $t9, $t9, 16 \n" - "sll $t8, $t8, 16 \n" - "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| - "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| -// Store results. - "sw $t2, 0(%[rgb_buf]) \n" - "sw $t0, 4(%[rgb_buf]) \n" - "sw $t1, 8(%[rgb_buf]) \n" - "sw $t3, 12(%[rgb_buf]) \n" - "bnez %[width], 1b \n" - " addiu %[rgb_buf], 16 \n" - "2: \n" - ".set pop \n" - :[y_buf] "+r" (y_buf), - [u_buf] "+r" (u_buf), - [v_buf] "+r" (v_buf), - [width] "+r" (width), - [rgb_buf] "+r" (rgb_buf) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9", - "s0", "s1", "s2", "s3", - "s4", "s5", "s6" - ); -} - -// Bilinear filter 8x2 -> 8x1 -void InterpolateRow_DSPR2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { - int y0_fraction = 256 - source_y_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "replv.ph $t0, %[y0_fraction] \n" - "replv.ph $t1, %[source_y_fraction] \n" - - "1: \n" - "lw $t2, 0(%[src_ptr]) \n" - "lw $t3, 0(%[src_ptr1]) \n" - "lw $t4, 4(%[src_ptr]) \n" - "lw $t5, 4(%[src_ptr1]) \n" - "muleu_s.ph.qbl $t6, $t2, $t0 \n" - "muleu_s.ph.qbr $t7, $t2, $t0 \n" - "muleu_s.ph.qbl $t8, $t3, $t1 \n" - "muleu_s.ph.qbr $t9, $t3, $t1 \n" - "muleu_s.ph.qbl $t2, $t4, $t0 \n" - "muleu_s.ph.qbr $t3, $t4, $t0 \n" - "muleu_s.ph.qbl $t4, $t5, $t1 \n" - "muleu_s.ph.qbr $t5, $t5, $t1 \n" - "addq.ph $t6, $t6, $t8 \n" - "addq.ph $t7, $t7, $t9 \n" - "addq.ph $t2, $t2, $t4 \n" - "addq.ph $t3, $t3, $t5 \n" - "shra.ph $t6, $t6, 8 \n" - "shra.ph $t7, $t7, 8 \n" - "shra.ph $t2, $t2, 8 \n" - "shra.ph $t3, $t3, 8 \n" - "precr.qb.ph $t6, $t6, $t7 \n" - "precr.qb.ph $t2, $t2, $t3 \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[src_ptr1], %[src_ptr1], 8 \n" - "addiu %[dst_width], %[dst_width], -8 \n" - "sw $t6, 0(%[dst_ptr]) \n" - "sw $t2, 4(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[dst_ptr], %[dst_ptr], 8 \n" - - ".set pop \n" - : [dst_ptr] "+r" (dst_ptr), - [src_ptr1] "+r" (src_ptr1), - [src_ptr] "+r" (src_ptr), - [dst_width] "+r" (dst_width) - : [source_y_fraction] "r" (source_y_fraction), - [y0_fraction] "r" (y0_fraction), - [src_stride] "r" (src_stride) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} -#endif // __mips_dsp_rev >= 
2 - -#endif // defined(__mips__) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif diff --git a/libs/libvpx/third_party/libyuv/source/row_msa.cc b/libs/libvpx/third_party/libyuv/source/row_msa.cc new file mode 100644 index 00000000000..4fb2631f0b3 --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/row_msa.cc @@ -0,0 +1,3512 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define ALPHA_VAL (-1) + +// Fill YUV -> RGB conversion constants into vectors +#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, bb, bg, br, yg) \ + { \ + ub = __msa_fill_w(yuvconst->kUVToB[0]); \ + vr = __msa_fill_w(yuvconst->kUVToR[1]); \ + ug = __msa_fill_w(yuvconst->kUVToG[0]); \ + vg = __msa_fill_w(yuvconst->kUVToG[1]); \ + bb = __msa_fill_w(yuvconst->kUVBiasB[0]); \ + bg = __msa_fill_w(yuvconst->kUVBiasG[0]); \ + br = __msa_fill_w(yuvconst->kUVBiasR[0]); \ + yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ + } + +// Load YUV 422 pixel data +#define READYUV422(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m; \ + uint32_t u_m, v_m; \ + v4i32 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LW(psrc_u); \ + v_m = LW(psrc_v); \ + out_y = (v16u8)__msa_insert_d((v2i64)zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)u_m); \ + out_v = (v16u8)__msa_insert_w(zero_m, 0, (int32_t)v_m); \ + } + +// Clip input vector elements between 0 to 255 +#define CLIP_0TO255(in0, in1, in2, in3, in4, in5) \ + { \ + v4i32 max_m = __msa_ldi_w(0xFF); \ + \ + in0 = __msa_maxi_s_w(in0, 0); \ + in1 = __msa_maxi_s_w(in1, 0); \ + in2 = __msa_maxi_s_w(in2, 0); \ + in3 = __msa_maxi_s_w(in3, 0); \ + in4 = __msa_maxi_s_w(in4, 0); \ + in5 = __msa_maxi_s_w(in5, 0); \ + in0 = __msa_min_s_w(max_m, in0); \ + in1 = __msa_min_s_w(max_m, in1); \ + in2 = __msa_min_s_w(max_m, in2); \ + in3 = __msa_min_s_w(max_m, in3); \ + in4 = __msa_min_s_w(max_m, in4); \ + in5 = __msa_min_s_w(max_m, in5); \ + } + +// Convert 8 pixels of YUV 420 to RGB. 
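The YUVTORGB macro that follows mirrors libyuv's scalar YuvPixel: luma is scaled in Q16, the chroma products are subtracted, and the bb/bg/br biases loaded by YUVTORGB_SETUP already fold in the 128 chroma offset and rounding. A hedged scalar sketch of the per-pixel math, with the constants kept abstract exactly as the macro receives them:

#include <stdint.h>

static void YuvPixel_sketch(uint8_t y, uint8_t u, uint8_t v,
                            int ub, int ug, int vg, int vr,
                            int bb, int bg, int br, int yg,
                            uint8_t* b, uint8_t* g, uint8_t* r) {
  // Duplicating the luma byte into both halves of a 16-bit lane is a
  // multiply by 0x0101; scaling by yg and shifting gives Q6 luma.
  int y1 = (int)(((uint32_t)(y * 0x0101) * (uint32_t)yg) >> 16);
  int bi = (y1 - u * ub + bb) >> 6;
  int gi = (y1 - (u * ug + v * vg) + bg) >> 6;
  int ri = (y1 - v * vr + br) >> 6;
  *b = (uint8_t)(bi < 0 ? 0 : (bi > 255 ? 255 : bi));
  *g = (uint8_t)(gi < 0 ? 0 : (gi > 255 ? 255 : gi));
  *r = (uint8_t)(ri < 0 ? 0 : (ri > 255 ? 255 : ri));
}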
+#define YUVTORGB(in_y, in_uv, ubvr, ugvg, bb, bg, br, yg, out_b, out_g, out_r) \ + { \ + v8i16 vec0_m, vec1_m; \ + v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ + v4i32 reg5_m, reg6_m, reg7_m; \ + v16i8 zero_m = {0}; \ + \ + vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ + vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ + reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ + reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ + reg0_m *= yg; \ + reg1_m *= yg; \ + reg2_m *= ubvr; \ + reg3_m *= ubvr; \ + reg0_m = __msa_srai_w(reg0_m, 16); \ + reg1_m = __msa_srai_w(reg1_m, 16); \ + reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ + reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ + reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ + reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ + reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ + reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ + reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ + reg5_m = reg0_m - reg5_m; \ + reg6_m = reg1_m - reg6_m; \ + reg2_m = reg0_m - reg2_m; \ + reg3_m = reg1_m - reg3_m; \ + reg7_m = reg0_m - reg7_m; \ + reg4_m = reg1_m - reg4_m; \ + reg5_m += bb; \ + reg6_m += bb; \ + reg7_m += bg; \ + reg4_m += bg; \ + reg2_m += br; \ + reg3_m += br; \ + reg5_m = __msa_srai_w(reg5_m, 6); \ + reg6_m = __msa_srai_w(reg6_m, 6); \ + reg7_m = __msa_srai_w(reg7_m, 6); \ + reg4_m = __msa_srai_w(reg4_m, 6); \ + reg2_m = __msa_srai_w(reg2_m, 6); \ + reg3_m = __msa_srai_w(reg3_m, 6); \ + CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ + out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ + out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ + out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ + } + +// Pack and Store 8 ARGB values. +#define STOREARGB(in0, in1, in2, in3, pdst_argb) \ + { \ + v8i16 vec0_m, vec1_m; \ + v16u8 dst0_m, dst1_m; \ + vec0_m = (v8i16)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \ + vec1_m = (v8i16)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \ + dst0_m = (v16u8)__msa_ilvr_h(vec1_m, vec0_m); \ + dst1_m = (v16u8)__msa_ilvl_h(vec1_m, vec0_m); \ + ST_UB2(dst0_m, dst1_m, pdst_argb, 16); \ + } + +// Takes ARGB input and calculates Y. 
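Before the ARGBTOY macro itself, its effect in scalar form: one u8 dot product over the interleaved B/G bytes (dotp) and one over the R/A bytes (dpadd) sum to a weighted luma plus a rounding bias, then shift down. The weights arrive as macro arguments; the numbers below are libyuv's usual BT.601 ARGBToY values and are an assumption here, not patch content:

#include <stdint.h>

// Y = (25*B + 129*G + 66*R + 0x1080) >> 8  (assumed BT.601 weights)
static uint8_t ARGBToY_sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint8_t)((25 * b + 129 * g + 66 * r + 0x1080) >> 8);
}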
+#define ARGBTOY(argb0, argb1, argb2, argb3, const0, const1, const2, shift, \ + y_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m; \ + v8u16 reg0_m, reg1_m; \ + \ + vec0_m = (v16u8)__msa_pckev_h((v8i16)argb1, (v8i16)argb0); \ + vec1_m = (v16u8)__msa_pckev_h((v8i16)argb3, (v8i16)argb2); \ + vec2_m = (v16u8)__msa_pckod_h((v8i16)argb1, (v8i16)argb0); \ + vec3_m = (v16u8)__msa_pckod_h((v8i16)argb3, (v8i16)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const0); \ + reg1_m = __msa_dotp_u_h(vec1_m, const0); \ + reg0_m = __msa_dpadd_u_h(reg0_m, vec2_m, const1); \ + reg1_m = __msa_dpadd_u_h(reg1_m, vec3_m, const1); \ + reg0_m += const2; \ + reg1_m += const2; \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, shift); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, shift); \ + y_out = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Loads current and next row of ARGB input and averages it to calculate U and V +#define READ_ARGB(s_ptr, t_ptr, argb0, argb1, argb2, argb3) \ + { \ + v16u8 src0_m, src1_m, src2_m, src3_m, src4_m, src5_m, src6_m, src7_m; \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v16u8 vec8_m, vec9_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m, reg5_m, reg6_m, reg7_m; \ + v8u16 reg8_m, reg9_m; \ + \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 0); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 16); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 32); \ + src3_m = (v16u8)__msa_ld_b((v16i8*)s, 48); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 0); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 16); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 32); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 48); \ + vec0_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec1_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec4_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec5_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec0_m, vec0_m); \ + reg1_m = __msa_hadd_u_h(vec1_m, vec1_m); \ + reg2_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg3_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg4_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg5_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg6_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg7_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb0 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb1 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + src0_m = (v16u8)__msa_ld_b((v16i8*)s, 64); \ + src1_m = (v16u8)__msa_ld_b((v16i8*)s, 80); \ + src2_m = (v16u8)__msa_ld_b((v16i8*)s, 96); \ + src3_m = 
(v16u8)__msa_ld_b((v16i8*)s, 112); \ + src4_m = (v16u8)__msa_ld_b((v16i8*)t, 64); \ + src5_m = (v16u8)__msa_ld_b((v16i8*)t, 80); \ + src6_m = (v16u8)__msa_ld_b((v16i8*)t, 96); \ + src7_m = (v16u8)__msa_ld_b((v16i8*)t, 112); \ + vec2_m = (v16u8)__msa_ilvr_b((v16i8)src0_m, (v16i8)src4_m); \ + vec3_m = (v16u8)__msa_ilvr_b((v16i8)src1_m, (v16i8)src5_m); \ + vec4_m = (v16u8)__msa_ilvr_b((v16i8)src2_m, (v16i8)src6_m); \ + vec5_m = (v16u8)__msa_ilvr_b((v16i8)src3_m, (v16i8)src7_m); \ + vec6_m = (v16u8)__msa_ilvl_b((v16i8)src0_m, (v16i8)src4_m); \ + vec7_m = (v16u8)__msa_ilvl_b((v16i8)src1_m, (v16i8)src5_m); \ + vec8_m = (v16u8)__msa_ilvl_b((v16i8)src2_m, (v16i8)src6_m); \ + vec9_m = (v16u8)__msa_ilvl_b((v16i8)src3_m, (v16i8)src7_m); \ + reg0_m = __msa_hadd_u_h(vec2_m, vec2_m); \ + reg1_m = __msa_hadd_u_h(vec3_m, vec3_m); \ + reg2_m = __msa_hadd_u_h(vec4_m, vec4_m); \ + reg3_m = __msa_hadd_u_h(vec5_m, vec5_m); \ + reg4_m = __msa_hadd_u_h(vec6_m, vec6_m); \ + reg5_m = __msa_hadd_u_h(vec7_m, vec7_m); \ + reg6_m = __msa_hadd_u_h(vec8_m, vec8_m); \ + reg7_m = __msa_hadd_u_h(vec9_m, vec9_m); \ + reg8_m = (v8u16)__msa_pckev_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m = (v8u16)__msa_pckev_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg8_m += (v8u16)__msa_pckod_d((v2i64)reg4_m, (v2i64)reg0_m); \ + reg9_m += (v8u16)__msa_pckod_d((v2i64)reg5_m, (v2i64)reg1_m); \ + reg0_m = (v8u16)__msa_pckev_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m = (v8u16)__msa_pckev_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg0_m += (v8u16)__msa_pckod_d((v2i64)reg6_m, (v2i64)reg2_m); \ + reg1_m += (v8u16)__msa_pckod_d((v2i64)reg7_m, (v2i64)reg3_m); \ + reg8_m = (v8u16)__msa_srai_h((v8i16)reg8_m, 2); \ + reg9_m = (v8u16)__msa_srai_h((v8i16)reg9_m, 2); \ + reg0_m = (v8u16)__msa_srai_h((v8i16)reg0_m, 2); \ + reg1_m = (v8u16)__msa_srai_h((v8i16)reg1_m, 2); \ + argb2 = (v16u8)__msa_pckev_b((v16i8)reg9_m, (v16i8)reg8_m); \ + argb3 = (v16u8)__msa_pckev_b((v16i8)reg1_m, (v16i8)reg0_m); \ + } + +// Takes ARGB input and calculates U and V. 
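READ_ARGB above has already box-averaged each 2x2 block across the two input rows, so the ARGBTOUV macro below works on one averaged pixel per chroma sample. In scalar form, U and V are biased differences of dot products; the numbers below are libyuv's usual BT.601 ARGBToUV weights, assumed here for illustration only:

#include <stdint.h>

// U = (112*B - 74*G - 38*R + 0x8080) >> 8
// V = (112*R - 94*G - 18*B + 0x8080) >> 8   (assumed weights)
static void ARGBToUV_sketch(uint8_t b, uint8_t g, uint8_t r,
                            uint8_t* u, uint8_t* v) {
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}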
+#define ARGBTOUV(argb0, argb1, argb2, argb3, const0, const1, const2, const3, \ + shf0, shf1, shf2, shf3, v_out, u_out) \ + { \ + v16u8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ + v8u16 reg0_m, reg1_m, reg2_m, reg3_m; \ + \ + vec0_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb1, (v16i8)argb0); \ + vec1_m = (v16u8)__msa_vshf_b(shf0, (v16i8)argb3, (v16i8)argb2); \ + vec2_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb1, (v16i8)argb0); \ + vec3_m = (v16u8)__msa_vshf_b(shf1, (v16i8)argb3, (v16i8)argb2); \ + vec4_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb1, (v16i8)argb0); \ + vec5_m = (v16u8)__msa_vshf_b(shf2, (v16i8)argb3, (v16i8)argb2); \ + vec6_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb1, (v16i8)argb0); \ + vec7_m = (v16u8)__msa_vshf_b(shf3, (v16i8)argb3, (v16i8)argb2); \ + reg0_m = __msa_dotp_u_h(vec0_m, const1); \ + reg1_m = __msa_dotp_u_h(vec1_m, const1); \ + reg2_m = __msa_dotp_u_h(vec4_m, const1); \ + reg3_m = __msa_dotp_u_h(vec5_m, const1); \ + reg0_m += const3; \ + reg1_m += const3; \ + reg2_m += const3; \ + reg3_m += const3; \ + reg0_m -= __msa_dotp_u_h(vec2_m, const0); \ + reg1_m -= __msa_dotp_u_h(vec3_m, const0); \ + reg2_m -= __msa_dotp_u_h(vec6_m, const2); \ + reg3_m -= __msa_dotp_u_h(vec7_m, const2); \ + v_out = (v16u8)__msa_pckod_b((v16i8)reg1_m, (v16i8)reg0_m); \ + u_out = (v16u8)__msa_pckod_b((v16i8)reg3_m, (v16i8)reg2_m); \ + } + +// Load I444 pixel data +#define READI444(psrc_y, psrc_u, psrc_v, out_y, out_u, out_v) \ + { \ + uint64_t y_m, u_m, v_m; \ + v2i64 zero_m = {0}; \ + y_m = LD(psrc_y); \ + u_m = LD(psrc_u); \ + v_m = LD(psrc_v); \ + out_y = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)y_m); \ + out_u = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)u_m); \ + out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ + } + +void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}; + src += width - 64; + + for (x = 0; x < width; x += 64) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void ARGBMirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 shuffler = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3}; + src += width * 4 - 64; + + for (x = 0; x < width; x += 16) { + LD_UB4(src, 16, src3, src2, src1, src0); + VSHF_B2_UB(src3, src3, src2, src2, shuffler, shuffler, dst3, dst2); + VSHF_B2_UB(src1, src1, src0, src0, shuffler, shuffler, dst1, dst0); + ST_UB4(dst0, dst1, dst2, dst3, dst, 16); + dst += 64; + src -= 64; + } +} + +void I422ToYUY2Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1); + ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3); + ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_yuy2 += 64; + } +} + +void 
I422ToUYVYRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + int x; + v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1; + v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3; + + for (x = 0; x < width; x += 32) { + src_u0 = LD_UB(src_u); + src_v0 = LD_UB(src_v); + LD_UB2(src_y, 16, src_y0, src_y1); + ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1); + ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1); + ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3); + ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16); + src_u += 16; + src_v += 16; + src_y += 32; + dst_uyvy += 64; + } +} + +void I422ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422ToRGBARow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(alpha, vec0, vec1, vec2, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb += 32; + } +} + +void I422AlphaToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int64_t data_a; + v16u8 src0, src1, src2, src3; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v4i32 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + data_a = LD(src_a); + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_insert_d((v2i64)zero, 0, data_a); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + src3 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src3); + 
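+    // src3 now holds the 8 alpha bytes widened by the self-interleave
+    // above (each byte duplicated), so STOREARGB can interleave B, G, R
+    // and per-pixel alpha into 8 ARGB pixels.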
STOREARGB(vec0, vec1, vec2, src3, dst_argb); + src_y += 8; + src_u += 4; + src_v += 4; + src_a += 8; + dst_argb += 32; + } +} + +void I422ToRGB24Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int32_t width) { + int x; + int64_t data_u, data_v; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 reg0, reg1, reg2, reg3; + v2i64 zero = {0}; + v16i8 shuffler0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; + v16i8 shuffler1 = {0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, 8, 25, 9, 10}; + v16i8 shuffler2 = {26, 6, 7, 27, 8, 9, 28, 10, + 11, 29, 12, 13, 30, 14, 15, 31}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_y, 0); + data_u = LD(src_u); + data_v = LD(src_v); + src1 = (v16u8)__msa_insert_d(zero, 0, data_u); + src2 = (v16u8)__msa_insert_d(zero, 0, data_v); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + src3 = (v16u8)__msa_sldi_b((v16i8)src0, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src1, 8); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + YUVTORGB(src3, src4, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec3, vec4, vec5); + reg0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + reg2 = (v16u8)__msa_ilvev_b((v16i8)vec4, (v16i8)vec3); + reg3 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec2); + reg1 = (v16u8)__msa_sldi_b((v16i8)reg2, (v16i8)reg0, 11); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)reg3, (v16i8)reg0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)reg3, (v16i8)reg1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)reg3, (v16i8)reg2); + ST_UB2(dst0, dst1, dst_argb, 16); + ST_UB(dst2, (dst_argb + 32)); + src_y += 16; + src_u += 8; + src_v += 8; + dst_argb += 48; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 5 upper bits of R. +void I422ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec2, vec1); + vec0 = __msa_srai_h(vec0, 3); + vec1 = __msa_srai_h(vec1, 3); + vec2 = __msa_srai_h(vec2, 2); + vec1 = __msa_slli_h(vec1, 11); + vec2 = __msa_slli_h(vec2, 5); + vec0 |= vec1; + dst0 = (v16u8)(vec2 | vec0); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_u += 4; + src_v += 4; + dst_rgb565 += 16; + } +} + +// TODO(fbarchard): Consider AND instead of shift to isolate 4 upper bits of G. 
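+// Packing sketch (vec0/vec1/vec2 carry B/G/R, as in the RGB565 variant
+// above): each ARGB4444 halfword is (A << 12) | (R << 8) | (G << 4) | B.
+// The >>4 below keeps the top 4 bits of each channel, the left shifts
+// reposition them, and the constant 0xF000 supplies the opaque alpha nibble.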
+void I422ToARGB4444Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0xF000 = (v8u16)__msa_fill_h(0xF000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 4); + reg1 = (v8u16)__msa_srai_h(vec1, 4); + reg2 = (v8u16)__msa_srai_h(vec2, 4); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 4); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 8); + reg1 |= const_0xF000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb4444); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb4444 += 16; + } +} + +void I422ToARGB1555Row_MSA(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0; + v8i16 vec0, vec1, vec2; + v8u16 reg0, reg1, reg2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v8u16 const_0x8000 = (v8u16)__msa_fill_h(0x8000); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + READYUV422(src_y, src_u, src_v, src0, src1, src2); + src1 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + reg0 = (v8u16)__msa_srai_h(vec0, 3); + reg1 = (v8u16)__msa_srai_h(vec1, 3); + reg2 = (v8u16)__msa_srai_h(vec2, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)reg1, 5); + reg2 = (v8u16)__msa_slli_h((v8i16)reg2, 10); + reg1 |= const_0x8000; + reg0 |= reg2; + dst0 = (v16u8)(reg1 | reg0); + ST_UB(dst0, dst_argb1555); + src_y += 8; + src_u += 4; + src_v += 4; + dst_argb1555 += 16; + } +} + +void YUY2ToYRow_MSA(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_yuy2 += 64; + dst_y += 32; + } +} + +void YUY2ToUVRow_MSA(const uint8_t* src_yuy2, + int src_stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2_next = src_yuy2 + src_stride_yuy2; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckod_b((v16i8)src7, 
(v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + src_yuy2_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void YUY2ToUV422Row_MSA(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_yuy2, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_yuy2 += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToYRow_MSA(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_y, 16); + src_uyvy += 64; + dst_y += 32; + } +} + +void UYVYToUVRow_MSA(const uint8_t* src_uyvy, + int src_stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvy_next = src_uyvy + src_stride_uyvy; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + src2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + src3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec0 = __msa_aver_u_b(src0, src2); + vec1 = __msa_aver_u_b(src1, src3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + src_uyvy_next += 64; + dst_u += 16; + dst_v += 16; + } +} + +void UYVYToUV422Row_MSA(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + LD_UB4(src_uyvy, 16, src0, src1, src2, src3); + src0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + src1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_uyvy += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16i8 zero = {0}; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = 
(v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + reg0 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec2); + reg3 = (v8u16)__msa_ilvev_b(zero, (v16i8)vec3); + reg4 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec0); + reg5 = (v8u16)__msa_ilvod_b(zero, (v16i8)vec1); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVRow_MSA(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* src_argb0_next = src_argb0 + src_stride_argb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v16u8 dst0, dst1; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0, 112); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 = __msa_hadd_u_h(vec8, vec8); + reg1 = __msa_hadd_u_h(vec9, vec9); + reg2 = __msa_hadd_u_h(vec4, vec4); + reg3 = __msa_hadd_u_h(vec5, vec5); + reg4 = __msa_hadd_u_h(vec0, vec0); + reg5 = __msa_hadd_u_h(vec1, vec1); + src0 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 16); + src2 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 32); + src3 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 48); + src4 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 64); + src5 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 80); + src6 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 96); + src7 = (v16u8)__msa_ld_b((v16u8*)src_argb0_next, 112); + vec0 = 
(v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec2 = (v16u8)__msa_pckev_b((v16i8)src5, (v16i8)src4); + vec3 = (v16u8)__msa_pckev_b((v16i8)src7, (v16i8)src6); + vec4 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec5 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + vec6 = (v16u8)__msa_pckod_b((v16i8)src5, (v16i8)src4); + vec7 = (v16u8)__msa_pckod_b((v16i8)src7, (v16i8)src6); + vec8 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + vec9 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + vec4 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec5 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + vec0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_pckod_b((v16i8)vec3, (v16i8)vec2); + reg0 += __msa_hadd_u_h(vec8, vec8); + reg1 += __msa_hadd_u_h(vec9, vec9); + reg2 += __msa_hadd_u_h(vec4, vec4); + reg3 += __msa_hadd_u_h(vec5, vec5); + reg4 += __msa_hadd_u_h(vec0, vec0); + reg5 += __msa_hadd_u_h(vec1, vec1); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 2); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 2); + reg3 = (v8u16)__msa_srai_h((v8i16)reg3, 2); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 2); + reg6 = reg0 * const_0x70; + reg7 = reg1 * const_0x70; + reg8 = reg2 * const_0x4A; + reg9 = reg3 * const_0x4A; + reg6 += const_0x8080; + reg7 += const_0x8080; + reg8 += reg4 * const_0x26; + reg9 += reg5 * const_0x26; + reg0 *= const_0x12; + reg1 *= const_0x12; + reg2 *= const_0x5E; + reg3 *= const_0x5E; + reg4 *= const_0x70; + reg5 *= const_0x70; + reg2 += reg0; + reg3 += reg1; + reg4 += const_0x8080; + reg5 += const_0x8080; + reg6 -= reg8; + reg7 -= reg9; + reg4 -= reg2; + reg5 -= reg3; + reg6 = (v8u16)__msa_srai_h((v8i16)reg6, 8); + reg7 = (v8u16)__msa_srai_h((v8i16)reg7, 8); + reg4 = (v8u16)__msa_srai_h((v8i16)reg4, 8); + reg5 = (v8u16)__msa_srai_h((v8i16)reg5, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg7, (v16i8)reg6); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb0 += 128; + src_argb0_next += 128; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBToRGB24Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20}; + v16i8 shuffler1 = {5, 6, 8, 9, 10, 12, 13, 14, + 16, 17, 18, 20, 21, 22, 24, 25}; + v16i8 shuffler2 = {10, 12, 13, 14, 16, 17, 18, 20, + 21, 22, 24, 25, 26, 28, 29, 30}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRAWRow_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22}; + v16i8 shuffler1 = {5, 4, 10, 9, 8, 14, 13, 12, + 18, 17, 16, 22, 21, 20, 26, 25}; + v16i8 shuffler2 = {8, 14, 13, 12, 18, 17, 16, 22, + 21, 20, 26, 25, 24, 30, 
29, 28}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src2, (v16i8)src1); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_rgb, 16); + ST_UB(dst2, (dst_rgb + 32)); + src_argb += 64; + dst_rgb += 48; + } +} + +void ARGBToRGB565Row_MSA(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 3); + vec2 = (v16u8)__msa_srai_b((v16i8)src0, 5); + vec4 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec5 = (v16u8)__msa_slli_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_srai_b((v16i8)src1, 5); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec5 = (v16u8)__msa_sldi_b(zero, (v16i8)vec5, 1); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 2); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 2); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec1 = __msa_binsli_b(vec2, vec3, 4); + vec4 = __msa_binsli_b(vec4, vec5, 2); + vec5 = __msa_binsli_b(vec6, vec7, 4); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec4 = (v16u8)__msa_ilvev_b((v16i8)vec5, (v16i8)vec4); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec4, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToARGB1555Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1, dst0; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 3); + vec1 = (v16u8)__msa_slli_b((v16i8)src0, 2); + vec2 = (v16u8)__msa_srai_b((v16i8)vec0, 3); + vec1 = (v16u8)__msa_sldi_b(zero, (v16i8)vec1, 1); + vec2 = (v16u8)__msa_sldi_b(zero, (v16i8)vec2, 1); + vec3 = (v16u8)__msa_srai_b((v16i8)src0, 1); + vec5 = (v16u8)__msa_srai_b((v16i8)src1, 3); + vec6 = (v16u8)__msa_slli_b((v16i8)src1, 2); + vec7 = (v16u8)__msa_srai_b((v16i8)vec5, 3); + vec6 = (v16u8)__msa_sldi_b(zero, (v16i8)vec6, 1); + vec7 = (v16u8)__msa_sldi_b(zero, (v16i8)vec7, 1); + vec8 = (v16u8)__msa_srai_b((v16i8)src1, 1); + vec3 = (v16u8)__msa_sldi_b(zero, (v16i8)vec3, 2); + vec8 = (v16u8)__msa_sldi_b(zero, (v16i8)vec8, 2); + vec4 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 3); + vec9 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 3); + vec0 = __msa_binsli_b(vec0, vec1, 2); + vec5 = __msa_binsli_b(vec5, vec6, 2); + vec1 = __msa_binsli_b(vec2, vec3, 5); + vec6 = __msa_binsli_b(vec7, vec8, 5); + vec1 = __msa_binsli_b(vec1, vec4, 0); + vec6 = __msa_binsli_b(vec6, vec9, 0); + vec0 = (v16u8)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v16u8)__msa_ilvev_b((v16i8)vec6, (v16i8)vec5); + dst0 = (v16u8)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void 
ARGBToARGB4444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { + int x; + v16u8 src0, src1; + v16u8 vec0, vec1; + v16u8 dst0; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_srai_b((v16i8)src0, 4); + vec1 = (v16u8)__msa_srai_b((v16i8)src1, 4); + src0 = (v16u8)__msa_sldi_b(zero, (v16i8)src0, 1); + src1 = (v16u8)__msa_sldi_b(zero, (v16i8)src1, 1); + vec0 = __msa_binsli_b(vec0, src0, 3); + vec1 = __msa_binsli_b(vec1, src1, 3); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBToUV444Row_MSA(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int32_t width) { + int32_t x; + v16u8 src0, src1, src2, src3, reg0, reg1, reg2, reg3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11; + v8u16 const_112 = (v8u16)__msa_ldi_h(112); + v8u16 const_74 = (v8u16)__msa_ldi_h(74); + v8u16 const_38 = (v8u16)__msa_ldi_h(38); + v8u16 const_94 = (v8u16)__msa_ldi_h(94); + v8u16 const_18 = (v8u16)__msa_ldi_h(18); + v8u16 const_32896 = (v8u16)__msa_fill_h(32896); + v16i8 zero = {0}; + + for (x = width; x > 0; x -= 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + reg0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + reg2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + src0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + src1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + src2 = (v16u8)__msa_pckod_b((v16i8)reg1, (v16i8)reg0); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec10 = vec0 * const_18; + vec11 = vec1 * const_18; + vec8 = vec2 * const_94; + vec9 = vec3 * const_94; + vec6 = vec4 * const_112; + vec7 = vec5 * const_112; + vec0 *= const_112; + vec1 *= const_112; + vec2 *= const_74; + vec3 *= const_74; + vec4 *= const_38; + vec5 *= const_38; + vec8 += vec10; + vec9 += vec11; + vec6 += const_32896; + vec7 += const_32896; + vec0 += const_32896; + vec1 += const_32896; + vec2 += vec4; + vec3 += vec5; + vec0 -= vec2; + vec1 -= vec3; + vec6 -= vec8; + vec7 -= vec9; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + vec6 = (v8u16)__msa_srai_h((v8i16)vec6, 8); + vec7 = (v8u16)__msa_srai_h((v8i16)vec7, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + src_argb += 64; + dst_u += 16; + dst_v += 16; + } +} + +void ARGBMultiplyRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + v8i16 zero = {0}; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + vec0 = 
(v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 16); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 16); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 16); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb0 += 16; + src_argb1 += 16; + dst_argb += 16; + } +} + +void ARGBAddRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBSubtractRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + dst0 = __msa_subs_u_b(src0, src2); + dst1 = __msa_subs_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBAttenuateRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4u32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v8i16 zero = {0}; + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src1, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src1, (v16i8)src1); + vec4 = (v8u16)__msa_fill_h(vec0[3]); + vec5 = (v8u16)__msa_fill_h(vec0[7]); + vec6 = (v8u16)__msa_fill_h(vec1[3]); + vec7 = (v8u16)__msa_fill_h(vec1[7]); + vec4 = (v8u16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + vec5 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec6 = (v8u16)__msa_fill_h(vec2[3]); + vec7 = (v8u16)__msa_fill_h(vec2[7]); + vec8 = (v8u16)__msa_fill_h(vec3[3]); + vec9 = (v8u16)__msa_fill_h(vec3[7]); + vec6 = (v8u16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + vec7 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + reg0 = 
(v4u32)__msa_ilvr_h(zero, (v8i16)vec4); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec4); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec5); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec5); + reg4 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec6); + reg5 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec6); + reg6 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec7); + reg7 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec7); + reg0 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg4 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec2); + reg5 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec2); + reg6 *= (v4u32)__msa_ilvr_h(zero, (v8i16)vec3); + reg7 *= (v4u32)__msa_ilvl_h(zero, (v8i16)vec3); + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + reg4 = (v4u32)__msa_srai_w((v4i32)reg4, 24); + reg5 = (v4u32)__msa_srai_w((v4i32)reg5, 24); + reg6 = (v4u32)__msa_srai_w((v4i32)reg6, 24); + reg7 = (v4u32)__msa_srai_w((v4i32)reg7, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec3 = (v8u16)__msa_pckev_h((v8i16)reg7, (v8i16)reg6); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, src0, mask); + dst1 = __msa_bmnz_v(dst1, src1, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBToRGB565DitherRow_MSA(const uint8_t* src_argb, + uint8_t* dst_rgb, + uint32_t dither4, + int width) { + int x; + v16u8 src0, src1, dst0, vec0, vec1; + v8i16 vec_d0; + v8i16 reg0, reg1, reg2; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(0xFF); + + vec_d0 = (v8i16)__msa_fill_w(dither4); + vec_d0 = (v8i16)__msa_ilvr_b(zero, (v16i8)vec_d0); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + reg0 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec0); + reg1 = (v8i16)__msa_ilvev_b(zero, (v16i8)vec1); + reg2 = (v8i16)__msa_ilvod_b(zero, (v16i8)vec0); + reg0 += vec_d0; + reg1 += vec_d0; + reg2 += vec_d0; + reg0 = __msa_maxi_s_h((v8i16)reg0, 0); + reg1 = __msa_maxi_s_h((v8i16)reg1, 0); + reg2 = __msa_maxi_s_h((v8i16)reg2, 0); + reg0 = __msa_min_s_h((v8i16)max, (v8i16)reg0); + reg1 = __msa_min_s_h((v8i16)max, (v8i16)reg1); + reg2 = __msa_min_s_h((v8i16)max, (v8i16)reg2); + reg0 = __msa_srai_h(reg0, 3); + reg2 = __msa_srai_h(reg2, 3); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_slli_h(reg2, 11); + reg1 = __msa_slli_h(reg1, 5); + reg0 |= reg1; + dst0 = (v16u8)(reg0 | reg2); + ST_UB(dst0, dst_rgb); + src_argb += 32; + dst_rgb += 16; + } +} + +void ARGBShuffleRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + v16i8 vec0; + v16i8 shuffler_vec = {0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12}; + int32_t val = LW((int32_t*)shuffler); + + vec0 = (v16i8)__msa_fill_w(val); + shuffler_vec += vec0; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + dst0 = 
(v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src0, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler_vec, (v16i8)src1, (v16i8)src1); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBShadeRow_MSA(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + int x; + v16u8 src0, dst0; + v8u16 vec0, vec1; + v4u32 reg0, reg1, reg2, reg3, rgba_scale; + v8i16 zero = {0}; + + rgba_scale[0] = value; + rgba_scale = (v4u32)__msa_ilvr_b((v16i8)rgba_scale, (v16i8)rgba_scale); + rgba_scale = (v4u32)__msa_ilvr_h(zero, (v8i16)rgba_scale); + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec0); + reg1 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec0); + reg2 = (v4u32)__msa_ilvr_h(zero, (v8i16)vec1); + reg3 = (v4u32)__msa_ilvl_h(zero, (v8i16)vec1); + reg0 *= rgba_scale; + reg1 *= rgba_scale; + reg2 *= rgba_scale; + reg3 *= rgba_scale; + reg0 = (v4u32)__msa_srai_w((v4i32)reg0, 24); + reg1 = (v4u32)__msa_srai_w((v4i32)reg1, 24); + reg2 = (v4u32)__msa_srai_w((v4i32)reg2, 24); + reg3 = (v4u32)__msa_srai_w((v4i32)reg3, 24); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_argb); + src_argb += 16; + dst_argb += 16; + } +} + +void ARGBGrayRow_MSA(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0, dst1; + v8u16 reg0; + v16u8 const_0x26 = (v16u8)__msa_ldi_h(0x26); + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + reg0 = __msa_dotp_u_h(vec0, const_0x4B0F); + reg0 = __msa_dpadd_u_h(reg0, vec1, const_0x26); + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 7); + vec0 = (v16u8)__msa_ilvev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_ilvod_b((v16i8)vec1, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec1, (v16i8)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void ARGBSepiaRow_MSA(uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, dst0, dst1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2; + v16u8 const_0x4411 = (v16u8)__msa_fill_h(0x4411); + v16u8 const_0x23 = (v16u8)__msa_ldi_h(0x23); + v16u8 const_0x5816 = (v16u8)__msa_fill_h(0x5816); + v16u8 const_0x2D = (v16u8)__msa_ldi_h(0x2D); + v16u8 const_0x6218 = (v16u8)__msa_fill_h(0x6218); + v16u8 const_0x32 = (v16u8)__msa_ldi_h(0x32); + v8u16 const_0xFF = (v8u16)__msa_ldi_h(0xFF); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16u8*)dst_argb, 16); + vec0 = (v16u8)__msa_pckev_h((v8i16)src1, (v8i16)src0); + vec1 = (v16u8)__msa_pckod_h((v8i16)src1, (v8i16)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec1); + reg0 = (v8u16)__msa_dotp_u_h(vec0, const_0x4411); + reg1 = (v8u16)__msa_dotp_u_h(vec0, const_0x5816); + reg2 = (v8u16)__msa_dotp_u_h(vec0, const_0x6218); + reg0 = (v8u16)__msa_dpadd_u_h(reg0, vec1, const_0x23); + reg1 = (v8u16)__msa_dpadd_u_h(reg1, vec1, const_0x2D); + reg2 = 
(v8u16)__msa_dpadd_u_h(reg2, vec1, const_0x32); + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 7); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 7); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 7); + reg1 = (v8u16)__msa_min_u_h((v8u16)reg1, const_0xFF); + reg2 = (v8u16)__msa_min_u_h((v8u16)reg2, const_0xFF); + vec0 = (v16u8)__msa_pckev_b((v16i8)reg0, (v16i8)reg0); + vec1 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg1); + vec2 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg2); + vec4 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + vec5 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec5, (v16i8)vec4); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec5, (v16i8)vec4); + ST_UB2(dst0, dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ARGB4444ToARGBRow_MSA(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1; + v8u16 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 0); + src1 = (v16u8)__msa_ld_b((const v16u8*)src_argb4444, 16); + vec0 = (v8u16)__msa_andi_b(src0, 0x0F); + vec1 = (v8u16)__msa_andi_b(src1, 0x0F); + vec2 = (v8u16)__msa_andi_b(src0, 0xF0); + vec3 = (v8u16)__msa_andi_b(src1, 0xF0); + vec0 |= (v8u16)__msa_slli_b((v16i8)vec0, 4); + vec1 |= (v8u16)__msa_slli_b((v16i8)vec1, 4); + vec2 |= (v8u16)__msa_srli_b((v16i8)vec2, 4); + vec3 |= (v8u16)__msa_srli_b((v16i8)vec3, 4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb4444 += 32; + dst_argb += 64; + } +} + +void ARGB1555ToARGBRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6; + v16u8 dst0, dst1, dst2, dst3; + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + src0 = (v8u16)__msa_srli_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srli_h((v8i16)src1, 5); + reg0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + reg1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + reg2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + reg3 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + reg4 = (v16u8)__msa_slli_b((v16i8)reg0, 3); + reg5 = (v16u8)__msa_slli_b((v16i8)reg1, 3); + reg6 = (v16u8)__msa_slli_b((v16i8)reg2, 3); + reg4 |= (v16u8)__msa_srai_b((v16i8)reg0, 2); + reg5 |= (v16u8)__msa_srai_b((v16i8)reg1, 2); + reg6 |= (v16u8)__msa_srai_b((v16i8)reg2, 2); + reg3 = -reg3; + reg0 = (v16u8)__msa_ilvr_b((v16i8)reg6, (v16i8)reg4); + reg1 = (v16u8)__msa_ilvl_b((v16i8)reg6, (v16i8)reg4); + reg2 = (v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg5); + reg3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg5); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg2, (v16i8)reg0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg2, (v16i8)reg0); + dst2 = 
(v16u8)__msa_ilvr_b((v16i8)reg3, (v16i8)reg1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg3, (v16i8)reg1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_argb1555 += 32; + dst_argb += 64; + } +} + +void RGB565ToARGBRow_MSA(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_h((const v8u16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 = (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + res0 = (v16u8)__msa_ilvev_b((v16i8)reg2, (v16i8)reg0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg1); + res2 = (v16u8)__msa_ilvev_b((v16i8)reg5, (v16i8)reg3); + res3 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)reg4); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb565 += 32; + dst_argb += 64; + } +} + +void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 shuffler = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_rgb24, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_rgb24 += 48; + dst_argb += 64; + } +} + +void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, src1, src2; + v16u8 vec0, vec1, vec2; + v16u8 dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16i8 mask = {2, 1, 0, 16, 5, 4, 3, 17, 8, 7, 6, 18, 11, 10, 9, 19}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const 
v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + vec0 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 12); + vec1 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + vec2 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src2, 4); + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec1); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)alpha, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_raw += 48; + dst_argb += 64; + } +} + +void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v16u8 dst0; + v8u16 const_0x19 = (v8u16)__msa_ldi_h(0x19); + v8u16 const_0x81 = (v8u16)__msa_ldi_h(0x81); + v8u16 const_0x42 = (v8u16)__msa_ldi_h(0x42); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_argb1555, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_slli_h((v8i16)vec1, 3); + reg0 |= (v8u16)__msa_srai_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srai_h((v8i16)vec1, 2); + reg2 = (v8u16)__msa_slli_h((v8i16)vec2, 3); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg2 |= (v8u16)__msa_srai_h((v8i16)vec2, 2); + reg3 |= (v8u16)__msa_srai_h((v8i16)vec3, 2); + reg4 = (v8u16)__msa_slli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_slli_h((v8i16)vec5, 3); + reg4 |= (v8u16)__msa_srai_h((v8i16)vec4, 2); + reg5 |= (v8u16)__msa_srai_h((v8i16)vec5, 2); + reg0 *= const_0x19; + reg1 *= const_0x19; + reg2 *= const_0x81; + reg3 *= const_0x81; + reg4 *= const_0x42; + reg5 *= const_0x42; + reg0 += reg2; + reg1 += reg3; + reg0 += reg4; + reg1 += reg5; + reg0 += const_0x1080; + reg1 += const_0x1080; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg1 = (v8u16)__msa_srai_h((v8i16)reg1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_y); + src_argb1555 += 32; + dst_y += 16; + } +} + +void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + int x; + v8u16 src0, src1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5; + v4u32 res0, res1, res2, res3; + v16u8 dst0; + v4u32 const_0x810019 = (v4u32)__msa_fill_w(0x810019); + v4u32 const_0x010042 = (v4u32)__msa_fill_w(0x010042); + v8i16 const_0x1080 = __msa_fill_h(0x1080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x7E0 = (v8u16)__msa_fill_h(0x7E0); + v8u16 const_0xF800 = (v8u16)__msa_fill_h(0xF800); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 0); + src1 = (v8u16)__msa_ld_b((const v8i16*)src_rgb565, 16); + vec0 = src0 & const_0x1F; + vec1 = src0 & const_0x7E0; + vec2 = src0 & const_0xF800; + vec3 = src1 & const_0x1F; + vec4 = src1 & const_0x7E0; + vec5 = src1 & const_0xF800; + reg0 = (v8u16)__msa_slli_h((v8i16)vec0, 3); + reg1 = (v8u16)__msa_srli_h((v8i16)vec1, 3); + reg2 
= (v8u16)__msa_srli_h((v8i16)vec2, 8); + reg3 = (v8u16)__msa_slli_h((v8i16)vec3, 3); + reg4 = (v8u16)__msa_srli_h((v8i16)vec4, 3); + reg5 = (v8u16)__msa_srli_h((v8i16)vec5, 8); + reg0 |= (v8u16)__msa_srli_h((v8i16)vec0, 2); + reg1 |= (v8u16)__msa_srli_h((v8i16)vec1, 9); + reg2 |= (v8u16)__msa_srli_h((v8i16)vec2, 13); + reg3 |= (v8u16)__msa_srli_h((v8i16)vec3, 2); + reg4 |= (v8u16)__msa_srli_h((v8i16)vec4, 9); + reg5 |= (v8u16)__msa_srli_h((v8i16)vec5, 13); + vec0 = (v8u16)__msa_ilvr_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_ilvl_h((v8i16)reg1, (v8i16)reg0); + vec2 = (v8u16)__msa_ilvr_h((v8i16)reg4, (v8i16)reg3); + vec3 = (v8u16)__msa_ilvl_h((v8i16)reg4, (v8i16)reg3); + vec4 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg2); + vec5 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg2); + vec6 = (v8u16)__msa_ilvr_h(const_0x1080, (v8i16)reg5); + vec7 = (v8u16)__msa_ilvl_h(const_0x1080, (v8i16)reg5); + res0 = __msa_dotp_u_w(vec0, (v8u16)const_0x810019); + res1 = __msa_dotp_u_w(vec1, (v8u16)const_0x810019); + res2 = __msa_dotp_u_w(vec2, (v8u16)const_0x810019); + res3 = __msa_dotp_u_w(vec3, (v8u16)const_0x810019); + res0 = __msa_dpadd_u_w(res0, vec4, (v8u16)const_0x010042); + res1 = __msa_dpadd_u_w(res1, vec5, (v8u16)const_0x010042); + res2 = __msa_dpadd_u_w(res2, vec6, (v8u16)const_0x010042); + res3 = __msa_dpadd_u_w(res3, vec7, (v8u16)const_0x010042); + res0 = (v4u32)__msa_srai_w((v4i32)res0, 8); + res1 = (v4u32)__msa_srai_w((v4i32)res1, 8); + res2 = (v4u32)__msa_srai_w((v4i32)res2, 8); + res3 = (v4u32)__msa_srai_w((v4i32)res3, 8); + vec0 = (v8u16)__msa_pckev_h((v8i16)res1, (v8i16)res0); + vec1 = (v8u16)__msa_pckev_h((v8i16)res3, (v8i16)res2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_rgb565 += 32; + dst_y += 16; + } +} + +void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8119 = (v8u16)__msa_fill_h(0x8119); + v8u16 const_0x42 = (v8u16)__msa_fill_h(0x42); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8119); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8119); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x42); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x42); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, 
(v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, reg0, reg1, reg2, reg3, dst0; + v8u16 vec0, vec1, vec2, vec3; + v8u16 const_0x8142 = (v8u16)__msa_fill_h(0x8142); + v8u16 const_0x19 = (v8u16)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + v16i8 mask0 = {0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; + v16i8 mask1 = {12, 13, 14, 15, 15, 16, 17, 18, + 18, 19, 20, 21, 21, 22, 23, 24}; + v16i8 mask2 = {8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, 20}; + v16i8 mask3 = {4, 5, 6, 7, 7, 8, 9, 10, 10, 11, 12, 13, 13, 14, 15, 16}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + reg0 = (v16u8)__msa_vshf_b(mask0, zero, (v16i8)src0); + reg1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + reg2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src2, (v16i8)src1); + reg3 = (v16u8)__msa_vshf_b(mask3, zero, (v16i8)src2); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec3 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = __msa_dotp_u_h((v16u8)vec0, (v16u8)const_0x8142); + vec1 = __msa_dotp_u_h((v16u8)vec1, (v16u8)const_0x8142); + vec0 = __msa_dpadd_u_h(vec0, (v16u8)vec2, (v16u8)const_0x19); + vec1 = __msa_dpadd_u_h(vec1, (v16u8)vec3, (v16u8)const_0x19); + vec0 += const_0x1080; + vec1 += const_0x1080; + vec0 = (v8u16)__msa_srai_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srai_h((v8i16)vec1, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_y); + src_argb0 += 48; + dst_y += 16; + } +} + +void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_argb1555; + const uint16_t* t = (const uint16_t*)(src_argb1555 + src_stride_argb1555); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x1F; + vec3 = src1 & const_0x1F; + vec2 += src2 & const_0x1F; + vec3 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = 
(v8u16)__msa_srai_h((v8i16)src3, 5); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec4 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec4 = __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec6 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec6 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec0 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec0 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + vec2 = (v8u16)__msa_slli_h((v8i16)vec4, 1); + vec2 |= (v8u16)__msa_srai_h((v8i16)vec4, 6); + reg0 = vec6 * const_0x70; + reg1 = vec0 * const_0x4A; + reg2 = vec2 * const_0x70; + reg3 = vec0 * const_0x5E; + reg0 += const_0x8080; + reg1 += vec2 * const_0x26; + reg2 += const_0x8080; + reg3 += vec6 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB565ToUVRow_MSA(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint16_t* s = (const uint16_t*)src_rgb565; + const uint16_t* t = (const uint16_t*)(src_rgb565 + src_stride_rgb565); + int64_t res0, res1; + v8u16 src0, src1, src2, src3, reg0, reg1, reg2, reg3; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_ldi_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_ldi_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_ldi_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_ldi_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_ldi_h(0x12); + v8u16 const_32896 = (v8u16)__msa_fill_h(0x8080); + v8u16 const_0x1F = (v8u16)__msa_ldi_h(0x1F); + v8u16 const_0x3F = (v8u16)__msa_fill_h(0x3F); + + for (x = 0; x < width; x += 16) { + src0 = (v8u16)__msa_ld_b((v8i16*)s, 0); + src1 = (v8u16)__msa_ld_b((v8i16*)s, 16); + src2 = (v8u16)__msa_ld_b((v8i16*)t, 0); + src3 = (v8u16)__msa_ld_b((v8i16*)t, 16); + vec0 = src0 & const_0x1F; + vec1 = src1 & const_0x1F; + vec0 += src2 & const_0x1F; + vec1 += src3 & const_0x1F; + vec0 = (v8u16)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 5); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 5); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 5); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 5); + vec2 = src0 & const_0x3F; + vec3 = src1 & const_0x3F; + vec2 += src2 & const_0x3F; + vec3 += src3 & const_0x3F; + vec1 = (v8u16)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + src0 = (v8u16)__msa_srai_h((v8i16)src0, 6); + src1 = (v8u16)__msa_srai_h((v8i16)src1, 6); + src2 = (v8u16)__msa_srai_h((v8i16)src2, 6); + src3 = (v8u16)__msa_srai_h((v8i16)src3, 6); + vec4 = src0 & const_0x1F; + vec5 = src1 & const_0x1F; + vec4 += src2 & const_0x1F; + vec5 += src3 & const_0x1F; + vec2 = (v8u16)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_slli_h((v8i16)vec0, 1); + vec3 |= (v8u16)__msa_srai_h((v8i16)vec0, 6); + vec4 = (v8u16)__msa_slli_h((v8i16)vec2, 1); + vec4 |= (v8u16)__msa_srai_h((v8i16)vec2, 6); + reg0 = vec3 * const_0x70; + reg1 = vec1 * const_0x4A; + reg2 = vec4 * const_0x70; + reg3 = vec1 * const_0x5E; + reg0 += 
const_32896; + reg1 += vec4 * const_0x26; + reg2 += const_32896; + reg3 += vec3 * const_0x12; + reg0 -= reg1; + reg2 -= reg3; + reg0 = (v8u16)__msa_srai_h((v8i16)reg0, 8); + reg2 = (v8u16)__msa_srai_h((v8i16)reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + s += 16; + t += 16; + dst_u += 8; + dst_v += 8; + } +} + +void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = 
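// Illustrative scalar reference, not part of this patch: RGB565 packs
// B/G/R as 5/6/5 bits and ARGB1555 as 5/5/5 plus one alpha bit, so these
// kernels first widen each field back to 8 bits by replicating its top
// bits into the low bits (the MSA code above fuses an equivalent scaling
// with the 2x2 averaging). Per-pixel sketch (hypothetical helper name):
#include <stdint.h>
static void RGB565Unpack_Sketch(uint16_t p, uint8_t* b, uint8_t* g,
                                uint8_t* r) {
  uint8_t b5 = p & 0x1f;          // bits 4:0
  uint8_t g6 = (p >> 5) & 0x3f;   // bits 10:5
  uint8_t r5 = (p >> 11) & 0x1f;  // bits 15:11
  *b = (uint8_t)((b5 << 3) | (b5 >> 2));
  *g = (uint8_t)((g6 << 2) | (g6 >> 4));
  *r = (uint8_t)((r5 << 3) | (r5 >> 2));
}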
(v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h((v8i16)reg0, 2); + reg1 = __msa_srai_h((v8i16)reg1, 2); + reg2 = __msa_srai_h((v8i16)reg2, 2); + reg3 = __msa_srai_h((v8i16)reg3, 2); + vec4 = (v8u16)__msa_pckev_h(reg1, reg0); + vec5 = (v8u16)__msa_pckev_h(reg3, reg2); + vec6 = (v8u16)__msa_pckod_h(reg1, reg0); + vec7 = (v8u16)__msa_pckod_h(reg3, reg2); + vec0 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void RAWToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + int64_t res0, res1; + v16u8 inp0, inp1, inp2, inp3, inp4, inp5; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8i16 reg0, reg1, reg2, reg3; + v16u8 dst0; + v8u16 const_0x70 = (v8u16)__msa_fill_h(0x70); + v8u16 const_0x4A = (v8u16)__msa_fill_h(0x4A); + v8u16 const_0x26 = (v8u16)__msa_fill_h(0x26); + v8u16 const_0x5E = (v8u16)__msa_fill_h(0x5E); + v8u16 const_0x12 = (v8u16)__msa_fill_h(0x12); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + v16i8 mask = {0, 1, 2, 16, 3, 4, 5, 17, 6, 7, 8, 18, 9, 10, 11, 19}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 16) { + inp0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + inp1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + inp2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + inp3 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + inp4 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + inp5 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src1 = (v16u8)__msa_sldi_b((v16i8)inp1, (v16i8)inp0, 12); + src5 = (v16u8)__msa_sldi_b((v16i8)inp4, (v16i8)inp3, 12); + src2 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp1, 8); + src6 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp4, 8); + src3 = (v16u8)__msa_sldi_b((v16i8)inp2, (v16i8)inp2, 4); + src7 = (v16u8)__msa_sldi_b((v16i8)inp5, (v16i8)inp5, 4); + src0 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp0); + src1 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src1); + src2 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src2); + src3 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src3); + src4 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)inp3); + src5 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src5); + src6 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src6); + src7 = (v16u8)__msa_vshf_b(mask, (v16i8)zero, (v16i8)src7); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src4, (v16i8)src0); + vec1 = 
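// Illustrative note, not part of this patch: for 4:2:0 chroma the
// RGB24/RAW UV kernels average each 2x2 block (two pixels from row s, two
// from row t) before the U/V dot products. This path sums the four samples
// and shifts right by 2, so its result can differ by one LSB from the
// paired rounding averages in libyuv's C reference. Sketch (hypothetical
// helper name):
#include <stdint.h>
static uint8_t Average2x2_Sketch(uint8_t a, uint8_t b, uint8_t c,
                                 uint8_t d) {
  return (uint8_t)(((int)a + b + c + d) >> 2);  // truncating, as above
}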
(v8u16)__msa_ilvl_b((v16i8)src4, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src5, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src5, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)src6, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b((v16i8)src6, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b((v16i8)src7, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b((v16i8)src7, (v16i8)src3); + vec0 = (v8u16)__msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8u16)__msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8u16)__msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8u16)__msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8u16)__msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec6 = (v8u16)__msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec7 = (v8u16)__msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + reg0 = (v8i16)__msa_pckev_d((v2i64)vec1, (v2i64)vec0); + reg1 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec2); + reg2 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec4); + reg3 = (v8i16)__msa_pckev_d((v2i64)vec7, (v2i64)vec6); + reg0 += (v8i16)__msa_pckod_d((v2i64)vec1, (v2i64)vec0); + reg1 += (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec2); + reg2 += (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec4); + reg3 += (v8i16)__msa_pckod_d((v2i64)vec7, (v2i64)vec6); + reg0 = __msa_srai_h(reg0, 2); + reg1 = __msa_srai_h(reg1, 2); + reg2 = __msa_srai_h(reg2, 2); + reg3 = __msa_srai_h(reg3, 2); + vec4 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec5 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec6 = (v8u16)__msa_pckod_h((v8i16)reg1, (v8i16)reg0); + vec7 = (v8u16)__msa_pckod_h((v8i16)reg3, (v8i16)reg2); + vec0 = (v8u16)__msa_pckod_h((v8i16)vec5, (v8i16)vec4); + vec1 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + vec2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + vec3 = vec0 * const_0x70; + vec4 = vec1 * const_0x4A; + vec5 = vec2 * const_0x26; + vec2 *= const_0x70; + vec1 *= const_0x5E; + vec0 *= const_0x12; + reg0 = __msa_subv_h((v8i16)vec3, (v8i16)vec4); + reg1 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec5); + reg2 = __msa_subv_h((v8i16)vec2, (v8i16)vec1); + reg3 = __msa_subv_h((v8i16)const_0x8080, (v8i16)vec0); + reg0 += reg1; + reg2 += reg3; + reg0 = __msa_srai_h(reg0, 8); + reg2 = __msa_srai_h(reg2, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg2, (v16i8)reg0); + res0 = __msa_copy_u_d((v2i64)dst0, 0); + res1 = __msa_copy_u_d((v2i64)dst0, 1); + SD(res0, dst_u); + SD(res1, dst_v); + t += 48; + s += 48; + dst_u += 8; + dst_v += 8; + } +} + +void NV12ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + 
dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_uv += 8; + dst_argb += 32; + } +} + +void NV12ToRGB565Row_MSA(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_uv); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + vec0 = vec0 >> 3; + vec1 = (vec1 >> 2) << 5; + vec2 = (vec2 >> 3) << 11; + dst0 = (v16u8)(vec0 | vec1 | vec2); + ST_UB(dst0, dst_rgb565); + src_y += 8; + src_uv += 8; + dst_rgb565 += 16; + } +} + +void NV21ToARGBRow_MSA(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + uint64_t val0, val1; + v16u8 src0, src1, res0, res1, dst0, dst1; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v16u8 zero = {0}; + v16i8 shuffler = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + val0 = LD(src_y); + val1 = LD(src_vu); + src0 = (v16u8)__msa_insert_d((v2i64)zero, 0, val0); + src1 = (v16u8)__msa_insert_d((v2i64)zero, 0, val1); + src1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + YUVTORGB(src0, src1, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + res0 = (v16u8)__msa_ilvev_b((v16i8)vec2, (v16i8)vec0); + res1 = (v16u8)__msa_ilvev_b((v16i8)alpha, (v16i8)vec1); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res1, (v16i8)res0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res1, (v16i8)res0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_vu += 8; + dst_argb += 32; + } +} + +void SobelRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, dst0, dst1, dst2, dst3; + v16i8 mask0 = {0, 0, 0, 16, 1, 1, 1, 16, 2, 2, 2, 16, 3, 3, 3, 16}; + v16i8 const_0x4 = __msa_ldi_b(0x4); + v16i8 mask1 = mask0 + const_0x4; + v16i8 mask2 = mask1 + const_0x4; + v16i8 mask3 = mask2 + const_0x4; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + dst0 = (v16u8)__msa_vshf_b(mask0, (v16i8)alpha, (v16i8)vec0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)alpha, (v16i8)vec0); + dst2 = (v16u8)__msa_vshf_b(mask2, (v16i8)alpha, (v16i8)vec0); + dst3 = (v16u8)__msa_vshf_b(mask3, (v16i8)alpha, (v16i8)vec0); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx 
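// Illustrative scalar reference, not part of this patch: NV12ToRGB565Row_MSA
// converts YUV to 8-bit B/G/R (vec0/vec1/vec2 above) and then packs 5/6/5
// bits per channel, which is exactly (hypothetical helper name):
#include <stdint.h>
static uint16_t PackRGB565_Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}
// NV12 itself is semi-planar: a full-resolution Y plane followed by one
// half-resolution plane of interleaved U,V pairs. NV21 stores V,U instead,
// which is why NV21ToARGBRow_MSA byte-swaps src_vu with a shuffle first.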
+= 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void SobelToPlaneRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 16); + dst0 = __msa_adds_u_b(src0, src2); + dst1 = __msa_adds_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_y, 16); + src_sobelx += 32; + src_sobely += 32; + dst_y += 32; + } +} + +void SobelXYRow_MSA(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, vec0, vec1, vec2; + v16u8 reg0, reg1, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_sobelx, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_sobely, 0); + vec0 = __msa_adds_u_b(src0, src1); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src1); + reg0 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)vec0); + reg1 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)vec0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)vec1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)vec1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)vec2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)vec2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_sobelx += 16; + src_sobely += 16; + dst_argb += 64; + } +} + +void ARGBToYJRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4B0F = (v16u8)__msa_fill_h(0x4B0F); + v16u8 const_0x26 = (v16u8)__msa_fill_h(0x26); + v8u16 const_0x40 = (v8u16)__msa_fill_h(0x40); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4B0F, const_0x26, const_0x40, 7, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x4200 = (v16u8)__msa_fill_h(0x4200); + v16u8 const_0x1981 = (v16u8)__msa_fill_h(0x1981); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x4200, const_0x1981, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x8142 = (v16u8)__msa_fill_h(0x8142); + v16u8 const_0x19 = (v16u8)__msa_fill_h(0x19); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = 
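// Illustrative scalar reference, not part of this patch: SobelRow_MSA
// merges the two gradient planes with a saturating add and replicates the
// result into gray ARGB (the adds_u_b plus shuffle-with-alpha above).
// Per-pixel sketch (hypothetical helper name):
#include <stdint.h>
static void SobelPixel_Sketch(uint8_t sx, uint8_t sy, uint8_t dst_argb[4]) {
  int s = sx + sy;
  uint8_t v = (uint8_t)(s > 255 ? 255 : s);  // saturate like adds_u_b
  dst_argb[0] = v;    // B
  dst_argb[1] = v;    // G
  dst_argb[2] = v;    // R
  dst_argb[3] = 255;  // A
}
// SobelToPlaneRow_MSA is the same saturating add without the ARGB expansion.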
(v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x8142, const_0x19, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width) { + int x; + v16u8 src0, src1, src2, src3, dst0; + v16u8 const_0x1900 = (v16u8)__msa_fill_h(0x1900); + v16u8 const_0x4281 = (v16u8)__msa_fill_h(0x4281); + v8u16 const_0x1080 = (v8u16)__msa_fill_h(0x1080); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 48); + ARGBTOY(src0, src1, src2, src3, const_0x1900, const_0x4281, const_0x1080, 8, + dst0); + ST_UB(dst0, dst_y); + src_argb0 += 64; + dst_y += 16; + } +} + +void ARGBToUVJRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7; + v16u8 vec0, vec1, vec2, vec3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x7F = (v16u8)__msa_fill_h(0x7F); + v16u8 const_0x6B14 = (v16u8)__msa_fill_h(0x6B14); + v16u8 const_0x2B54 = (v16u8)__msa_fill_h(0x2B54); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((const v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((const v16i8*)t, 48); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec0 = __msa_aver_u_b(src4, src6); + vec1 = __msa_aver_u_b(src5, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)s, 64); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 80); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 96); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 112); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 64); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 80); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 96); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 112); + src0 = __msa_aver_u_b(src0, src4); + src1 = __msa_aver_u_b(src1, src5); + src2 = __msa_aver_u_b(src2, src6); + src3 = __msa_aver_u_b(src3, src7); + src4 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + src5 = (v16u8)__msa_pckev_w((v4i32)src3, (v4i32)src2); + src6 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + src7 = (v16u8)__msa_pckod_w((v4i32)src3, (v4i32)src2); + vec2 = __msa_aver_u_b(src4, src6); + vec3 = __msa_aver_u_b(src5, src7); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x6B14, const_0x7F, const_0x2B54, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, 
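// Illustrative scalar reference, not part of this patch: ARGBToYJRow_MSA
// above is the full-range ("JPEG") luma
//   YJ = (38*R + 75*G + 15*B + 64) >> 7
// (0x4B0F packs the G/B weights, 0x26 the R weight), with no +16 offset.
// The BGRA/ABGR/RGBA variants apply the same studio-swing weights as
// ARGBToYRow; only the coefficient packing changes to match each byte
// order. Sketch (hypothetical helper name):
#include <stdint.h>
static uint8_t RGBToYJ_Sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);
}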
dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void BGRAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x264A); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_v); + ST_UB(dst1, dst_u); + s += 128; + t += 128; + dst_v += 16; + dst_u += 16; + } +} + +void ABGRToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30}; + v16u8 const_0x4A26 = (v16u8)__msa_fill_h(0x4A26); + v16u8 const_0x0070 = (v16u8)__msa_fill_h(0x0070); + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, src0, src1, src2, src3); + ARGBTOUV(src0, src1, src2, src3, const_0x4A26, const_0x0070, const_0x125E, + const_0x8080, shuffler1, shuffler0, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void RGBAToUVRow_MSA(const uint8_t* src_rgb0, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + const uint8_t* s = src_rgb0; + const uint8_t* t = src_rgb0 + src_stride_rgb; + v16u8 dst0, dst1, vec0, vec1, vec2, vec3; + v16i8 shuffler0 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29}; + v16i8 shuffler1 = {2, 3, 6, 7, 10, 11, 14, 15, + 18, 19, 22, 23, 26, 27, 30, 31}; + v16i8 shuffler2 = {0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31}; + v16i8 shuffler3 = {2, 1, 6, 5, 10, 9, 14, 13, 18, 17, 22, 21, 26, 25, 30, 29}; + v16u8 const_0x125E = (v16u8)__msa_fill_h(0x264A); + v16u8 const_0x7000 = (v16u8)__msa_fill_h(0x7000); + v16u8 const_0x264A = (v16u8)__msa_fill_h(0x125E); + v8u16 const_0x8080 = (v8u16)__msa_fill_h(0x8080); + + for (x = 0; x < width; x += 32) { + READ_ARGB(s, t, vec0, vec1, vec2, vec3); + ARGBTOUV(vec0, vec1, vec2, vec3, const_0x125E, const_0x7000, const_0x264A, + const_0x8080, shuffler0, shuffler1, shuffler2, shuffler3, dst0, + dst1); + ST_UB(dst0, dst_u); + ST_UB(dst1, dst_v); + s += 128; + t += 128; + dst_u += 16; + dst_v += 16; + } +} + +void I444ToARGBRow_MSA(const uint8_t* src_y, 
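// Illustrative worked equations, not part of this patch: the YuvConstants
// used by the *ToARGB kernels are fixed-point encodings of the familiar
// BT.601 studio-swing conversion, roughly
//   R = 1.164*(Y-16) + 1.596*(V-128)
//   G = 1.164*(Y-16) - 0.813*(V-128) - 0.391*(U-128)
//   B = 1.164*(Y-16) + 2.018*(U-128)
// I444ToARGBRow_MSA below computes exactly this shape in integers:
// y * vec_yg >> 16, plus the per-channel bias (vec_bb/vec_bg/vec_br),
// minus or plus the scaled U and V terms, then an arithmetic >> 6 and a
// clamp to [0, 255].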
+ const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2, dst0, dst1; + v8u16 vec0, vec1, vec2; + v4i32 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 zero = {0}; + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + + for (x = 0; x < width; x += 8) { + READI444(src_y, src_u, src_v, src0, src1, src2); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg4 = reg0 + vec_br; + reg5 = reg1 + vec_br; + reg2 = reg0 + vec_bg; + reg3 = reg1 + vec_bg; + reg0 += vec_bb; + reg1 += vec_bb; + vec0 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec1 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src2); + reg6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + reg7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + reg8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + reg9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + reg0 -= reg6 * vec_ub; + reg1 -= reg7 * vec_ub; + reg2 -= reg6 * vec_ug; + reg3 -= reg7 * vec_ug; + reg4 -= reg8 * vec_vr; + reg5 -= reg9 * vec_vr; + reg2 -= reg8 * vec_vg; + reg3 -= reg9 * vec_vg; + reg0 = __msa_srai_w(reg0, 6); + reg1 = __msa_srai_w(reg1, 6); + reg2 = __msa_srai_w(reg2, 6); + reg3 = __msa_srai_w(reg3, 6); + reg4 = __msa_srai_w(reg4, 6); + reg5 = __msa_srai_w(reg5, 6); + CLIP_0TO255(reg0, reg1, reg2, reg3, reg4, reg5); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec2 = (v8u16)__msa_pckev_h((v8i16)reg5, (v8i16)reg4); + vec0 = (v8u16)__msa_ilvev_b((v16i8)vec1, (v16i8)vec0); + vec1 = (v8u16)__msa_ilvev_b((v16i8)alpha, (v16i8)vec2); + dst0 = (v16u8)__msa_ilvr_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v16u8)__msa_ilvl_h((v8i16)vec1, (v8i16)vec0); + ST_UB2(dst0, dst1, dst_argb, 16); + src_y += 8; + src_u += 8; + src_v += 8; + dst_argb += 32; + } +} + +void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1; + v4i32 reg0, reg1, reg2, reg3; + v4i32 vec_yg = __msa_fill_w(0x4A35); + v8i16 vec_ygb = __msa_fill_h(0xFB78); + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + v8i16 max = __msa_ldi_h(0xFF); + v8i16 zero = {0}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + reg0 = (v4i32)__msa_ilvr_h(zero, vec0); + reg1 = (v4i32)__msa_ilvl_h(zero, vec0); + reg2 = (v4i32)__msa_ilvr_h(zero, vec1); + reg3 = (v4i32)__msa_ilvl_h(zero, vec1); + reg0 *= vec_yg; + reg1 *= vec_yg; + reg2 *= vec_yg; + reg3 *= vec_yg; + reg0 = __msa_srai_w(reg0, 16); + reg1 = __msa_srai_w(reg1, 16); + reg2 = __msa_srai_w(reg2, 16); + reg3 = __msa_srai_w(reg3, 16); + vec0 = (v8i16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8i16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + vec0 += vec_ygb; + vec1 += vec_ygb; + vec0 = __msa_srai_h(vec0, 6); + vec1 = __msa_srai_h(vec1, 6); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, 
vec0); + vec1 = __msa_min_s_h(max, vec1); + res0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + res1 = (v16u8)__msa_ilvr_b((v16i8)res0, (v16i8)res0); + res2 = (v16u8)__msa_ilvl_b((v16i8)res0, (v16i8)res0); + res3 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)res0); + res4 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)res0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)res3, (v16i8)res1); + dst1 = (v16u8)__msa_ilvl_b((v16i8)res3, (v16i8)res1); + dst2 = (v16u8)__msa_ilvr_b((v16i8)res4, (v16i8)res2); + dst3 = (v16u8)__msa_ilvl_b((v16i8)res4, (v16i8)res2); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void J400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { + int x; + v16u8 src0, vec0, vec1, vec2, vec3, dst0, dst1, dst2, dst3; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y, 0); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_ilvl_b((v16i8)src0, (v16i8)src0); + vec2 = (v16u8)__msa_ilvr_b((v16i8)alpha, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)alpha, (v16i8)src0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)vec2, (v16i8)vec0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)vec2, (v16i8)vec0); + dst2 = (v16u8)__msa_ilvr_b((v16i8)vec3, (v16i8)vec1); + dst3 = (v16u8)__msa_ilvl_b((v16i8)vec3, (v16i8)vec1); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + src_y += 16; + dst_argb += 64; + } +} + +void YUY2ToARGBRow_MSA(const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_yuy2, 0); + src1 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_yuy2 += 16; + dst_argb += 32; + } +} + +void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + int x; + v16u8 src0, src1, src2; + v8i16 vec0, vec1, vec2; + v4i32 vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, vec_br, vec_yg; + v4i32 vec_ubvr, vec_ugvg; + v16u8 alpha = (v16u8)__msa_ldi_b(ALPHA_VAL); + + YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_bb, vec_bg, + vec_br, vec_yg); + vec_ubvr = __msa_ilvr_w(vec_vr, vec_ub); + vec_ugvg = (v4i32)__msa_ilvev_h((v8i16)vec_vg, (v8i16)vec_ug); + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uyvy, 0); + src1 = (v16u8)__msa_pckod_b((v16i8)src0, (v16i8)src0); + src2 = (v16u8)__msa_pckev_b((v16i8)src0, (v16i8)src0); + YUVTORGB(src1, src2, vec_ubvr, vec_ugvg, vec_bb, vec_bg, vec_br, vec_yg, + vec0, vec1, vec2); + STOREARGB(vec0, vec1, vec2, alpha, dst_argb); + src_uyvy += 16; + dst_argb += 32; + } +} + +void InterpolateRow_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int width, + int32_t source_y_fraction) { + int32_t y1_fraction = source_y_fraction; + int32_t y0_fraction = 256 - 
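// Illustrative note, not part of this patch: YUY2 and UYVY are packed
// 4:2:2. Per 2-pixel group YUY2 stores {Y0, U, Y1, V} and UYVY stores
// {U, Y0, V, Y1}, which is why the two kernels above differ only in
// whether pckev (even bytes) or pckod (odd bytes) selects the luma.
// Unpack sketch for one YUY2 group (hypothetical helper name):
#include <stdint.h>
static void YUY2Unpack_Sketch(const uint8_t p[4], uint8_t y[2], uint8_t* u,
                              uint8_t* v) {
  y[0] = p[0];
  *u = p[1];
  y[1] = p[2];
  *v = p[3];
}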
y1_fraction; + uint16_t y_fractions; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, y_frac; + + if (0 == y1_fraction) { + memcpy(dst_ptr, src_ptr, width); + return; + } + + if (128 == y1_fraction) { + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + dst0 = __msa_aver_u_b(src0, src2); + dst1 = __msa_aver_u_b(src1, src3); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } + return; + } + + y_fractions = (uint16_t)(y0_fraction + (y1_fraction << 8)); + y_frac = (v8u16)__msa_fill_h(y_fractions); + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = (v8u16)__msa_dotp_u_h((v16u8)vec0, (v16u8)y_frac); + vec1 = (v8u16)__msa_dotp_u_h((v16u8)vec1, (v16u8)y_frac); + vec2 = (v8u16)__msa_dotp_u_h((v16u8)vec2, (v16u8)y_frac); + vec3 = (v8u16)__msa_dotp_u_h((v16u8)vec3, (v16u8)y_frac); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 8); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 8); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 8); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 8); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst_ptr, 16); + s += 32; + t += 32; + dst_ptr += 32; + } +} + +void ARGBSetRow_MSA(uint8_t* dst_argb, uint32_t v32, int width) { + int x; + v4i32 dst0 = __builtin_msa_fill_w(v32); + + for (x = 0; x < width; x += 4) { + ST_UB(dst0, dst_argb); + dst_argb += 16; + } +} + +void RAWToRGB24Row_MSA(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + int x; + v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2; + v16i8 shuffler0 = {2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, 17}; + v16i8 shuffler1 = {8, 7, 12, 11, 10, 15, 14, 13, + 18, 17, 16, 21, 20, 19, 24, 23}; + v16i8 shuffler2 = {14, 19, 18, 17, 22, 21, 20, 25, + 24, 23, 28, 27, 26, 31, 30, 29}; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_raw, 32); + src3 = (v16u8)__msa_sldi_b((v16i8)src1, (v16i8)src0, 8); + src4 = (v16u8)__msa_sldi_b((v16i8)src2, (v16i8)src1, 8); + dst0 = (v16u8)__msa_vshf_b(shuffler0, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(shuffler1, (v16i8)src4, (v16i8)src3); + dst2 = (v16u8)__msa_vshf_b(shuffler2, (v16i8)src2, (v16i8)src1); + ST_UB2(dst0, dst1, dst_rgb24, 16); + ST_UB(dst2, (dst_rgb24 + 32)); + src_raw += 48; + dst_rgb24 += 48; + } +} + +void MergeUVRow_MSA(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + int x; + v16u8 src0, src1, dst0, dst1; + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_u, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_v, 0); + dst0 = (v16u8)__msa_ilvr_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_ilvl_b((v16i8)src1, (v16i8)src0); + ST_UB2(dst0, dst1, 
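// Illustrative scalar reference, not part of this patch: InterpolateRow_MSA
// above is the usual 8-bit vertical lerp with two fast paths (fraction 0
// becomes a memcpy, fraction 128 a rounding average); the general case
// matches (hypothetical helper name):
#include <stdint.h>
static void InterpolateRow_Sketch(uint8_t* dst, const uint8_t* s,
                                  const uint8_t* t, int width,
                                  int y1_fraction) {  // 0..256
  int y0_fraction = 256 - y1_fraction;
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] =
        (uint8_t)((s[x] * y0_fraction + t[x] * y1_fraction + 128) >> 8);
  }
}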
dst_uv, 16); + src_u += 16; + src_v += 16; + dst_uv += 32; + } +} + +void ARGBExtractAlphaRow_MSA(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + int i; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + + for (i = 0; i < width; i += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 48); + vec0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_a); + src_argb += 64; + dst_a += 16; + } +} + +void ARGBBlendRow_MSA(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v8u16 vec8, vec9, vec10, vec11, vec12, vec13; + v8u16 const_256 = (v8u16)__msa_ldi_h(256); + v16u8 const_255 = (v16u8)__msa_ldi_b(255); + v16u8 mask = {0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_argb1, 16); + vec0 = (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8u16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8u16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8u16)__msa_ilvl_b(zero, (v16i8)src3); + vec8 = (v8u16)__msa_fill_h(vec0[3]); + vec9 = (v8u16)__msa_fill_h(vec0[7]); + vec10 = (v8u16)__msa_fill_h(vec1[3]); + vec11 = (v8u16)__msa_fill_h(vec1[7]); + vec8 = (v8u16)__msa_pckev_d((v2i64)vec9, (v2i64)vec8); + vec9 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec10 = (v8u16)__msa_fill_h(vec2[3]); + vec11 = (v8u16)__msa_fill_h(vec2[7]); + vec12 = (v8u16)__msa_fill_h(vec3[3]); + vec13 = (v8u16)__msa_fill_h(vec3[7]); + vec10 = (v8u16)__msa_pckev_d((v2i64)vec11, (v2i64)vec10); + vec11 = (v8u16)__msa_pckev_d((v2i64)vec13, (v2i64)vec12); + vec8 = const_256 - vec8; + vec9 = const_256 - vec9; + vec10 = const_256 - vec10; + vec11 = const_256 - vec11; + vec8 *= vec4; + vec9 *= vec5; + vec10 *= vec6; + vec11 *= vec7; + vec8 = (v8u16)__msa_srai_h((v8i16)vec8, 8); + vec9 = (v8u16)__msa_srai_h((v8i16)vec9, 8); + vec10 = (v8u16)__msa_srai_h((v8i16)vec10, 8); + vec11 = (v8u16)__msa_srai_h((v8i16)vec11, 8); + vec0 += vec8; + vec1 += vec9; + vec2 += vec10; + vec3 += vec11; + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst0 = __msa_bmnz_v(dst0, const_255, mask); + dst1 = __msa_bmnz_v(dst1, const_255, mask); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb0 += 32; + src_argb1 += 32; + dst_argb += 32; + } +} + +void ARGBQuantizeRow_MSA(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v4i32 vec_scale = __msa_fill_w(scale); + v16u8 vec_int_sz = 
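// Illustrative scalar reference, not part of this patch: ARGBBlendRow_MSA
// scales each background channel by (256 - src_alpha) >> 8 and adds the
// foreground channel at full weight (i.e. it treats the foreground as
// alpha-premultiplied), then forces the output alpha to 255 via the bmnz
// with the 0,0,0,255 mask. Per-channel sketch (hypothetical helper name):
#include <stdint.h>
static uint8_t BlendChannel_Sketch(uint8_t fg, uint8_t bg, uint8_t fg_a) {
  return (uint8_t)(fg + (((256 - fg_a) * bg) >> 8));
}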
(v16u8)__msa_fill_b(interval_size); + v16u8 vec_int_ofst = (v16u8)__msa_fill_b(interval_offset); + v16i8 mask = {0, 1, 2, 19, 4, 5, 6, 23, 8, 9, 10, 27, 12, 13, 14, 31}; + v16i8 zero = {0}; + + for (x = 0; x < width; x += 8) { + src0 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)dst_argb, 48); + vec0 = (v8i16)__msa_ilvr_b(zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b(zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_ilvr_b(zero, (v16i8)src3); + vec7 = (v8i16)__msa_ilvl_b(zero, (v16i8)src3); + tmp0 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec0); + tmp1 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec0); + tmp2 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec1); + tmp3 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec1); + tmp4 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec2); + tmp5 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec2); + tmp6 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec3); + tmp7 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec3); + tmp8 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec4); + tmp9 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec4); + tmp10 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec5); + tmp11 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec5); + tmp12 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec6); + tmp13 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec6); + tmp14 = (v4i32)__msa_ilvr_h((v8i16)zero, (v8i16)vec7); + tmp15 = (v4i32)__msa_ilvl_h((v8i16)zero, (v8i16)vec7); + tmp0 *= vec_scale; + tmp1 *= vec_scale; + tmp2 *= vec_scale; + tmp3 *= vec_scale; + tmp4 *= vec_scale; + tmp5 *= vec_scale; + tmp6 *= vec_scale; + tmp7 *= vec_scale; + tmp8 *= vec_scale; + tmp9 *= vec_scale; + tmp10 *= vec_scale; + tmp11 *= vec_scale; + tmp12 *= vec_scale; + tmp13 *= vec_scale; + tmp14 *= vec_scale; + tmp15 *= vec_scale; + tmp0 >>= 16; + tmp1 >>= 16; + tmp2 >>= 16; + tmp3 >>= 16; + tmp4 >>= 16; + tmp5 >>= 16; + tmp6 >>= 16; + tmp7 >>= 16; + tmp8 >>= 16; + tmp9 >>= 16; + tmp10 >>= 16; + tmp11 >>= 16; + tmp12 >>= 16; + tmp13 >>= 16; + tmp14 >>= 16; + tmp15 >>= 16; + vec0 = (v8i16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8i16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec2 = (v8i16)__msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec3 = (v8i16)__msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec4 = (v8i16)__msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec5 = (v8i16)__msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec6 = (v8i16)__msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec7 = (v8i16)__msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + dst2 = (v16u8)__msa_pckev_b((v16i8)vec5, (v16i8)vec4); + dst3 = (v16u8)__msa_pckev_b((v16i8)vec7, (v16i8)vec6); + dst0 *= vec_int_sz; + dst1 *= vec_int_sz; + dst2 *= vec_int_sz; + dst3 *= vec_int_sz; + dst0 += vec_int_ofst; + dst1 += vec_int_ofst; + dst2 += vec_int_ofst; + dst3 += vec_int_ofst; + dst0 = (v16u8)__msa_vshf_b(mask, (v16i8)src0, (v16i8)dst0); + dst1 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)dst1); + dst2 = (v16u8)__msa_vshf_b(mask, (v16i8)src2, (v16i8)dst2); + dst3 = (v16u8)__msa_vshf_b(mask, (v16i8)src3, (v16i8)dst3); + ST_UB4(dst0, dst1, dst2, dst3, dst_argb, 16); + dst_argb += 64; + } +} + +void ARGBColorMatrixRow_MSA(const uint8_t* 
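// Illustrative scalar reference, not part of this patch: ARGBQuantizeRow_MSA
// posterizes in place. Each of B/G/R is mapped to
//   (v * scale >> 16) * interval_size + interval_offset
// while the alpha byte is preserved (the vshf mask above keeps bytes
// 3/7/11/15 from the original pixel). Sketch (hypothetical helper name):
#include <stdint.h>
static uint8_t Quantize_Sketch(uint8_t v, int scale, int interval_size,
                               int interval_offset) {
  return (uint8_t)(((v * scale) >> 16) * interval_size + interval_offset);
}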
src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + int32_t x; + v16i8 src0; + v16u8 src1, src2, dst0, dst1; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v8i16 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v4i32 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v16i8 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + src0 = __msa_ld_b((v16i8*)matrix_argb, 0); + vec0 = (v8i16)__msa_ilvr_b(zero, src0); + vec1 = (v8i16)__msa_ilvl_b(zero, src0); + + for (x = 0; x < width; x += 8) { + src1 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 0); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_argb, 16); + vec2 = (v8i16)__msa_ilvr_b(zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b(zero, (v16i8)src1); + vec4 = (v8i16)__msa_ilvr_b(zero, (v16i8)src2); + vec5 = (v8i16)__msa_ilvl_b(zero, (v16i8)src2); + vec6 = (v8i16)__msa_pckod_d((v2i64)vec2, (v2i64)vec2); + vec7 = (v8i16)__msa_pckod_d((v2i64)vec3, (v2i64)vec3); + vec8 = (v8i16)__msa_pckod_d((v2i64)vec4, (v2i64)vec4); + vec9 = (v8i16)__msa_pckod_d((v2i64)vec5, (v2i64)vec5); + vec2 = (v8i16)__msa_pckev_d((v2i64)vec2, (v2i64)vec2); + vec3 = (v8i16)__msa_pckev_d((v2i64)vec3, (v2i64)vec3); + vec4 = (v8i16)__msa_pckev_d((v2i64)vec4, (v2i64)vec4); + vec5 = (v8i16)__msa_pckev_d((v2i64)vec5, (v2i64)vec5); + vec10 = vec2 * vec0; + vec11 = vec2 * vec1; + vec12 = vec6 * vec0; + vec13 = vec6 * vec1; + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + vec14 = vec3 * vec0; + vec15 = vec3 * vec1; + vec16 = vec7 * vec0; + vec17 = vec7 * vec1; + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + tmp0 = __msa_hadd_s_w(vec10, vec10); + tmp1 = __msa_hadd_s_w(vec11, vec11); + tmp2 = __msa_hadd_s_w(vec12, vec12); + tmp3 = __msa_hadd_s_w(vec13, vec13); + tmp0 = __msa_srai_w(tmp0, 6); + tmp1 = __msa_srai_w(tmp1, 6); + tmp2 = __msa_srai_w(tmp2, 6); + tmp3 = __msa_srai_w(tmp3, 6); + vec2 = vec4 * vec0; + vec6 = vec4 * vec1; + vec3 = vec8 * vec0; + vec7 = vec8 * vec1; + tmp8 = __msa_hadd_s_w(vec2, vec2); + tmp9 = __msa_hadd_s_w(vec6, vec6); + tmp10 = __msa_hadd_s_w(vec3, vec3); + tmp11 = __msa_hadd_s_w(vec7, vec7); + vec4 = vec5 * vec0; + vec8 = vec5 * vec1; + vec5 = vec9 * vec0; + vec9 = vec9 * vec1; + tmp12 = __msa_hadd_s_w(vec4, vec4); + tmp13 = __msa_hadd_s_w(vec8, vec8); + tmp14 = __msa_hadd_s_w(vec5, vec5); + tmp15 = __msa_hadd_s_w(vec9, vec9); + vec14 = __msa_pckev_h((v8i16)tmp9, (v8i16)tmp8); + vec15 = __msa_pckev_h((v8i16)tmp11, (v8i16)tmp10); + vec16 = __msa_pckev_h((v8i16)tmp13, (v8i16)tmp12); + vec17 = __msa_pckev_h((v8i16)tmp15, (v8i16)tmp14); + tmp4 = __msa_hadd_s_w(vec14, vec14); + tmp5 = __msa_hadd_s_w(vec15, vec15); + tmp6 = __msa_hadd_s_w(vec16, vec16); + tmp7 = __msa_hadd_s_w(vec17, vec17); + tmp4 = __msa_srai_w(tmp4, 6); + tmp5 = __msa_srai_w(tmp5, 6); + tmp6 = __msa_srai_w(tmp6, 6); + tmp7 = __msa_srai_w(tmp7, 6); + vec10 = __msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec11 = __msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + vec12 = __msa_pckev_h((v8i16)tmp5, (v8i16)tmp4); + vec13 = __msa_pckev_h((v8i16)tmp7, (v8i16)tmp6); + vec10 = 
__msa_maxi_s_h(vec10, 0); + vec11 = __msa_maxi_s_h(vec11, 0); + vec12 = __msa_maxi_s_h(vec12, 0); + vec13 = __msa_maxi_s_h(vec13, 0); + vec10 = __msa_min_s_h(vec10, max); + vec11 = __msa_min_s_h(vec11, max); + vec12 = __msa_min_s_h(vec12, max); + vec13 = __msa_min_s_h(vec13, max); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec11, (v16i8)vec10); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec13, (v16i8)vec12); + ST_UB2(dst0, dst1, dst_argb, 16); + src_argb += 32; + dst_argb += 32; + } +} + +void SplitUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + + for (x = 0; x < width; x += 32) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_u, 16); + ST_UB2(dst2, dst3, dst_v, 16); + src_uv += 64; + dst_u += 32; + dst_v += 32; + } +} + +void SetRow_MSA(uint8_t* dst, uint8_t v8, int width) { + int x; + v16u8 dst0 = (v16u8)__msa_fill_b(v8); + + for (x = 0; x < width; x += 16) { + ST_UB(dst0, dst); + dst += 16; + } +} + +void MirrorUVRow_MSA(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + int x; + v16u8 src0, src1, src2, src3; + v16u8 dst0, dst1, dst2, dst3; + v16i8 mask0 = {30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0}; + v16i8 mask1 = {31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9, 7, 5, 3, 1}; + + src_uv += (2 * width); + + for (x = 0; x < width; x += 32) { + src_uv -= 64; + src2 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 16); + src0 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 32); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_uv, 48); + dst0 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + dst2 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + dst3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst_v, 16); + ST_UB2(dst2, dst3, dst_u, 16); + dst_u += 32; + dst_v += 32; + } +} + +void SobelXRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int32_t width) { + int x; + v16u8 src0, src1, src2, src3, src4, src5, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5; + v16i8 mask0 = {0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9}; + v16i8 tmp = __msa_ldi_b(8); + v16i8 mask1 = mask0 + tmp; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 16); + src2 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + src3 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 16); + src4 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 0); + src5 = (v16u8)__msa_ld_b((const v16i8*)src_y2, 16); + vec0 = (v8i16)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v8i16)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v8i16)__msa_vshf_b(mask0, (v16i8)src3, (v16i8)src2); + vec3 = (v8i16)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec4 = (v8i16)__msa_vshf_b(mask0, (v16i8)src5, (v16i8)src4); + vec5 = (v8i16)__msa_vshf_b(mask1, (v16i8)src5, 
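// Illustrative scalar reference, not part of this patch:
// ARGBColorMatrixRow_MSA above applies a 4x4 signed fixed-point matrix:
// every output channel is a dot product of the input B/G/R/A with one row
// of matrix_argb, shifted right by 6 and clamped to [0, 255]. Sketch
// (hypothetical helper name):
#include <stdint.h>
static void ColorMatrixPixel_Sketch(const uint8_t p[4], const int8_t m[16],
                                    uint8_t out[4]) {
  int i;
  for (i = 0; i < 4; ++i) {
    int v = (p[0] * m[i * 4 + 0] + p[1] * m[i * 4 + 1] +
             p[2] * m[i * 4 + 2] + p[3] * m[i * 4 + 3]) >> 6;
    out[i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}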
(v16i8)src4); + vec0 = (v8i16)__msa_hsub_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = (v8i16)__msa_hsub_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = (v8i16)__msa_hsub_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = (v8i16)__msa_hsub_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8i16)__msa_hsub_u_h((v16u8)vec4, (v16u8)vec4); + vec5 = (v8i16)__msa_hsub_u_h((v16u8)vec5, (v16u8)vec5); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobelx); + src_y0 += 16; + src_y1 += 16; + src_y2 += 16; + dst_sobelx += 16; + } +} + +void SobelYRow_MSA(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int32_t width) { + int x; + v16u8 src0, src1, dst0; + v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6; + v8i16 zero = {0}; + v8i16 max = __msa_ldi_h(255); + + for (x = 0; x < width; x += 16) { + src0 = (v16u8)__msa_ld_b((const v16i8*)src_y0, 0); + src1 = (v16u8)__msa_ld_b((const v16i8*)src_y1, 0); + vec0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src0); + vec1 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src0); + vec2 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)src1); + vec3 = (v8i16)__msa_ilvl_b((v16i8)zero, (v16i8)src1); + vec0 -= vec2; + vec1 -= vec3; + vec6[0] = src_y0[16] - src_y1[16]; + vec6[1] = src_y0[17] - src_y1[17]; + vec2 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 2); + vec3 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 2); + vec4 = (v8i16)__msa_sldi_b((v16i8)vec1, (v16i8)vec0, 4); + vec5 = (v8i16)__msa_sldi_b((v16i8)vec6, (v16i8)vec1, 4); + vec0 += vec2; + vec1 += vec3; + vec4 += vec2; + vec5 += vec3; + vec0 += vec4; + vec1 += vec5; + vec0 = __msa_add_a_h(zero, vec0); + vec1 = __msa_add_a_h(zero, vec1); + vec0 = __msa_maxi_s_h(vec0, 0); + vec1 = __msa_maxi_s_h(vec1, 0); + vec0 = __msa_min_s_h(max, vec0); + vec1 = __msa_min_s_h(max, vec1); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst_sobely); + src_y0 += 16; + src_y1 += 16; + dst_sobely += 16; + } +} + +void HalfFloatRow_MSA(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + int i; + v8u16 src0, src1, src2, src3, dst0, dst1, dst2, dst3; + v4u32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4f32 fvec0, fvec1, fvec2, fvec3, fvec4, fvec5, fvec6, fvec7; + v4f32 mult_vec; + v8i16 zero = {0}; + mult_vec[0] = 1.9259299444e-34f * scale; + mult_vec = (v4f32)__msa_splati_w((v4i32)mult_vec, 0); + + for (i = 0; i < width; i += 32) { + src0 = (v8u16)__msa_ld_h((v8i16*)src, 0); + src1 = (v8u16)__msa_ld_h((v8i16*)src, 16); + src2 = (v8u16)__msa_ld_h((v8i16*)src, 32); + src3 = (v8u16)__msa_ld_h((v8i16*)src, 48); + vec0 = (v4u32)__msa_ilvr_h(zero, (v8i16)src0); + vec1 = (v4u32)__msa_ilvl_h(zero, (v8i16)src0); + vec2 = (v4u32)__msa_ilvr_h(zero, (v8i16)src1); + vec3 = (v4u32)__msa_ilvl_h(zero, (v8i16)src1); + vec4 = (v4u32)__msa_ilvr_h(zero, (v8i16)src2); + vec5 = (v4u32)__msa_ilvl_h(zero, (v8i16)src2); + vec6 = (v4u32)__msa_ilvr_h(zero, (v8i16)src3); + vec7 = (v4u32)__msa_ilvl_h(zero, (v8i16)src3); + fvec0 = __msa_ffint_u_w(vec0); + fvec1 = __msa_ffint_u_w(vec1); + fvec2 = __msa_ffint_u_w(vec2); + fvec3 = __msa_ffint_u_w(vec3); + fvec4 = __msa_ffint_u_w(vec4); + fvec5 = __msa_ffint_u_w(vec5); + fvec6 = __msa_ffint_u_w(vec6); + fvec7 = __msa_ffint_u_w(vec7); + fvec0 *= mult_vec; + fvec1 *= 
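// Illustrative scalar reference, not part of this patch: SobelXRow_MSA and
// SobelYRow_MSA compute the absolute 3x3 Sobel responses
//   sx = |(r0[x]-r0[x+2]) + 2*(r1[x]-r1[x+2]) + (r2[x]-r2[x+2])|
//   sy = |(r0[x]-r1[x]) + 2*(r0[x+1]-r1[x+1]) + (r0[x+2]-r1[x+2])|
// clamped to [0, 255] (the add_a_h absolute value plus the min/max above).
// Sketch for the X direction (hypothetical helper name):
#include <stdint.h>
static uint8_t SobelX_Sketch(const uint8_t* r0, const uint8_t* r1,
                             const uint8_t* r2, int x) {
  int s = (r0[x] - r0[x + 2]) + 2 * (r1[x] - r1[x + 2]) +
          (r2[x] - r2[x + 2]);
  if (s < 0) s = -s;
  return (uint8_t)(s > 255 ? 255 : s);
}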
mult_vec; + fvec2 *= mult_vec; + fvec3 *= mult_vec; + fvec4 *= mult_vec; + fvec5 *= mult_vec; + fvec6 *= mult_vec; + fvec7 *= mult_vec; + vec0 = ((v4u32)fvec0) >> 13; + vec1 = ((v4u32)fvec1) >> 13; + vec2 = ((v4u32)fvec2) >> 13; + vec3 = ((v4u32)fvec3) >> 13; + vec4 = ((v4u32)fvec4) >> 13; + vec5 = ((v4u32)fvec5) >> 13; + vec6 = ((v4u32)fvec6) >> 13; + vec7 = ((v4u32)fvec7) >> 13; + dst0 = (v8u16)__msa_pckev_h((v8i16)vec1, (v8i16)vec0); + dst1 = (v8u16)__msa_pckev_h((v8i16)vec3, (v8i16)vec2); + dst2 = (v8u16)__msa_pckev_h((v8i16)vec5, (v8i16)vec4); + dst3 = (v8u16)__msa_pckev_h((v8i16)vec7, (v8i16)vec6); + ST_UH2(dst0, dst1, dst, 8); + ST_UH2(dst2, dst3, dst + 16, 8); + src += 32; + dst += 32; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/row_neon.cc b/libs/libvpx/third_party/libyuv/source/row_neon.cc index 909df060c69..ff87e74c62c 100644 --- a/libs/libvpx/third_party/libyuv/source/row_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/row_neon.cc @@ -10,6 +10,8 @@ #include "libyuv/row.h" +#include + #ifdef __cplusplus namespace libyuv { extern "C" { @@ -20,1446 +22,1311 @@ extern "C" { !defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.32 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.32 {d2[1]}, [%2]! \n" - -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.16 {d2[0]}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.16 {d2[1]}, [%2]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vzip.u8 d2, d3 \n" +#define READYUV422 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - MEMACCESS(2) \ - "vld1.8 {d3}, [%2]! \n" \ - "vpaddl.u8 q1, q1 \n" \ - "vrshrn.u16 d2, q1, #1 \n" +#define READYUV444 \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - "vmov.u8 d2, #128 \n" +#define READYUV400 \ + "vld1.8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 #define READNV21 \ - MEMACCESS(0) \ - "vld1.8 {d0}, [%0]! \n" \ - MEMACCESS(1) \ - "vld1.8 {d2}, [%1]! \n" \ - "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ - "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vmov.u8 d3, d2 \n" /* split odd/even uv apart */ \ + "vuzp.u8 d3, d2 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "vld2.8 {d0, d2}, [%0]! \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" +#define READYUY2 \ + "vld2.8 {d0, d2}, [%0]! 
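// Illustrative note, not part of this patch: HalfFloatRow_MSA above uses a
// classic exponent-rebias trick. 1.9259299444e-34f is 2^-112, i.e.
// 2^-(127-15), the difference between the float and half exponent biases,
// so after multiplying by scale * 2^-112 the IEEE-754 half encoding of the
// result sits in bits 28:13 of the float's bit pattern; the >> 13 plus the
// 16-bit pack extract it directly (truncating, and assuming the scaled
// inputs stay in normal half range). Sketch (hypothetical helper name):
#include <stdint.h>
#include <string.h>
static uint16_t FloatToHalf_Sketch(float f, float scale) {
  float v = f * scale * 1.9259299444e-34f;  // scale * 2^-112
  uint32_t bits;
  memcpy(&bits, &v, sizeof(bits));  // reinterpret the float's bit pattern
  return (uint16_t)(bits >> 13);
}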
\n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "vld2.8 {d2, d3}, [%0]! \n" \ - "vmov.u8 d0, d3 \n" \ - "vmov.u8 d3, d2 \n" \ - "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" - -#define YUVTORGB_SETUP \ - MEMACCESS([kUVToRB]) \ - "vld1.8 {d24}, [%[kUVToRB]] \n" \ - MEMACCESS([kUVToG]) \ - "vld1.8 {d25}, [%[kUVToG]] \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! \n" \ - MEMACCESS([kUVBiasBGR]) \ - "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ - MEMACCESS([kYToRgb]) \ - "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" - -#define YUVTORGB \ - "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */\ - "vmull.u8 q9, d2, d25 \n" /* u/v G component */\ - "vmovl.u8 q0, d0 \n" /* Y */\ - "vmovl.s16 q10, d1 \n" \ - "vmovl.s16 q0, d0 \n" \ - "vmul.s32 q10, q10, q15 \n" \ - "vmul.s32 q0, q0, q15 \n" \ - "vqshrun.s32 d0, q0, #16 \n" \ - "vqshrun.s32 d1, q10, #16 \n" /* Y */\ - "vadd.s16 d18, d19 \n" \ - "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */\ - "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */\ - "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/\ - "vaddw.u16 q1, q1, d16 \n" \ - "vaddw.u16 q10, q10, d17 \n" \ - "vaddw.u16 q3, q3, d18 \n" \ - "vqadd.s16 q8, q0, q13 \n" /* B */ \ - "vqadd.s16 q9, q0, q14 \n" /* R */ \ - "vqadd.s16 q0, q0, q4 \n" /* G */ \ - "vqadd.s16 q8, q8, q1 \n" /* B */ \ - "vqadd.s16 q9, q9, q10 \n" /* R */ \ - "vqsub.s16 q0, q0, q3 \n" /* G */ \ - "vqshrun.s16 d20, q8, #6 \n" /* B */ \ - "vqshrun.s16 d22, q9, #6 \n" /* R */ \ - "vqshrun.s16 d21, q0, #6 \n" /* G */ - -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +#define READUYVY \ + "vld2.8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +#define YUVTORGB_SETUP \ + "vld1.8 {d24}, [%[kUVToRB]] \n" \ + "vld1.8 {d25}, [%[kUVToG]] \n" \ + "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! 
\n" \ + "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" + +#define YUVTORGB \ + "vmull.u8 q8, d2, d24 \n" /* u/v B/R component */ \ + "vmull.u8 q9, d2, d25 \n" /* u/v G component */ \ + "vmovl.u8 q0, d0 \n" /* Y */ \ + "vmovl.s16 q10, d1 \n" \ + "vmovl.s16 q0, d0 \n" \ + "vmul.s32 q10, q10, q15 \n" \ + "vmul.s32 q0, q0, q15 \n" \ + "vqshrun.s32 d0, q0, #16 \n" \ + "vqshrun.s32 d1, q10, #16 \n" /* Y */ \ + "vadd.s16 d18, d19 \n" \ + "vshll.u16 q1, d16, #16 \n" /* Replicate u * UB */ \ + "vshll.u16 q10, d17, #16 \n" /* Replicate v * VR */ \ + "vshll.u16 q3, d18, #16 \n" /* Replicate (v*VG + u*UG)*/ \ + "vaddw.u16 q1, q1, d16 \n" \ + "vaddw.u16 q10, q10, d17 \n" \ + "vaddw.u16 q3, q3, d18 \n" \ + "vqadd.s16 q8, q0, q13 \n" /* B */ \ + "vqadd.s16 q9, q0, q14 \n" /* R */ \ + "vqadd.s16 q0, q0, q4 \n" /* G */ \ + "vqadd.s16 q8, q8, q1 \n" /* B */ \ + "vqadd.s16 q9, q9, q10 \n" /* R */ \ + "vqsub.s16 q0, q0, q3 \n" /* G */ \ + "vqshrun.s16 d20, q8, #6 \n" /* B */ \ + "vqshrun.s16 d22, q9, #6 \n" /* R */ \ + "vqshrun.s16 d21, q0, #6 \n" /* G */ + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV444 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV444 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! 
\n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %5, %5, #8 \n" - MEMACCESS(3) - "vld1.8 {d23}, [%3]! \n" - MEMACCESS(4) - "vst4.8 {d20, d21, d22, d23}, [%4]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %5, %5, #8 \n" + "vld1.8 {d23}, [%3]! \n" + "vst4.8 {d20, d21, d22, d23}, [%4]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(src_a), // %3 + "+r"(dst_argb), // %4 + "+r"(width) // %5 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV411 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst4.8 {d20, d21, d22, d23}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d19, #255 \n" // d19 modified by YUVTORGB - MEMACCESS(3) - "vst4.8 {d19, d20, d21, d22}, [%3]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgba), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d19, #255 \n" // YUVTORGB modified d19 + "vst4.8 {d19, d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - MEMACCESS(3) - "vst3.8 {d20, d21, d22}, [%3]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb24), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -#define ARGBTORGB565 \ - "vshll.u8 q0, d22, #8 \n" /* R */ \ - "vshll.u8 q8, d21, #8 \n" /* G */ \ - "vshll.u8 q9, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #5 \n" /* RG */ \ - "vsri.16 q0, q9, #11 \n" /* RGB */ - -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vst3.8 {d20, d21, d22}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTORGB565 \ + "vshll.u8 q0, d22, #8 \n" /* R */ \ + "vshll.u8 q8, d21, #8 \n" /* G */ \ + "vshll.u8 q9, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #5 \n" /* RG */ \ + "vsri.16 q0, q9, #11 \n" /* RGB */ + +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -#define ARGBTOARGB1555 \ - "vshll.u8 q0, d23, #8 \n" /* A */ \ - "vshll.u8 q8, d22, #8 \n" /* R */ \ - "vshll.u8 q9, d21, #8 \n" /* G */ \ - "vshll.u8 q10, d20, #8 \n" /* B */ \ - "vsri.16 q0, q8, #1 \n" /* AR */ \ - "vsri.16 q0, q9, #6 \n" /* ARG */ \ - "vsri.16 q0, q10, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTOARGB1555 \ + "vshll.u8 q0, d23, #8 \n" /* A */ \ + "vshll.u8 q8, d22, #8 \n" /* R */ \ + "vshll.u8 q9, d21, #8 \n" /* G */ \ + "vshll.u8 q10, d20, #8 \n" /* B */ \ + "vsri.16 q0, q8, #1 \n" /* AR */ \ + "vsri.16 q0, q9, #6 \n" /* ARG */ \ + "vsri.16 q0, q10, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -#define ARGBTOARGB4444 \ - "vshr.u8 d20, d20, #4 \n" /* B */ \ - "vbic.32 d21, d21, d4 \n" /* G */ \ - "vshr.u8 d22, d22, #4 \n" /* R */ \ - "vbic.32 d23, d23, d4 \n" /* A */ \ - "vorr d0, d20, d21 \n" /* BG */ \ - "vorr d1, d22, d23 \n" /* RA */ \ - "vzip.u8 d0, d1 \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +#define ARGBTOARGB4444 \ + "vshr.u8 d20, d20, #4 \n" /* B */ \ + "vbic.32 d21, d21, d4 \n" /* G */ \ + "vshr.u8 d22, d22, #4 \n" /* R */ \ + "vbic.32 d23, d23, d4 \n" /* A */ \ + "vorr d0, d20, d21 \n" /* BG */ \ + "vorr d1, d22, d23 \n" /* RA */ \ + "vzip.u8 d0, d1 \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - READYUV422 - YUVTORGB - "subs %4, %4, #8 \n" - "vmov.u8 d23, #255 \n" - ARGBTOARGB4444 - MEMACCESS(3) - "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb4444), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUV400 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - "vmov.u8 d23, #255 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d20}, [%0]! \n" - "vmov d21, d20 \n" - "vmov d22, d20 \n" - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d20", "d21", "d22", "d23" - ); -} - -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d4, #0x0f \n" // vbic bits to clear + "1: \n" + + READYUV422 YUVTORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + "vst1.8 {q0}, [%3]! 
\n" // store 8 pixels + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb4444), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUV400 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), + [kUVToG] "r"(&kYuvI601Constants.kUVToG), + [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), + [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23"); +} + +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READNV21 - YUVTORGB - "subs %3, %3, #8 \n" - MEMACCESS(2) - "vst4.8 {d20, d21, d22, d23}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_vu), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV12 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP + + "1: \n" + + READNV21 YUVTORGB + "subs %3, %3, #8 \n" + "vst3.8 {d20, d21, d22}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READNV12 - YUVTORGB - "subs %3, %3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB + "subs %3, %3, #8 \n" ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READYUY2 - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READYUY2 YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "vmov.u8 d23, #255 \n" - "1: \n" - READUYVY - YUVTORGB - "subs %2, %2, #8 \n" - MEMACCESS(1) - "vst4.8 {d20, d21, d22, d23}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); + asm volatile(YUVTORGB_SETUP + "vmov.u8 d23, #255 \n" + "1: \n" READUYVY YUVTORGB + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store U - MEMACCESS(2) - "vst1.8 {q1}, [%2]! \n" // store V - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! 
\n" // load 16 pairs of UV + "subs %3, %3, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. -void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load U - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV - "bgt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V + "subs %3, %3, #16 \n" // 16 processed per loop + "vst2.8 {q0, q1}, [%2]! \n" // store 16 pairs of UV + "bgt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB + "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB + "subs %4, %4, #16 \n" // 16 processed per loop + "vst1.8 {q0}, [%1]! \n" // store R + "vst1.8 {q1}, [%2]! \n" // store G + "vst1.8 {q2}, [%3]! \n" // store B + "bgt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "d0", "d1", "d2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load R + "vld1.8 {q1}, [%1]! \n" // load G + "vld1.8 {q2}, [%2]! \n" // load B + "subs %4, %4, #16 \n" // 16 processed per loop + "vst3.8 {d0, d2, d4}, [%3]! \n" // store 8 RGB + "vst3.8 {d1, d3, d5}, [%3]! \n" // next 8 RGB + "bgt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -// SetRow writes 'count' bytes using an 8 bit value repeated. 
-void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "vdup.8 q0, %2 \n" // duplicate 16 bytes - "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "q0" - ); -} - -// ARGBSetRow writes 'count' pixels using an 32 bit value repeated. -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "vdup.u32 q0, %2 \n" // duplicate 4 ints - "1: \n" - "subs %1, %1, #4 \n" // 4 pixels per loop - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store - "bgt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "q0" - ); -} - -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2 \n" - "sub %0, #16 \n" - - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); -} - -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "subs %2, %2, #32 \n" // 32 processed per loop + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "vdup.8 q0, %2 \n" // duplicate 16 bytes + "1: \n" + "subs %1, %1, #16 \n" // 16 bytes per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "q0"); +} + +// ARGBSetRow writes 'width' pixels using an 32 bit value repeated. +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "vdup.u32 q0, %2 \n" // duplicate 4 ints + "1: \n" + "subs %1, %1, #4 \n" // 4 pixels per loop + "vst1.8 {q0}, [%0]! \n" // store + "bgt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "q0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #16 \n" // 16 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - // Start at end of source row. - "mov r12, #-16 \n" - "add %0, %0, %3, lsl #1 \n" - "sub %0, #16 \n" - - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 - "subs %3, #8 \n" // 8 pixels per loop. - "vrev64.8 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // dst += 8 - MEMACCESS(2) - "vst1.8 {d1}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "r12", "q0" - ); -} - -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" - - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // dst += 16 - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0" - ); -} - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d4, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); -} - -#define RGB565TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ - "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ - "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ - "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ - "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ - "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ - "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ - "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ - "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ - "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ - "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ - "vorr.u8 q1, q1, q3 \n" /* R,A */ \ - "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + asm volatile( + // Start at end of source row. + "mov r12, #-16 \n" + "add %0, %0, %3, lsl #1 \n" + "sub %0, #16 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 + "subs %3, #8 \n" // 8 pixels per loop. + "vrev64.8 q0, q0 \n" + "vst1.8 {d0}, [%1]! \n" // dst += 8 + "vst1.8 {d1}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "r12", "q0"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "mov r3, #-16 \n" + "add %0, %0, %2, lsl #2 \n" + "sub %0, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "subs %2, #4 \n" // 4 pixels per loop. + "vrev64.32 q0, q0 \n" + "vst1.8 {d1}, [%1]! \n" // dst += 16 + "vst1.8 {d0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "r3", "q0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d4, #255 \n" // Alpha + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3" // Clobber List + ); +} + +#define RGB565TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \ + "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \ + "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ - "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ - "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ - "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ - "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ - "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ - "vorr.u8 d0, d0, d4 \n" /* B */ \ - "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ - "vorr.u8 d2, d1, d5 \n" /* R */ \ - "vorr.u8 d1, d4, d6 \n" /* G */ - -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. 
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -#define ARGB4444TOARGB \ - "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ - "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ - "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ - "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ - "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ - "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ - "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ - "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ - -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vswp.u8 d1, d3 \n" // swap R, B - MEMACCESS(1) - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. 
- "subs %2, %2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, + asm volatile( + "vmov.u8 d3, #255 \n" // Alpha + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d3}, [%2]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of + // RGB24. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vswp.u8 d1, d3 \n" // swap R, B + "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 U. - MEMACCESS(2) - "vst1.8 {d2}, [%2]! \n" // store 8 V. 
- "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_yuy2 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. - "vrhadd.u8 d1, d1, d5 \n" // average rows of U - "vrhadd.u8 d3, d3, d7 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d3}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(stride_yuy2), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // stride + src_uyvy - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. - "vrhadd.u8 d0, d0, d4 \n" // average rows of U - "vrhadd.u8 d2, d2, d6 \n" // average rows of V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 U. - MEMACCESS(3) - "vst1.8 {d2}, [%3]! \n" // store 8 V. - "bgt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(stride_uyvy), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_yuy2 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. + "vrhadd.u8 d1, d1, d5 \n" // average rows of U + "vrhadd.u8 d3, d3, d7 \n" // average rows of V + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. 
+ "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(stride_yuy2), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // stride + src_uyvy + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. + "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. + "vrhadd.u8 d0, d0, d4 \n" // average rows of U + "vrhadd.u8 d2, d2, d6 \n" // average rows of V + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(stride_uyvy), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", + "d7" // Clobber List + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop - "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels - "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store 4. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d3}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); -} - -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 Us - MEMACCESS(2) - "vld1.8 {d2}, [%2]! \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels - MEMACCESS(3) - "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. - "bgt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "d0", "d1", "d2", "d3" - ); -} - -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { - asm volatile ( - "vdup.32 d2, %2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d20, d20, d2 \n" - "vqadd.u8 d21, d21, d2 \n" - "vqadd.u8 d22, d22, d2 \n" - ARGBTORGB565 - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. - "bgt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. + "subs %2, %2, #4 \n" // 4 processed per loop + "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels + "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + "vst1.8 {q1}, [%1]! \n" // store 4. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "q0", "q1", "q2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%1]! \n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" // add for dither + ARGBTORGB565 + "vst1.8 {q0}, [%0]! \n" // store 8 RGB565. 
+ "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, + asm volatile( + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB1555. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, int width) { - asm volatile ( - "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q8", "q9", "q10", "q11" - ); -} - -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); -} - -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" // store 16 A's. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); + asm volatile( + "vmov.u8 d4, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "vst1.8 {q0}, [%1]! \n" // store 8 ARGB4444. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q8", "q9", "q10", "q11"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q3}, [%1]! \n" // store 16 A's. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient - "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient - "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient - "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient - "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R - "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned - - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B - "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned - - "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V - - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. 
- MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" - ); -} - -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. -void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, int width) { - asm volatile ( - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. - MEMACCESS(0) - "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. - "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. - - "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. - "vpadd.u16 d1, d8, d9 \n" // B - "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. - "vpadd.u16 d3, d10, d11 \n" // G - "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. - "vpadd.u16 d5, d12, d13 \n" // R - - "vrshr.u16 q0, q0, #1 \n" // 2x average - "vrshr.u16 q1, q1, #1 \n" - "vrshr.u16 q2, q2, #1 \n" - - "subs %3, %3, #32 \n" // 32 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. - MEMACCESS(2) - "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - + asm volatile( + "vmov.u8 d24, #112 \n" // UB / VR 0.875 + // coefficient + "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient + "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient + "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient + "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlsl.u8 q2, d1, d25 \n" // G + "vmlsl.u8 q2, d2, d26 \n" // R + "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned + + "vmull.u8 q3, d2, d24 \n" // R + "vmlsl.u8 q3, d1, d28 \n" // G + "vmlsl.u8 q3, d0, d27 \n" // B + "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned + + "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + + "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%2]! 
\n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", + "q15"); +} + +// clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ - "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ - "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ - "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ - "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ - "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ - "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ - "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ - "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ - "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +#define RGBTOUV(QB, QG, QR) \ + "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ + "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ + "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ + "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ + "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1468,17 +1335,13 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1490,9 +1353,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1507,8 +1368,11 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. 
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_argb "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient @@ -1517,17 +1381,13 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1539,9 +1399,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1555,8 +1413,11 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_bgra "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1565,17 +1426,13 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1587,9 +1444,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1603,8 +1458,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_abgr "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1613,17 +1471,13 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1635,9 +1489,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1651,8 +1503,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgba "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1661,17 +1516,13 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. - MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. - MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. @@ -1683,9 +1534,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
"bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1699,8 +1548,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_rgb24 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1709,17 +1561,13 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1731,9 +1579,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1747,8 +1593,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { asm volatile ( "add %1, %0, %1 \n" // src_stride + src_raw "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient @@ -1757,17 +1606,13 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) + "1: \n" "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. - MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. - MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1779,9 +1624,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) - MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1796,875 +1639,815 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. 
-void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. - RGB565TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_stride_rgb565), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. 
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. + RGB565TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_stride_rgb565), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
- "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_stride_argb1555), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - asm volatile ( - "add %1, %0, %1 \n" // src_stride + src_argb - "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient - "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient - "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient - "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient - "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient - "vmov.u16 q15, #0x8080 \n" // 128.5 - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. 
- "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. - "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. - "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. - - "vrshr.u16 q4, q4, #1 \n" // 2x average - "vrshr.u16 q5, q5, #1 \n" - "vrshr.u16 q6, q6, #1 \n" - - "subs %4, %4, #16 \n" // 16 processed per loop. - "vmul.s16 q8, q4, q10 \n" // B - "vmls.s16 q8, q5, q11 \n" // G - "vmls.s16 q8, q6, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q6, q10 \n" // R - "vmls.s16 q9, q5, q14 \n" // G - "vmls.s16 q9, q4, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. - MEMACCESS(3) - "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_stride_argb4444), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); -} - -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); -} - -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); -} - -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d27, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. 
- "subs %2, %2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d27 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" - ); -} - -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // R - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // R - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // B - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d1, d4 \n" // B - "vmlal.u8 q8, d2, d5 \n" // G - "vmlal.u8 q8, d3, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
- "bgt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); -} - -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient - "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d7, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q8, d0, d4 \n" // B - "vmlal.u8 q8, d1, d5 \n" // G - "vmlal.u8 q8, d2, d6 \n" // R - "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y - "vqadd.u8 d0, d7 \n" - MEMACCESS(1) - "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. - "bgt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" - ); +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 + // coefficient + "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vrshr.u16 q4, q4, #1 \n" // 2x average + "vrshr.u16 q5, q5, #1 \n" + "vrshr.u16 q6, q6, #1 \n" + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. 
+ "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15"); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d27, #16 \n" // Add 16 constant + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d27 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // R + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // R + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // B + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d1, d4 \n" // B + "vmlal.u8 q8, d2, d5 \n" // G + "vmlal.u8 q8, d3, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient + "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient + "vmov.u8 d7, #16 \n" // Add 16 constant + "1: \n" + "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q8, d0, d4 \n" // B + "vmlal.u8 q8, d1, d5 \n" // G + "vmlal.u8 q8, d2, d6 \n" // R + "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y + "vqadd.u8 d0, d7 \n" + "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. 
+ "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #128 \n" - "beq 50f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction) // %4 - : - : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" - ); + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #128 \n" + "beq 50f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction) // %4 + : + : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "subs %3, #8 \n" - "blt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. - "subs %3, %3, #8 \n" // 8 processed per loop. 
- "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. - "bge 8b \n" - - "89: \n" - "adds %3, #8-1 \n" - "blt 99f \n" - - // Blend 1 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. - "vmull.u8 q10, d4, d3 \n" // db * a - "vmull.u8 q11, d5, d3 \n" // dg * a - "vmull.u8 q12, d6, d3 \n" // dr * a - "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 - "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 - "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 - "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 - "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 - "vqadd.u8 q0, q0, q2 \n" // + sbg - "vqadd.u8 d2, d2, d6 \n" // + sr - "vmov.u8 d3, #255 \n" // a = 255 - MEMACCESS(2) - "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. - "bge 1b \n" - - "99: \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" - ); +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %3, #8 \n" + "blt 89f \n" + // Blend 8 pixels. + "8: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. + "bge 8b \n" + + "89: \n" + "adds %3, #8-1 \n" + "blt 99f \n" + + // Blend 1 pixels. + "1: \n" + "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. + "subs %3, %3, #1 \n" // 1 processed per loop. + "vmull.u8 q10, d4, d3 \n" // db * a + "vmull.u8 q11, d5, d3 \n" // dg * a + "vmull.u8 q12, d6, d3 \n" // dr * a + "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8 + "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8 + "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8 + "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256 + "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256 + "vqadd.u8 q0, q0, q2 \n" // + sbg + "vqadd.u8 d2, d2, d6 \n" // + sr + "vmov.u8 d3, #255 \n" // a = 255 + "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. + "bge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"); } // Attenuate 8 pixels at a time. 
-void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmull.u8 q10, d0, d3 \n" // b * a - "vmull.u8 q11, d1, d3 \n" // g * a - "vmull.u8 q12, d2, d3 \n" // r * a - "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 - "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 - "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q10", "q11", "q12" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "vdup.u16 q8, %2 \n" - "vshr.u16 q8, q8, #1 \n" // scale >>= 1 - "vdup.u16 q9, %3 \n" // interval multiply. - "vdup.u16 q10, %4 \n" // interval add - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmovl.u8 q0, d0 \n" // b (0 .. 255) - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q2, d4 \n" - "vqdmulh.s16 q0, q0, q8 \n" // b * scale - "vqdmulh.s16 q1, q1, q8 \n" // g - "vqdmulh.s16 q2, q2, q8 \n" // r - "vmul.u16 q0, q0, q9 \n" // b * interval_size - "vmul.u16 q1, q1, q9 \n" // g - "vmul.u16 q2, q2, q9 \n" // r - "vadd.u16 q0, q0, q10 \n" // b + interval_offset - "vadd.u16 q1, q1, q10 \n" // g - "vadd.u16 q2, q2, q10 \n" // r - "vqmovn.u16 d0, q0 \n" - "vqmovn.u16 d2, q1 \n" - "vqmovn.u16 d4, q2 \n" - MEMACCESS(0) - "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" - ); +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "vdup.u16 q8, %2 \n" + "vshr.u16 q8, q8, #1 \n" // scale >>= 1 + "vdup.u16 q9, %3 \n" // interval multiply. + "vdup.u16 q10, %4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. + "subs %1, %1, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" // b (0 .. 
255) + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vqdmulh.s16 q0, q0, q8 \n" // b * scale + "vqdmulh.s16 q1, q1, q8 \n" // g + "vqdmulh.s16 q2, q2, q8 \n" // r + "vmul.u16 q0, q0, q9 \n" // b * interval_size + "vmul.u16 q1, q1, q9 \n" // g + "vmul.u16 q2, q2, q9 \n" // r + "vadd.u16 q0, q0, q10 \n" // b + interval_offset + "vadd.u16 q1, q1, q10 \n" // g + "vadd.u16 q2, q2, q10 \n" // r + "vqmovn.u16 d0, q0 \n" + "vqmovn.u16 d2, q1 \n" + "vqmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "vdup.u32 q0, %3 \n" // duplicate scale value. - "vzip.u8 d0, d1 \n" // d0 aarrggbb. - "vshr.u16 q0, q0, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q10, d20 \n" // b (0 .. 255) - "vmovl.u8 q11, d22 \n" - "vmovl.u8 q12, d24 \n" - "vmovl.u8 q13, d26 \n" - "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 - "vqrdmulh.s16 q11, q11, d0[1] \n" // g - "vqrdmulh.s16 q12, q12, d0[2] \n" // r - "vqrdmulh.s16 q13, q13, d0[3] \n" // a - "vqmovn.u16 d20, q10 \n" - "vqmovn.u16 d22, q11 \n" - "vqmovn.u16 d24, q12 \n" - "vqmovn.u16 d26, q13 \n" - MEMACCESS(1) - "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "q0", "q10", "q11", "q12", "q13" - ); +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "vdup.u32 q0, %3 \n" // duplicate scale value. + "vzip.u8 d0, d1 \n" // d0 aarrggbb. + "vshr.u16 q0, q0, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q10, d20 \n" // b (0 .. 255) + "vmovl.u8 q11, d22 \n" + "vmovl.u8 q12, d24 \n" + "vmovl.u8 q13, d26 \n" + "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2 + "vqrdmulh.s16 q11, q11, d0[1] \n" // g + "vqrdmulh.s16 q12, q12, d0[2] \n" // r + "vqrdmulh.s16 q13, q13, d0[3] \n" // a + "vqmovn.u16 d20, q10 \n" + "vqmovn.u16 d22, q11 \n" + "vqmovn.u16 d24, q12 \n" + "vqmovn.u16 d26, q13 \n" + "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "q0", "q10", "q11", "q12", "q13"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient - "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient - "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop.
- "vmull.u8 q2, d0, d24 \n" // B - "vmlal.u8 q2, d1, d25 \n" // G - "vmlal.u8 q2, d2, d26 \n" // R - "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B - "vmov d1, d0 \n" // G - "vmov d2, d0 \n" // R - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "q0", "q1", "q2", "q12", "q13" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient + "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient + "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q2, d0, d24 \n" // B + "vmlal.u8 q2, d1, d25 \n" // G + "vmlal.u8 q2, d2, d26 \n" // R + "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B + "vmov d1, d0 \n" // G + "vmov d2, d0 \n" // R + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q12", "q13"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d20, #17 \n" // BB coefficient - "vmov.u8 d21, #68 \n" // BG coefficient - "vmov.u8 d22, #35 \n" // BR coefficient - "vmov.u8 d24, #22 \n" // GB coefficient - "vmov.u8 d25, #88 \n" // GG coefficient - "vmov.u8 d26, #45 \n" // GR coefficient - "vmov.u8 d28, #24 \n" // BB coefficient - "vmov.u8 d29, #98 \n" // BG coefficient - "vmov.u8 d30, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d20 \n" // B to Sepia B - "vmlal.u8 q2, d1, d21 \n" // G - "vmlal.u8 q2, d2, d22 \n" // R - "vmull.u8 q3, d0, d24 \n" // B to Sepia G - "vmlal.u8 q3, d1, d25 \n" // G - "vmlal.u8 q3, d2, d26 \n" // R - "vmull.u8 q8, d0, d28 \n" // B to Sepia R - "vmlal.u8 q8, d1, d29 \n" // G - "vmlal.u8 q8, d2, d30 \n" // R - "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B - "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G - "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "q0", "q1", "q2", "q3", - "q10", "q11", "q12", "q13", "q14", "q15" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "vmov.u8 d20, #17 \n" // BB coefficient + "vmov.u8 d21, #68 \n" // BG coefficient + "vmov.u8 d22, #35 \n" // BR coefficient + "vmov.u8 d24, #22 \n" // GB coefficient + "vmov.u8 d25, #88 \n" // GG coefficient + "vmov.u8 d26, #45 \n" // GR coefficient + "vmov.u8 d28, #24 \n" // BB coefficient + "vmov.u8 d29, #98 \n" // BG coefficient + "vmov.u8 d30, #50 \n" // BR coefficient + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. + "subs %1, %1, #8 \n" // 8 processed per loop. 
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B + "vmlal.u8 q2, d1, d21 \n" // G + "vmlal.u8 q2, d2, d22 \n" // R + "vmull.u8 q3, d0, d24 \n" // B to Sepia G + "vmlal.u8 q3, d1, d25 \n" // G + "vmlal.u8 q3, d2, d26 \n" // R + "vmull.u8 q8, d0, d28 \n" // B to Sepia R + "vmlal.u8 q8, d1, d29 \n" // G + "vmlal.u8 q8, d2, d30 \n" // R + "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B + "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G + "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12", "q13", + "q14", "q15"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. - "vmovl.s8 q0, d4 \n" // B,G coefficients s16. - "vmovl.s8 q1, d5 \n" // R,A coefficients s16. - - "1: \n" - MEMACCESS(0) - "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit - "vmovl.u8 q9, d18 \n" // g - "vmovl.u8 q10, d20 \n" // r - "vmovl.u8 q11, d22 \n" // a - "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B - "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G - "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R - "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A - "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B - "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G - "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R - "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B - "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G - "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R - "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B - "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G - "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R - "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A - "vqadd.s16 q12, q12, q4 \n" // Accumulate B - "vqadd.s16 q13, q13, q5 \n" // Accumulate G - "vqadd.s16 q14, q14, q6 \n" // Accumulate R - "vqadd.s16 q15, q15, q7 \n" // Accumulate A - "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B - "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G - "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R - "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13", "q14", "q15" - ); +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. 
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16. + "vmovl.s8 q1, d5 \n" // R,A coefficients s16. + + "1: \n" + "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit + "vmovl.u8 q9, d18 \n" // g + "vmovl.u8 q10, d20 \n" // r + "vmovl.u8 q11, d22 \n" // a + "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B + "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G + "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R + "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A + "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B + "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G + "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R + "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B + "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G + "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R + "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B + "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G + "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R + "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A + "vqadd.s16 q12, q12, q4 \n" // Accumulate B + "vqadd.s16 q13, q13, q5 \n" // Accumulate G + "vqadd.s16 q14, q14, q6 \n" // Accumulate R + "vqadd.s16 q15, q15, q7 \n" // Accumulate A + "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B + "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G + "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R + "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15"); } // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q0, d0, d1 \n" // multiply B - "vmull.u8 q1, d2, d3 \n" // multiply G - "vmull.u8 q2, d4, d5 \n" // multiply R - "vmull.u8 q3, d6, d7 \n" // multiply A - "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B - "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G - "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R - "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%1]! 
\n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vmull.u8 q0, d0, d1 \n" // multiply B + "vmull.u8 q1, d2, d3 \n" // multiply G + "vmull.u8 q2, d4, d5 \n" // multiply R + "vmull.u8 q3, d6, d7 \n" // multiply A + "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B + "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G + "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R + "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 q0, q0, q2 \n" // add B, G - "vqadd.u8 q1, q1, q3 \n" // add R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 q0, q0, q2 \n" // add B, G + "vqadd.u8 q1, q1, q3 \n" // add R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(1) - "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqsub.u8 q0, q0, q2 \n" // subtract B, G - "vqsub.u8 q1, q1, q3 \n" // subtract R, A - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1", "q2", "q3" - ); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqsub.u8 q0, q0, q2 \n" // subtract B, G + "vqsub.u8 q1, q1, q3 \n" // subtract R, A + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
@@ -2672,54 +2455,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d1}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d0, d0, d1 \n" // add - "vmov.u8 d1, d0 \n" - "vmov.u8 d2, d0 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d1}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d0, d0, d1 \n" // add + "vmov.u8 d1, d0 \n" + "vmov.u8 d2, d0 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. - "vqadd.u8 q0, q0, q1 \n" // add - MEMACCESS(2) - "vst1.8 {q0}, [%2]! \n" // store 16 pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + "vld1.8 {q1}, [%1]! \n" // load 16 sobely. + "subs %3, %3, #16 \n" // 16 processed per loop. + "vqadd.u8 q0, q0, q1 \n" // add + "vst1.8 {q0}, [%2]! \n" // store 16 pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2727,115 +2506,186 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "vmov.u8 d3, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. - MEMACCESS(1) - "vld1.8 {d0}, [%1]! \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vqadd.u8 d1, d0, d2 \n" // add - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. - "bgt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "q0", "q1" - ); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "vmov.u8 d3, #255 \n" // alpha + // 8 pixel loop. 
+ "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + "vld1.8 {d0}, [%1]! \n" // load 8 sobely. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d1, d0, d2 \n" // add + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. + "bgt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%5 \n" // top - MEMACCESS(0) - "vld1.8 {d1}, [%0],%6 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(1) - "vld1.8 {d2}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%6 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(2) - "vld1.8 {d2}, [%2],%5 \n" // bottom - MEMACCESS(2) - "vld1.8 {d3}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(3) - "vst1.8 {d0}, [%3]! \n" // store 8 sobelx - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 - : "cc", "memory", "q0", "q1" // Clobber List - ); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" + "subs %4, %4, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2), // %5 + "r"(6) // %6 + : "cc", "memory", "q0", "q1" // Clobber List + ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0],%4 \n" // left - MEMACCESS(1) - "vld1.8 {d1}, [%1],%4 \n" - "vsubl.u8 q0, d0, d1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "vld1.8 {d3}, [%1],%4 \n" - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vadd.s16 q0, q0, q1 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0],%5 \n" // right - MEMACCESS(1) - "vld1.8 {d3}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels - "vsubl.u8 q1, d2, d3 \n" - "vadd.s16 q0, q0, q1 \n" - "vabs.s16 q0, q0 \n" - "vqmovn.u16 d0, q0 \n" - MEMACCESS(2) - "vst1.8 {d0}, [%2]! 
\n" // store 8 sobely - "bgt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 - : "cc", "memory", "q0", "q1" // Clobber List - ); -} -#endif // defined(__ARM_NEON__) && !defined(__aarch64__) +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" + "vsubl.u8 q0, d0, d1 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vadd.s16 q0, q0, q1 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" + "subs %3, %3, #8 \n" // 8 pixels + "vsubl.u8 q1, d2, d3 \n" + "vadd.s16 q0, q0, q1 \n" + "vabs.s16 q0, q0 \n" + "vqmovn.u16 d0, q0 \n" + "vst1.8 {d0}, [%2]! \n" // store 8 sobely + "bgt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1), // %4 + "r"(6) // %5 + : "cc", "memory", "q0", "q1" // Clobber List + ); +} + +// %y passes a float as a scalar vector for vector * scalar multiply. +// the regoster must be d0 to d15 and indexed with [0] or [1] to access +// the float in the first or second float of the d-reg + +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {q1}, [%0]! \n" // load 8 shorts + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u16 q2, d2 \n" // 8 int's + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // adjust exponent + "vmul.f32 q3, q3, %y3 \n" + "vqshrn.u32 d2, q2, #13 \n" // isolate halffloat + "vqshrn.u32 d3, q3, #13 \n" + "vst1.8 {q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + + "1: \n" + "vld1.8 {d2}, [%0]! \n" // load 8 bytes + "subs %2, %2, #8 \n" // 8 pixels per loop + "vmovl.u8 q1, d2 \n" // 8 shorts + "vmovl.u16 q2, d2 \n" // 8 ints + "vmovl.u16 q3, d3 \n" + "vcvt.f32.u32 q2, q2 \n" // 8 floats + "vcvt.f32.u32 q3, q3 \n" + "vmul.f32 q2, q2, %y3 \n" // scale + "vmul.f32 q3, q3, %y3 \n" + "vst1.8 {q2, q3}, [%1]! \n" // store 8 floats + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "q1", "q2", "q3"); +} + +#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. 
#ifdef __cplusplus } // extern "C" diff --git a/libs/libvpx/third_party/libyuv/source/row_neon64.cc b/libs/libvpx/third_party/libyuv/source/row_neon64.cc index 6375d4f55f6..24b4520babc 100644 --- a/libs/libvpx/third_party/libyuv/source/row_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/row_neon64.cc @@ -19,118 +19,103 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 8 Y, 4 U and 4 V from 422 -#define READYUV422 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.s}[0], [%1], #4 \n" \ - MEMACCESS(2) \ - "ld1 {v1.s}[1], [%2], #4 \n" - -// Read 8 Y, 2 U and 2 V from 422 -#define READYUV411 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.h}[0], [%1], #2 \n" \ - MEMACCESS(2) \ - "ld1 {v2.h}[1], [%2], #2 \n" \ - "zip1 v1.8b, v2.8b, v2.8b \n" +#define READYUV422 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.s}[0], [%1], #4 \n" \ + "ld1 {v1.s}[1], [%2], #4 \n" // Read 8 Y, 8 U and 8 V from 444 -#define READYUV444 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v1.d}[0], [%1], #8 \n" \ - MEMACCESS(2) \ - "ld1 {v1.d}[1], [%2], #8 \n" \ - "uaddlp v1.8h, v1.16b \n" \ - "rshrn v1.8b, v1.8h, #1 \n" +#define READYUV444 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v1.d}[0], [%1], #8 \n" \ + "ld1 {v1.d}[1], [%2], #8 \n" \ + "uaddlp v1.8h, v1.16b \n" \ + "rshrn v1.8b, v1.8h, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 -#define READYUV400 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - "movi v1.8b , #128 \n" +#define READYUV400 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "movi v1.8b , #128 \n" // Read 8 Y and 4 UV from NV12 -#define READNV12 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV12 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 Y and 4 VU from NV21 -#define READNV21 \ - MEMACCESS(0) \ - "ld1 {v0.8b}, [%0], #8 \n" \ - MEMACCESS(1) \ - "ld1 {v2.8b}, [%1], #8 \n" \ - "uzp1 v3.8b, v2.8b, v2.8b \n" \ - "uzp2 v1.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READNV21 \ + "ld1 {v0.8b}, [%0], #8 \n" \ + "ld1 {v2.8b}, [%1], #8 \n" \ + "uzp1 v3.8b, v2.8b, v2.8b \n" \ + "uzp2 v1.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 YUY2 -#define READYUY2 \ - MEMACCESS(0) \ - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ - "uzp2 v3.8b, v1.8b, v1.8b \n" \ - "uzp1 v1.8b, v1.8b, v1.8b \n" \ - "ins v1.s[1], v3.s[0] \n" +#define READYUY2 \ + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" \ + "uzp2 v3.8b, v1.8b, v1.8b \n" \ + "uzp1 v1.8b, v1.8b, v1.8b \n" \ + "ins v1.s[1], v3.s[0] \n" // Read 8 UYVY -#define READUYVY \ - MEMACCESS(0) \ - "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ - "orr v0.8b, v3.8b, v3.8b \n" \ - "uzp1 v1.8b, v2.8b, v2.8b \n" \ - "uzp2 v3.8b, v2.8b, v2.8b \n" \ - "ins v1.s[1], v3.s[0] \n" - -#define YUVTORGB_SETUP \ - "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ - "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ - "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" - -#define YUVTORGB(vR, vG, vB) \ - "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ - "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ - "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ - "ushll v0.4s, v0.4h, #0 \n" \ - "mul v3.4s, v3.4s, v31.4s \n" \ - "mul v0.4s, v0.4s, 
v31.4s \n" \ - "sqshrun v0.4h, v0.4s, #16 \n" \ - "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ - "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ - "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ - "uxtl v2.8h, v2.8b \n" \ - "uxtl v1.8h, v1.8b \n" /* Extract U */ \ - "mul v3.8h, v1.8h, v27.8h \n" \ - "mul v5.8h, v1.8h, v29.8h \n" \ - "mul v6.8h, v2.8h, v30.8h \n" \ - "mul v7.8h, v2.8h, v28.8h \n" \ - "sqadd v6.8h, v6.8h, v5.8h \n" \ - "sqadd " #vB ".8h, v24.8h, v0.8h \n" /* B */ \ - "sqadd " #vG ".8h, v25.8h, v0.8h \n" /* G */ \ - "sqadd " #vR ".8h, v26.8h, v0.8h \n" /* R */ \ - "sqadd " #vB ".8h, " #vB ".8h, v3.8h \n" /* B */ \ - "sqsub " #vG ".8h, " #vG ".8h, v6.8h \n" /* G */ \ - "sqadd " #vR ".8h, " #vR ".8h, v7.8h \n" /* R */ \ - "sqshrun " #vB ".8b, " #vB ".8h, #6 \n" /* B */ \ - "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ - "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ - -void I444ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +#define READUYVY \ + "ld2 {v2.8b, v3.8b}, [%0], #16 \n" \ + "orr v0.8b, v3.8b, v3.8b \n" \ + "uzp1 v1.8b, v2.8b, v2.8b \n" \ + "uzp2 v3.8b, v2.8b, v2.8b \n" \ + "ins v1.s[1], v3.s[0] \n" + +#define YUVTORGB_SETUP \ + "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ + "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ + "ld1r {v31.4s}, [%[kYToRgb]] \n" \ + "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ + "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" + +#define YUVTORGB(vR, vG, vB) \ + "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ + "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ + "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ + "ushll v0.4s, v0.4h, #0 \n" \ + "mul v3.4s, v3.4s, v31.4s \n" \ + "mul v0.4s, v0.4s, v31.4s \n" \ + "sqshrun v0.4h, v0.4s, #16 \n" \ + "sqshrun2 v0.8h, v3.4s, #16 \n" /* Y */ \ + "uaddw v1.8h, v2.8h, v1.8b \n" /* Replicate UV */ \ + "mov v2.d[0], v1.d[1] \n" /* Extract V */ \ + "uxtl v2.8h, v2.8b \n" \ + "uxtl v1.8h, v1.8b \n" /* Extract U */ \ + "mul v3.8h, v1.8h, v27.8h \n" \ + "mul v5.8h, v1.8h, v29.8h \n" \ + "mul v6.8h, v2.8h, v30.8h \n" \ + "mul v7.8h, v2.8h, v28.8h \n" \ + "sqadd v6.8h, v6.8h, v5.8h \n" \ + "sqadd " #vB \ + ".8h, v24.8h, v0.8h \n" /* B */ \ + "sqadd " #vG \ + ".8h, v25.8h, v0.8h \n" /* G */ \ + "sqadd " #vR \ + ".8h, v26.8h, v0.8h \n" /* R */ \ + "sqadd " #vB ".8h, " #vB \ + ".8h, v3.8h \n" /* B */ \ + "sqsub " #vG ".8h, " #vG \ + ".8h, v6.8h \n" /* G */ \ + "sqadd " #vR ".8h, " #vR \ + ".8h, v7.8h \n" /* R */ \ + "sqshrun " #vB ".8b, " #vB \ + ".8h, #6 \n" /* B */ \ + "sqshrun " #vG ".8b, " #vG \ + ".8h, #6 \n" /* G */ \ + "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ + +void I444ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -140,7 +125,6 @@ void I444ToARGBRow_NEON(const uint8* src_y, READYUV444 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -157,10 +141,10 @@ void I444ToARGBRow_NEON(const uint8* src_y, ); } -void I422ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, +void I422ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -170,7 +154,6 @@ void I422ToARGBRow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - 
MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -187,11 +170,11 @@ void I422ToARGBRow_NEON(const uint8* src_y, ); } -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, +void I422AlphaToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + const uint8_t* src_a, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -199,10 +182,8 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) - MEMACCESS(3) "ld1 {v23.8b}, [%3], #8 \n" "subs %w5, %w5, #8 \n" - MEMACCESS(4) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -220,40 +201,10 @@ void I422AlphaToARGBRow_NEON(const uint8* src_y, ); } -void I411ToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ - "1: \n" - READYUV411 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - MEMACCESS(3) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} - -void I422ToRGBARow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgba, +void I422ToRGBARow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgba, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -263,7 +214,6 @@ void I422ToRGBARow_NEON(const uint8* src_y, READYUV422 YUVTORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -280,10 +230,10 @@ void I422ToRGBARow_NEON(const uint8* src_y, ); } -void I422ToRGB24Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb24, +void I422ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -292,7 +242,6 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUVTORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" - MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -309,97 +258,91 @@ void I422ToRGB24Row_NEON(const uint8* src_y, ); } -#define ARGBTORGB565 \ - "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v21.8h, #5 \n" /* RG */ \ - "sri v0.8h, v20.8h, #11 \n" /* RGB */ +#define ARGBTORGB565 \ + "shll v0.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v21.8h, #5 \n" /* RG */ \ + "sri v0.8h, v20.8h, #11 \n" /* RGB */ -void I422ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, +void I422ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_rgb565, const 
struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTORGB565 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_rgb565), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} - -#define ARGBTOARGB1555 \ - "shll v0.8h, v23.8b, #8 \n" /* A */ \ - "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ - "shll v20.8h, v20.8b, #8 \n" /* B */ \ - "sri v0.8h, v22.8h, #1 \n" /* AR */ \ - "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ - "sri v0.8h, v20.8h, #11 \n" /* ARGB */ - -void I422ToARGB1555Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb1555, + asm volatile( + YUVTORGB_SETUP + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb565), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +#define ARGBTOARGB1555 \ + "shll v0.8h, v23.8b, #8 \n" /* A */ \ + "shll v22.8h, v22.8b, #8 \n" /* R */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ + "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "sri v0.8h, v22.8h, #1 \n" /* AR */ \ + "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ + "sri v0.8h, v20.8h, #11 \n" /* ARGB */ + +void I422ToARGB1555Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb1555, const struct YuvConstants* yuvconstants, int width) { - asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" - READYUV422 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" - ARGBTOARGB1555 - MEMACCESS(3) - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_argb1555), // %3 - "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" - ); -} - -#define ARGBTOARGB4444 \ - /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ - "ushr v20.8b, v20.8b, #4 \n" /* B */ \ - "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ - "ushr v22.8b, v22.8b, #4 \n" /* R */ \ - "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ - "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ - "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ - "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ - -void I422ToARGB4444Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb4444, + asm volatile( + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" READYUV422 YUVTORGB( + v22, v21, + v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb1555), // %3 + "+r"(width) // %4 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +#define ARGBTOARGB4444 \ + /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ + "ushr v20.8b, v20.8b, #4 \n" /* B */ \ + "bic v21.8b, v21.8b, v4.8b \n" /* G */ \ + "ushr v22.8b, v22.8b, #4 \n" /* R */ \ + "bic v23.8b, v23.8b, v4.8b \n" /* A */ \ + "orr v0.8b, v20.8b, v21.8b \n" /* BG */ \ + "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ + "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ + +void I422ToARGB4444Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb4444, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -411,7 +354,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 - MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. 
"b.gt 1b \n" : "+r"(src_y), // %0 @@ -428,9 +370,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, ); } -void I400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -438,7 +378,6 @@ void I400ToARGBRow_NEON(const uint8* src_y, READYUV400 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -453,31 +392,26 @@ void I400ToARGBRow_NEON(const uint8* src_y, ); } -void J400ToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { - asm volatile ( - "movi v23.8b, #255 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v20.8b}, [%0], #8 \n" - "orr v21.8b, v20.8b, v20.8b \n" - "orr v22.8b, v20.8b, v20.8b \n" - "subs %w2, %w2, #8 \n" - MEMACCESS(1) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v20", "v21", "v22", "v23" - ); +void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { + asm volatile( + "movi v23.8b, #255 \n" + "1: \n" + "ld1 {v20.8b}, [%0], #8 \n" + "orr v21.8b, v20.8b, v20.8b \n" + "orr v22.8b, v20.8b, v20.8b \n" + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v20", "v21", "v22", "v23"); } -void NV12ToARGBRow_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_argb, +void NV12ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -487,7 +421,6 @@ void NV12ToARGBRow_NEON(const uint8* src_y, READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -503,9 +436,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, ); } -void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, +void NV21ToARGBRow_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( @@ -515,7 +448,6 @@ void NV21ToARGBRow_NEON(const uint8* src_y, READNV21 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 @@ -531,24 +463,22 @@ void NV21ToARGBRow_NEON(const uint8* src_y, ); } -void NV12ToRGB565Row_NEON(const uint8* src_y, - const uint8* src_uv, - uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, - int width) { +void NV12ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP "1: \n" READNV12 YUVTORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" - ARGBTORGB565 - MEMACCESS(2) - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
+ "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 + "+r"(dst_rgb24), // %2 "+r"(width) // %3 : [kUVToRB]"r"(&yuvconstants->kUVToRB), [kUVToG]"r"(&yuvconstants->kUVToG), @@ -559,23 +489,23 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, ); } -void YUY2ToARGBRow_NEON(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +void NV21ToRGB24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP - "movi v23.8b, #255 \n" "1: \n" - READYUY2 + READNV21 YUVTORGB(v22, v21, v20) - "subs %w2, %w2, #8 \n" - MEMACCESS(1) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" + "subs %w3, %w3, #8 \n" + "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_rgb24), // %2 + "+r"(width) // %3 : [kUVToRB]"r"(&yuvconstants->kUVToRB), [kUVToG]"r"(&yuvconstants->kUVToG), [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), @@ -585,21 +515,45 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, ); } -void UYVYToARGBRow_NEON(const uint8* src_uyvy, - uint8* dst_argb, +void NV12ToRGB565Row_NEON(const uint8_t* src_y, + const uint8_t* src_uv, + uint8_t* dst_rgb565, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + YUVTORGB_SETUP + "1: \n" READNV12 YUVTORGB( + v22, v21, + v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + // RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); +} + +void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" "1: \n" - READUYVY + READYUY2 YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" - MEMACCESS(1) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" - : "+r"(src_uyvy), // %0 + : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 : [kUVToRB]"r"(&yuvconstants->kUVToRB), @@ -611,870 +565,845 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, ); } -// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. -void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store U - MEMACCESS(2) - "st1 {v1.16b}, [%2], #16 \n" // store V - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Reads 16 U's and V's and writes out 16 pairs of UV. 
-void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load U - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %w3, %w3, #16 \n" // 16 processed per loop - MEMACCESS(2) - "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV - "b.gt 1b \n" - : - "+r"(src_u), // %0 - "+r"(src_v), // %1 - "+r"(dst_uv), // %2 - "+r"(width) // %3 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. -void CopyRow_NEON(const uint8* src, uint8* dst, int count) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 - "subs %w2, %w2, #32 \n" // 32 processed per loop - MEMACCESS(1) - "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 // Output registers - : // Input registers - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -// SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint8 v8, int count) { - asm volatile ( - "dup v0.16b, %w2 \n" // duplicate 16 bytes - "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v8) // %2 - : "cc", "memory", "v0" - ); -} - -void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { - asm volatile ( - "dup v0.4s, %w2 \n" // duplicate 4 ints - "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" - : "+r"(dst), // %0 - "+r"(count) // %1 - : "r"(v32) // %2 - : "cc", "memory", "v0" - ); -} - -void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "rev64 v0.16b, v0.16b \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); -} - -void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. - "rev64 v0.8b, v0.8b \n" - "rev64 v1.8b, v1.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // dst += 8 - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" - "b.gt 1b \n" - : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"((ptrdiff_t)-16) // %4 - : "cc", "memory", "v0", "v1" - ); -} - -void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { - asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. 
- "rev64 v0.4s, v0.4s \n" - MEMACCESS(1) - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - MEMACCESS(1) - "st1 {v0.D}[0], [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0" - ); -} - -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { - asm volatile ( - "movi v4.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { - asm volatile ( - "movi v5.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -#define RGB565TOARGB \ - "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ - "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ - "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ - "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ - "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ - "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ - "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ - "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ - "dup v2.2D, v0.D[1] \n" /* R */ - -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- RGB565TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); -} - -#define ARGB1555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ - \ - "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ - "xtn2 v3.16b, v2.8h \n" \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ - "dup v1.2D, v0.D[1] \n" \ - "dup v3.2D, v2.D[1] \n" - -// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. -#define RGB555TOARGB \ - "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ - "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ - "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ - \ - "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ - "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ - \ - "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ - "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ - "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ - \ - "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ - "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ - "dup v1.2D, v0.D[1] \n" /* G */ \ - -void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int width) { - asm volatile ( - "movi v3.8b, #255 \n" // Alpha - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -#define ARGB4444TOARGB \ - "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ - "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ - "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ - "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ - "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ - "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ - "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ - "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ - "dup v0.2D, v2.D[1] \n" \ - "dup v1.2D, v3.D[1] \n" - -void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - MEMACCESS(1) - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); -} - -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v4.8b, v2.8b, v2.8b \n" // mov g - "orr v5.8b, v1.8b, v1.8b \n" // mov b - MEMACCESS(1) - "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_raw), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); -} - -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %w2, %w2, #16 \n" // 16 processed per loop. - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1" // Clobber List - ); -} - -void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v1.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v3.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, - int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels - "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 U. - MEMACCESS(2) - "st1 {v2.8b}, [%2], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_yuy2b = src_yuy2 + stride_yuy2; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U - "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v3.8b}, [%3], #8 \n" // store 8 V. 
- "b.gt 1b \n" - : "+r"(src_yuy2), // %0 - "+r"(src_yuy2b), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); -} - -void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_uyvyb = src_uyvy + stride_uyvy; - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row - "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U - "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 U. - MEMACCESS(3) - "st1 {v2.8b}, [%3], #8 \n" // store 8 V. - "b.gt 1b \n" - : "+r"(src_uyvy), // %0 - "+r"(src_uyvyb), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v5", "v6", "v7" // Clobber List - ); -} - -// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. -void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // shuffler - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %w2, %w2, #4 \n" // 4 processed per loop - "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store 4. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); -} - -void I422ToYUY2Row_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_yuy2, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "orr v2.8b, v1.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_yuy2), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); -} - -void I422ToUYVYRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_uyvy, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "orr v3.8b, v2.8b, v2.8b \n" - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us - MEMACCESS(2) - "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels - MEMACCESS(3) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_uyvy), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); -} - -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTORGB565 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. 
- "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_rgb565), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { - asm volatile ( - "dup v1.4s, %w2 \n" // dither4 - "1: \n" - MEMACCESS(1) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v20.8b, v20.8b, v1.8b \n" - "uqadd v21.8b, v21.8b, v1.8b \n" - "uqadd v22.8b, v22.8b, v1.8b \n" - ARGBTORGB565 - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. - "b.gt 1b \n" - : "+r"(dst_rgb) // %0 - : "r"(src_argb), // %1 - "r"(dither4), // %2 - "r"(width) // %3 - : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, - int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB1555 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb1555), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, - int width) { - asm volatile ( - "movi v4.16b, #0x0f \n" // bits to clear with vbic. - "1: \n" - MEMACCESS(0) - "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGBTOARGB4444 - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb4444), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" - ); -} - -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); -} - -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #15 \n" // B * 0.11400 coefficient - "movi v5.8b, #75 \n" // G * 0.58700 coefficient - "movi v6.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. 
- "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); -} - -// 8x1 pixels. -void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { +void UYVYToARGBRow_NEON(const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( - "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient - "movi v25.8b, #74 \n" // UG -0.5781 coefficient - "movi v26.8b, #38 \n" // UR -0.2969 coefficient - "movi v27.8b, #18 \n" // VB -0.1406 coefficient - "movi v28.8b, #94 \n" // VG -0.7344 coefficient - "movi v29.16b,#0x80 \n" // 128.5 + YUVTORGB_SETUP + "movi v23.8b, #255 \n" "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlsl v4.8h, v1.8b, v25.8b \n" // G - "umlsl v4.8h, v2.8b, v26.8b \n" // R - "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned - - "umull v3.8h, v2.8b, v24.8b \n" // R - "umlsl v3.8h, v1.8b, v28.8b \n" // G - "umlsl v3.8h, v0.8b, v27.8b \n" // B - "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned - - "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + READUYVY + YUVTORGB(v22, v21, v20) + "subs %w2, %w2, #8 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", - "v24", "v25", "v26", "v27", "v28", "v29" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ +// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +void SplitUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store U + "st1 {v1.16b}, [%2], #16 \n" // store V + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. 
-void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, +// Reads 16 U's and V's and writes out 16 pairs of UV. +void MergeUVRow_NEON(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load U + "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop + "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV + "b.gt 1b \n" + : "+r"(src_u), // %0 + "+r"(src_v), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. +void SplitRGBRow_NEON(const uint8_t* src_rgb, + uint8_t* dst_r, + uint8_t* dst_g, + uint8_t* dst_b, + int width) { + asm volatile( + "1: \n" + "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st1 {v0.16b}, [%1], #16 \n" // store R + "st1 {v1.16b}, [%2], #16 \n" // store G + "st1 {v2.16b}, [%3], #16 \n" // store B + "b.gt 1b \n" + : "+r"(src_rgb), // %0 + "+r"(dst_r), // %1 + "+r"(dst_g), // %2 + "+r"(dst_b), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time +void MergeRGBRow_NEON(const uint8_t* src_r, + const uint8_t* src_g, + const uint8_t* src_b, + uint8_t* dst_rgb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load R + "ld1 {v1.16b}, [%1], #16 \n" // load G + "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB + "b.gt 1b \n" + : "+r"(src_r), // %0 + "+r"(src_g), // %1 + "+r"(src_b), // %2 + "+r"(dst_rgb), // %3 + "+r"(width) // %4 + : // Input registers + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +// Copy multiple of 32. +void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 processed per loop + "stp q0, q1, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 // Output registers + : // Input registers + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +// SetRow writes 'width' bytes using an 8 bit value repeated. +void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { + asm volatile( + "dup v0.16b, %w2 \n" // duplicate 16 bytes + "1: \n" + "subs %w1, %w1, #16 \n" // 16 bytes per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v8) // %2 + : "cc", "memory", "v0"); +} + +void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { + asm volatile( + "dup v0.4s, %w2 \n" // duplicate 4 ints + "1: \n" + "subs %w1, %w1, #4 \n" // 4 ints per loop + "st1 {v0.16b}, [%0], #16 \n" // store + "b.gt 1b \n" + : "+r"(dst), // %0 + "+r"(width) // %1 + : "r"(v32) // %2 + : "cc", "memory", "v0"); +} + +void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
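// MirrorRow above starts at the end of the row ("add %0, %0, %w2, sxtw"
// then "sub %0, %0, #16") and walks backwards via the -16 post-index held
// in %3; the "rev64" and the swapped D[1]/D[0] stores that follow reverse
// bytes within each 64-bit half and then swap the halves, so each 16-byte
// block comes out fully reversed. The kernel reduces to this scalar loop
// (sketch only, not part of the patch):
static inline void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}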
+ "rev64 v0.16b, v0.16b \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void MirrorUVRow_NEON(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w3, sxtw #1 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 + "subs %w3, %w3, #8 \n" // 8 pixels per loop. + "rev64 v0.8b, v0.8b \n" + "rev64 v1.8b, v1.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // dst += 8 + "st1 {v1.8b}, [%2], #8 \n" + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"((ptrdiff_t)-16) // %4 + : "cc", "memory", "v0", "v1"); +} + +void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + // Start at end of source row. + "add %0, %0, %w2, sxtw #2 \n" + "sub %0, %0, #16 \n" + "1: \n" + "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "rev64 v0.4s, v0.4s \n" + "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 + "st1 {v0.D}[0], [%1], #8 \n" + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-16) // %3 + : "cc", "memory", "v0"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, int width) { - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(0) - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%0], #64 \n" // load next 16. - "uaddlp v4.8h, v4.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v5.8h, v5.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v6.8h, v6.16b \n" // R 16 bytes -> 8 shorts. - - "addp v0.8h, v0.8h, v4.8h \n" // B 16 shorts -> 8 shorts. - "addp v1.8h, v1.8h, v5.8h \n" // G 16 shorts -> 8 shorts. - "addp v2.8h, v2.8h, v6.8h \n" // R 16 shorts -> 8 shorts. + asm volatile( + "movi v4.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { + asm volatile( + "movi v5.8b, #255 \n" // Alpha + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { + asm volatile( + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "orr v3.8b, v1.8b, v1.8b \n" // move g + "orr v4.8b, v0.8b, v0.8b \n" // move r + "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +#define RGB565TOARGB \ + "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ + "shl v6.8b, v6.8b, #2 \n" /* G GGGGGG00 upper 6 */ \ + "ushr v4.8b, v6.8b, #6 \n" /* G 000000GG lower 2 */ \ + "orr v1.8b, v4.8b, v6.8b \n" /* G */ \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "ushr v0.8h, v0.8h, #11 \n" /* R 000RRRRR */ \ + "xtn2 v2.16b,v0.8h \n" /* R in upper part */ \ + "shl v2.16b, v2.16b, #3 \n" /* R,B BBBBB000 upper 5 */ \ + "ushr v0.16b, v2.16b, #5 \n" /* R,B 00000BBB lower 3 */ \ + "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ + "dup v2.2D, v0.D[1] \n" /* R */ + +void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List + ); +} + +#define ARGB1555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 AAAAAAAA */ \ + \ + "sshr v2.8h, v0.8h, #15 \n" /* A AAAAAAAA */ \ + "xtn2 v3.16b, v2.8h \n" \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R,A 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R,A */ \ + "dup v1.2D, v0.D[1] \n" \ + "dup v3.2D, v2.D[1] \n" - "urshr v0.8h, v0.8h, #1 \n" // 2x average - "urshr v1.8h, v1.8h, #1 \n" - "urshr v2.8h, v2.8h, #1 \n" +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ + "shl v2.8h, v2.8h, #3 \n" /* R RRRRR000 upper 5 */ \ + "xtn v3.8b, v2.8h \n" /* RRRRR000 */ \ + \ + "xtn v2.8b, v0.8h \n" /* B xxxBBBBB */ \ + "shrn2 v2.16b,v0.8h, #5 \n" /* G xxxGGGGG */ \ + \ + "ushr v1.16b, v3.16b, #5 \n" /* R 00000RRR lower 3 */ \ + "shl v0.16b, v2.16b, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "ushr v2.16b, v0.16b, #5 \n" /* B,G 00000BBB lower 3 */ \ + \ + "orr v0.16b, v0.16b, v2.16b \n" /* B,G */ \ + "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ + "dup v1.2D, v0.D[1] \n" /* G */ + +void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // Alpha + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ ARGB1555TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +#define ARGB4444TOARGB \ + "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ + "xtn2 v1.16b, v0.8h \n" /* v1(h) GB */ \ + "shl v2.16b, v1.16b, #4 \n" /* B,R BBBB0000 */ \ + "ushr v3.16b, v1.16b, #4 \n" /* G,A 0000GGGG */ \ + "ushr v0.16b, v2.16b, #4 \n" /* B,R 0000BBBB */ \ + "shl v1.16b, v3.16b, #4 \n" /* G,A GGGG0000 */ \ + "orr v2.16b, v0.16b, v2.16b \n" /* B,R BBBBBBBB */ \ + "orr v3.16b, v1.16b, v3.16b \n" /* G,A GGGGGGGG */ \ + "dup v0.2D, v2.D[1] \n" \ + "dup v1.2D, v3.D[1] \n" + +void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRGB24Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb24, + int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of + // RGB24. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + ); +} + +void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { + asm volatile( + "1: \n" + "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "orr v4.8b, v2.8b, v2.8b \n" // mov g + "orr v5.8b, v1.8b, v1.8b \n" // mov b + "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List + ); +} + +void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "subs %w2, %w2, #16 \n" // 16 processed per loop. + "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1" // Clobber List + ); +} + +void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v1.8b}, [%1], #8 \n" // store 8 U. + "st1 {v3.8b}, [%2], #8 \n" // store 8 V. 
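// The packed 4:2:2 layouts drive the lane choices in this stretch: YUY2 is
// Y0,U,Y1,V per pixel pair and UYVY is U,Y0,V,Y1, which is why YUY2ToYRow
// keeps lane 0 of an ld2 while UYVYToYRow keeps lane 1, and why the UV422
// kernels pick lanes 1/3 (YUY2) versus 0/2 (UYVY) of an ld4. Scalar sketch
// for one YUY2 group (name illustrative):
static inline void Yuy2GroupToUv(const uint8_t yuy2[4],
                                 uint8_t* u, uint8_t* v) {
  *u = yuy2[1];  // lane 1 of the 4-byte group
  *v = yuy2[3];  // lane 3
}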
+ "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "st1 {v0.8b}, [%1], #8 \n" // store 8 U. + "st1 {v2.8b}, [%2], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V + "st1 {v1.8b}, [%2], #8 \n" // store 8 U. + "st1 {v3.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(src_yuy2b), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} + +void UYVYToUVRow_NEON(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_uyvyb = src_uyvy + stride_uyvy; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row + "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V + "st1 {v0.8b}, [%2], #8 \n" // store 8 U. + "st1 {v2.8b}, [%3], #8 \n" // store 8 V. + "b.gt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(src_uyvyb), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7" // Clobber List + ); +} - "subs %w3, %w3, #32 \n" // 32 processed per loop. - "mul v3.8h, v0.8h, v20.8h \n" // B - "mls v3.8h, v1.8h, v21.8h \n" // G - "mls v3.8h, v2.8h, v22.8h \n" // R - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "mul v4.8h, v2.8h, v20.8h \n" // R - "mls v4.8h, v1.8h, v24.8h \n" // G - "mls v4.8h, v0.8h, v23.8h \n" // B - "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. - MEMACCESS(2) - "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25" - ); +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +void ARGBShuffleRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // shuffler + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. + "subs %w2, %w2, #4 \n" // 4 processed per loop + "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels + "st1 {v1.16b}, [%1], #16 \n" // store 4. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "cc", "memory", "v0", "v1", "v2" // Clobber List + ); +} + +void I422ToYUY2Row_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_yuy2, + int width) { + asm volatile( + "1: \n" + "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys + "orr v2.8b, v1.8b, v1.8b \n" + "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void I422ToUYVYRow_NEON(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uyvy, + int width) { + asm volatile( + "1: \n" + "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys + "orr v3.8b, v2.8b, v2.8b \n" + "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us + "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs + "subs %w4, %w4, #16 \n" // 16 pixels + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToRGB565Row_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb565, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTORGB565 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_rgb565), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { + asm volatile( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb1555, + int width) { + asm volatile( + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB1555 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB1555. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb1555), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v20", "v21", "v22", "v23"); +} + +void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_argb4444, + int width) { + asm volatile( + "movi v4.16b, #0x0f \n" // bits to clear with + // vbic. + "1: \n" + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGBTOARGB4444 + "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels + // ARGB4444. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb4444), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); +} + +void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_a), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #15 \n" // B * 0.11400 coefficient + "movi v5.8b, #75 \n" // G * 0.58700 coefficient + "movi v6.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } +// 8x1 pixels. +void ARGBToUV444Row_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "movi v24.8b, #112 \n" // UB / VR 0.875 + // coefficient + "movi v25.8b, #74 \n" // UG -0.5781 coefficient + "movi v26.8b, #38 \n" // UR -0.2969 coefficient + "movi v27.8b, #18 \n" // VB -0.1406 coefficient + "movi v28.8b, #94 \n" // VG -0.7344 coefficient + "movi v29.16b,#0x80 \n" // 128.5 + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + // pixels. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlsl v4.8h, v1.8b, v25.8b \n" // G + "umlsl v4.8h, v2.8b, v26.8b \n" // R + "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned + + "umull v3.8h, v2.8b, v24.8b \n" // R + "umlsl v3.8h, v1.8b, v28.8b \n" // G + "umlsl v3.8h, v0.8b, v27.8b \n" // B + "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. 
+ "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", + "v27", "v28", "v29"); +} + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -#define RGBTOUV(QB, QG, QR) \ - "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ - "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ - "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ - "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ - "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ - "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ - "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ - "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ - "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ - "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format off +#define RGBTOUV(QB, QG, QR) \ + "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ + "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ + "mls v3.8h, " #QG ",v21.8h \n" /* G */ \ + "mls v4.8h, " #QG ",v24.8h \n" /* G */ \ + "mls v3.8h, " #QR ",v22.8h \n" /* R */ \ + "mls v4.8h, " #QB ",v23.8h \n" /* B */ \ + "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \ + "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \ + "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \ + "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */ +// clang-format on // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. -void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; +void ARGBToUVRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1486,9 +1415,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1503,9 +1430,12 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } // TODO(fbarchard): Subsample match C code. 
-void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb_1 = src_argb + src_stride_argb; +void ARGBToUVJRow_NEON(const uint8_t* src_argb, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2 @@ -1514,12 +1444,10 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1531,9 +1459,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_argb), // %0 @@ -1547,18 +1473,19 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, ); } -void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_bgra_1 = src_bgra + src_stride_bgra; +void BGRAToUVRow_NEON(const uint8_t* src_bgra, + int src_stride_bgra, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1570,9 +1497,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_bgra), // %0 @@ -1586,18 +1511,19 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, ); } -void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_abgr_1 = src_abgr + src_stride_abgr; +void ABGRToUVRow_NEON(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. 
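// The BGRA/ABGR/RGBA variants in this hunk reuse the RGBTOUV core
// unchanged; only which ld4 lane feeds the B/G/R accumulators differs,
// and ARGBToUVJRow differs only in its constants (full-range JPEG
// coefficients, again pre-halved, as its movi comments note). The lane
// assignments, read off the uaddlp lines (table for orientation only,
// not part of this patch):
static const struct { unsigned char b, g, r; } kUvLaneOrder[] = {
    {0, 1, 2},  // ARGBToUVRow / ARGBToUVJRow
    {3, 2, 1},  // BGRAToUVRow
    {2, 1, 0},  // ABGRToUVRow
    {1, 2, 3},  // RGBAToUVRow
};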
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1609,9 +1535,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_abgr), // %0 @@ -1625,18 +1549,19 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, ); } -void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgba_1 = src_rgba + src_stride_rgba; +void RGBAToUVRow_NEON(const uint8_t* src_rgba, + int src_stride_rgba, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. @@ -1648,9 +1573,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgba), // %0 @@ -1664,18 +1587,19 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, ); } -void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; +void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1687,9 +1611,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_rgb24), // %0 @@ -1703,18 +1625,19 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, ); } -void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_raw_1 = src_raw + src_stride_raw; +void RAWToUVRow_NEON(const uint8_t* src_raw, + int src_stride_raw, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG "1: \n" - MEMACCESS(0) "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. - MEMACCESS(1) "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -1726,9 +1649,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) - MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" : "+r"(src_raw), // %0 @@ -1743,699 +1664,656 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; - asm volatile ( - "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 - "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 - "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 - "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 - "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 - "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - RGB565TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. - RGB565TOARGB - "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v17.D[0] \n" - "ins v18.D[1], v19.D[0] \n" - "ins v20.D[1], v21.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v18.8h, #1 \n" - "urshr v6.8h, v20.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v16.8h, v4.8h, v22.8h \n" // B - "mls v16.8h, v5.8h, v23.8h \n" // G - "mls v16.8h, v6.8h, v24.8h \n" // R - "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned - "mul v17.8h, v6.8h, v22.8h \n" // R - "mls v17.8h, v5.8h, v26.8h \n" // G - "mls v17.8h, v4.8h, v25.8h \n" // B - "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(src_rgb565_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27" - ); +void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; + asm volatile( + "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / + // 2 + "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2 + "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2 + "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2 + "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2 + "movi v27.16b, #0x80 \n" // 128.5 0x8080 in 16bit + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. + RGB565TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. + RGB565TOARGB + "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v17.D[0] \n" + "ins v18.D[1], v19.D[0] \n" + "ins v20.D[1], v21.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v18.8h, #1 \n" + "urshr v6.8h, v20.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v16.8h, v4.8h, v22.8h \n" // B + "mls v16.8h, v5.8h, v23.8h \n" // G + "mls v16.8h, v6.8h, v24.8h \n" // R + "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned + "mul v17.8h, v6.8h, v22.8h \n" // R + "mls v17.8h, v5.8h, v26.8h \n" // G + "mls v17.8h, v4.8h, v25.8h \n" // B + "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(src_rgb565_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. 
- RGB555TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. - RGB555TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(src_argb1555_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" - ); +void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb1555_1 = src_argb1555 + src_stride_argb1555; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. 
+ "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_argb1555_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. -void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { - const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; - asm volatile ( - RGBTOUV_SETUP_REG - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. - ARGB4444TOARGB - "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. - "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. - "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. - - "ins v16.D[1], v26.D[0] \n" - "ins v17.D[1], v27.D[0] \n" - "ins v18.D[1], v28.D[0] \n" - - "urshr v4.8h, v16.8h, #1 \n" // 2x average - "urshr v5.8h, v17.8h, #1 \n" - "urshr v6.8h, v18.8h, #1 \n" - - "subs %w4, %w4, #16 \n" // 16 processed per loop. - "mul v2.8h, v4.8h, v20.8h \n" // B - "mls v2.8h, v5.8h, v21.8h \n" // G - "mls v2.8h, v6.8h, v22.8h \n" // R - "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned - "mul v3.8h, v6.8h, v20.8h \n" // R - "mls v3.8h, v5.8h, v24.8h \n" // G - "mls v3.8h, v4.8h, v23.8h \n" // B - "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned - "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U - "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. - MEMACCESS(3) - "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. 
- "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(src_argb4444_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28" - - ); -} - -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - RGB565TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb565), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", - "v24", "v25", "v26", "v27" - ); -} - -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB1555TOARGB - "umull v3.8h, v0.8b, v4.8b \n" // B - "umlal v3.8h, v1.8b, v5.8b \n" // G - "umlal v3.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb1555), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); -} - -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { - asm volatile ( - "movi v24.8b, #13 \n" // B * 0.1016 coefficient - "movi v25.8b, #65 \n" // G * 0.5078 coefficient - "movi v26.8b, #33 \n" // R * 0.2578 coefficient - "movi v27.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - ARGB4444TOARGB - "umull v3.8h, v0.8b, v24.8b \n" // B - "umlal v3.8h, v1.8b, v25.8b \n" // G - "umlal v3.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v27.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_argb4444), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" - ); -} - -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v1.8b, v4.8b \n" // R - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_bgra), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // R - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // B - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_abgr), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v1.8b, v4.8b \n" // B - "umlal v16.8h, v2.8b, v5.8b \n" // G - "umlal v16.8h, v3.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgba), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #13 \n" // B * 0.1016 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #33 \n" // R * 0.2578 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); -} - -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { - asm volatile ( - "movi v4.8b, #33 \n" // R * 0.2578 coefficient - "movi v5.8b, #65 \n" // G * 0.5078 coefficient - "movi v6.8b, #13 \n" // B * 0.1016 coefficient - "movi v7.8b, #16 \n" // Add 16 constant - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. 
- "umull v16.8h, v0.8b, v4.8b \n" // B - "umlal v16.8h, v1.8b, v5.8b \n" // G - "umlal v16.8h, v2.8b, v6.8b \n" // R - "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y - "uqadd v0.8b, v0.8b, v7.8b \n" - MEMACCESS(1) - "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); +void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + const uint8_t* src_argb4444_1 = src_argb4444 + src_stride_argb4444; + asm volatile( + RGBTOUV_SETUP_REG + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts. + "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. + + "ins v16.D[1], v26.D[0] \n" + "ins v17.D[1], v27.D[0] \n" + "ins v18.D[1], v28.D[0] \n" + + "urshr v4.8h, v16.8h, #1 \n" // 2x average + "urshr v5.8h, v17.8h, #1 \n" + "urshr v6.8h, v18.8h, #1 \n" + + "subs %w4, %w4, #16 \n" // 16 processed per loop. + "mul v2.8h, v4.8h, v20.8h \n" // B + "mls v2.8h, v5.8h, v21.8h \n" // G + "mls v2.8h, v6.8h, v22.8h \n" // R + "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned + "mul v3.8h, v6.8h, v20.8h \n" // R + "mls v3.8h, v5.8h, v24.8h \n" // G + "mls v3.8h, v4.8h, v23.8h \n" // B + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V + "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. + "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_argb4444_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", + "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28" + + ); +} + +void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + RGB565TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_rgb565), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", + "v27"); +} + +void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "umull v3.8h, v0.8b, v4.8b \n" // B + "umlal v3.8h, v1.8b, v5.8b \n" // G + "umlal v3.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width) { + asm volatile( + "movi v24.8b, #13 \n" // B * 0.1016 coefficient + "movi v25.8b, #65 \n" // G * 0.5078 coefficient + "movi v26.8b, #33 \n" // R * 0.2578 coefficient + "movi v27.8b, #16 \n" // Add 16 constant + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "umull v3.8h, v0.8b, v24.8b \n" // B + "umlal v3.8h, v1.8b, v25.8b \n" // G + "umlal v3.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v27.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); +} + +void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // R + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // R + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // B + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. 
+ "b.gt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v1.8b, v4.8b \n" // B + "umlal v16.8h, v2.8b, v5.8b \n" // G + "umlal v16.8h, v3.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #13 \n" // B * 0.1016 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #33 \n" // R * 0.2578 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); +} + +void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { + asm volatile( + "movi v4.8b, #33 \n" // R * 0.2578 coefficient + "movi v5.8b, #65 \n" // G * 0.5078 coefficient + "movi v6.8b, #13 \n" // B * 0.1016 coefficient + "movi v7.8b, #16 \n" // Add 16 constant + "1: \n" + "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v16.8h, v0.8b, v4.8b \n" // B + "umlal v16.8h, v1.8b, v5.8b \n" // G + "umlal v16.8h, v2.8b, v6.8b \n" // R + "sqrshrun v0.8b, v16.8h, #7 \n" // 16 bit to 8 bit Y + "uqadd v0.8b, v0.8b, v7.8b \n" + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. + "b.gt 1b \n" + : "+r"(src_raw), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // Bilinear filter 16x2 -> 16x1 -void InterpolateRow_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { +void InterpolateRow_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; - const uint8* src_ptr1 = src_ptr + src_stride; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - - "dup v5.16b, %w4 \n" - "dup v4.16b, %w5 \n" - // General purpose row blend. 
- "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v2.8h, v0.8b, v4.8b \n" - "umull2 v3.8h, v0.16b, v4.16b \n" - "umlal v2.8h, v1.8b, v5.8b \n" - "umlal2 v3.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v2.8h, #8 \n" - "rshrn2 v0.16b, v3.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(dst_width), // %3 - "+r"(y1_fraction), // %4 - "+r"(y0_fraction) // %5 - : - : "cc", "memory", "v0", "v1", "v3", "v4", "v5" - ); + const uint8_t* src_ptr1 = src_ptr + src_stride; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + + "dup v5.16b, %w4 \n" + "dup v4.16b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v2.8h, v0.8b, v4.8b \n" + "umull2 v3.8h, v0.16b, v4.16b \n" + "umlal v2.8h, v1.8b, v5.8b \n" + "umlal2 v3.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v2.8h, #8 \n" + "rshrn2 v0.16b, v3.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(dst_width), // %3 + "+r"(y1_fraction), // %4 + "+r"(y0_fraction) // %5 + : + : "cc", "memory", "v0", "v1", "v3", "v4", "v5"); } // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr -void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - "subs %w3, %w3, #8 \n" - "b.lt 89f \n" - // Blend 8 pixels. - "8: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.ge 8b \n" - - "89: \n" - "adds %w3, %w3, #8-1 \n" - "b.lt 99f \n" - - // Blend 1 pixels. 
- "1: \n" - MEMACCESS(0) - "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. - MEMACCESS(1) - "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %w3, %w3, #1 \n" // 1 processed per loop. - "umull v16.8h, v4.8b, v3.8b \n" // db * a - "umull v17.8h, v5.8b, v3.8b \n" // dg * a - "umull v18.8h, v6.8b, v3.8b \n" // dr * a - "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 - "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 - "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 - "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) - "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) - "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) - "uqadd v0.8b, v0.8b, v4.8b \n" // + sb - "uqadd v1.8b, v1.8b, v5.8b \n" // + sg - "uqadd v2.8b, v2.8b, v6.8b \n" // + sr - "movi v3.8b, #255 \n" // a = 255 - MEMACCESS(2) - "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. - "b.ge 1b \n" - - "99: \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18" - ); +void ARGBBlendRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + "subs %w3, %w3, #8 \n" + "b.lt 89f \n" + // Blend 8 pixels. + "8: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 + // pixels + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 + // pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + // pixels + "b.ge 8b \n" + + "89: \n" + "adds %w3, %w3, #8-1 \n" + "b.lt 99f \n" + + // Blend 1 pixels. + "1: \n" + "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. + "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. + "subs %w3, %w3, #1 \n" // 1 processed per loop. + "umull v16.8h, v4.8b, v3.8b \n" // db * a + "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "umull v18.8h, v6.8b, v3.8b \n" // dr * a + "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 + "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 + "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8 + "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256) + "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256) + "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256) + "uqadd v0.8b, v0.8b, v4.8b \n" // + sb + "uqadd v1.8b, v1.8b, v5.8b \n" // + sg + "uqadd v2.8b, v2.8b, v6.8b \n" // + sr + "movi v3.8b, #255 \n" // a = 255 + "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel. + "b.ge 1b \n" + + "99: \n" + + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18"); } // Attenuate 8 pixels at a time. -void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - // Attenuate 8 pixels. 
- "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v3.8b \n" // b * a - "umull v5.8h, v1.8b, v3.8b \n" // g * a - "umull v6.8h, v2.8b, v3.8b \n" // r * a - "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 - "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 - "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBAttenuateRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + asm volatile( + // Attenuate 8 pixels. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v3.8b \n" // b * a + "umull v5.8h, v1.8b, v3.8b \n" // g * a + "umull v6.8h, v2.8b, v3.8b \n" // r * a + "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 + "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 + "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB + // pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; -void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - asm volatile ( - "dup v4.8h, %w2 \n" - "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 - "dup v5.8h, %w3 \n" // interval multiply. - "dup v6.8h, %w4 \n" // interval add - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) - "uxtl v1.8h, v1.8b \n" - "uxtl v2.8h, v2.8b \n" - "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale - "sqdmulh v1.8h, v1.8h, v4.8h \n" // g - "sqdmulh v2.8h, v2.8h, v4.8h \n" // r - "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size - "mul v1.8h, v1.8h, v5.8h \n" // g - "mul v2.8h, v2.8h, v5.8h \n" // r - "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset - "add v1.8h, v1.8h, v6.8h \n" // g - "add v2.8h, v2.8h, v6.8h \n" // r - "uqxtn v0.8b, v0.8h \n" - "uqxtn v1.8b, v1.8h \n" - "uqxtn v2.8b, v2.8h \n" - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : "r"(scale), // %2 - "r"(interval_size), // %3 - "r"(interval_offset) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" - ); +void ARGBQuantizeRow_NEON(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + asm volatile( + "dup v4.8h, %w2 \n" + "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1 + "dup v5.8h, %w3 \n" // interval multiply. + "dup v6.8h, %w4 \n" // interval add + + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 
255) + "uxtl v1.8h, v1.8b \n" + "uxtl v2.8h, v2.8b \n" + "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale + "sqdmulh v1.8h, v1.8h, v4.8h \n" // g + "sqdmulh v2.8h, v2.8h, v4.8h \n" // r + "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size + "mul v1.8h, v1.8h, v5.8h \n" // g + "mul v2.8h, v2.8h, v5.8h \n" // r + "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset + "add v1.8h, v1.8h, v6.8h \n" // g + "add v2.8h, v2.8h, v6.8h \n" // r + "uqxtn v0.8b, v0.8h \n" + "uqxtn v1.8b, v1.8h \n" + "uqxtn v2.8b, v2.8h \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : "r"(scale), // %2 + "r"(interval_size), // %3 + "r"(interval_offset) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"); } // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. -void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. - "ushr v0.8h, v0.8h, #1 \n" // scale / 2. - - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v4.8h, v4.8b \n" // b (0 .. 255) - "uxtl v5.8h, v5.8b \n" - "uxtl v6.8h, v6.8b \n" - "uxtl v7.8h, v7.8b \n" - "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 - "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g - "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r - "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a - "uqxtn v4.8b, v4.8h \n" - "uqxtn v5.8b, v5.8h \n" - "uqxtn v6.8b, v6.8h \n" - "uqxtn v7.8b, v7.8h \n" - MEMACCESS(1) - "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "cc", "memory", "v0", "v4", "v5", "v6", "v7" - ); +void ARGBShadeRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { + asm volatile( + "dup v0.4s, %w3 \n" // duplicate scale value. + "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb. + "ushr v0.8h, v0.8h, #1 \n" // scale / 2. + + // 8 pixel loop. + "1: \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "uxtl v5.8h, v5.8b \n" + "uxtl v6.8h, v6.8b \n" + "uxtl v7.8h, v7.8b \n" + "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2 + "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g + "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r + "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a + "uqxtn v4.8b, v4.8h \n" + "uqxtn v5.8b, v5.8h \n" + "uqxtn v6.8b, v6.8h \n" + "uqxtn v7.8b, v7.8h \n" + "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "cc", "memory", "v0", "v4", "v5", "v6", "v7"); } // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. 
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7; -void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { - asm volatile ( - "movi v24.8b, #15 \n" // B * 0.11400 coefficient - "movi v25.8b, #75 \n" // G * 0.58700 coefficient - "movi v26.8b, #38 \n" // R * 0.29900 coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v24.8b \n" // B - "umlal v4.8h, v1.8b, v25.8b \n" // G - "umlal v4.8h, v2.8b, v26.8b \n" // R - "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B - "orr v1.8b, v0.8b, v0.8b \n" // G - "orr v2.8b, v0.8b, v0.8b \n" // R - MEMACCESS(1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" - ); +void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { + asm volatile( + "movi v24.8b, #15 \n" // B * 0.11400 coefficient + "movi v25.8b, #75 \n" // G * 0.58700 coefficient + "movi v26.8b, #38 \n" // R * 0.29900 coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v24.8b \n" // B + "umlal v4.8h, v1.8b, v25.8b \n" // G + "umlal v4.8h, v2.8b, v26.8b \n" // R + "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B + "orr v1.8b, v0.8b, v0.8b \n" // G + "orr v2.8b, v0.8b, v0.8b \n" // R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"); } // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. @@ -2443,194 +2321,180 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 -void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { - asm volatile ( - "movi v20.8b, #17 \n" // BB coefficient - "movi v21.8b, #68 \n" // BG coefficient - "movi v22.8b, #35 \n" // BR coefficient - "movi v24.8b, #22 \n" // GB coefficient - "movi v25.8b, #88 \n" // GG coefficient - "movi v26.8b, #45 \n" // GR coefficient - "movi v28.8b, #24 \n" // BB coefficient - "movi v29.8b, #98 \n" // BG coefficient - "movi v30.8b, #50 \n" // BR coefficient - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R - MEMACCESS(0) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. 
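ARGBGrayRow above follows the C recipe in its comment (rounded, via sqrshrun #7) and replicates the result into B, G and R; ARGBSepiaRow applies three such weightings, one per output channel, truncating (uqshrn #7) and saturating at 255. A scalar sketch of both, with the coefficients read off the movi setup (helper names illustrative):

#include <stdint.h>

static uint8_t clamp255(int v) { return (uint8_t)(v > 255 ? 255 : v); }

static uint8_t argb_gray(int r, int g, int b) {
  return clamp255((15 * b + 75 * g + 38 * r + 64) >> 7);
}

static void argb_sepia(uint8_t* px) { /* px = B,G,R,A in memory */
  int b = px[0], g = px[1], r = px[2];
  px[0] = clamp255((17 * b + 68 * g + 35 * r) >> 7); /* sepia B */
  px[1] = clamp255((22 * b + 88 * g + 45 * r) >> 7); /* sepia G */
  px[2] = clamp255((24 * b + 98 * g + 50 * r) >> 7); /* sepia R */
}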
- "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(width) // %1 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" - ); +void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { + asm volatile( + "movi v20.8b, #17 \n" // BB coefficient + "movi v21.8b, #68 \n" // BG coefficient + "movi v22.8b, #35 \n" // BR coefficient + "movi v24.8b, #22 \n" // GB coefficient + "movi v25.8b, #88 \n" // GG coefficient + "movi v26.8b, #45 \n" // GR coefficient + "movi v28.8b, #24 \n" // BB coefficient + "movi v29.8b, #98 \n" // BG coefficient + "movi v30.8b, #50 \n" // BR coefficient + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(width) // %1 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"); } // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. -void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. - "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. - "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. - - "1: \n" - MEMACCESS(0) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit - "uxtl v17.8h, v17.8b \n" // g - "uxtl v18.8h, v18.8b \n" // r - "uxtl v19.8h, v19.8b \n" // a - "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B - "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G - "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R - "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A - "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B - "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G - "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R - "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B - "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G - "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R - "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B - "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G - "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R - "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A - "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B - "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G - "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R - "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A - "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B - "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G - "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R - "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A - MEMACCESS(1) - "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(matrix_argb) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v22", "v23", "v24", "v25" - ); +void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + asm volatile( + "ld1 {v2.16b}, [%3] \n" // load 3 ARGB vectors. + "sxtl v0.8h, v2.8b \n" // B,G coefficients s16. + "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16. + + "1: \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "uxtl v17.8h, v17.8b \n" // g + "uxtl v18.8h, v18.8b \n" // r + "uxtl v19.8h, v19.8b \n" // a + "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B + "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G + "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R + "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A + "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B + "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G + "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R + "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B + "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G + "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R + "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B + "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G + "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R + "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A + "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B + "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G + "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R + "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A + "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B + "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G + "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R + "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A + "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(matrix_argb) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v22", "v23", "v24", "v25"); } // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // multiply B - "umull v1.8h, v1.8b, v5.8b \n" // multiply G - "umull v2.8h, v2.8b, v6.8b \n" // multiply R - "umull v3.8h, v3.8b, v7.8b \n" // multiply A - "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B - "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G - "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R - "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. 
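ARGBColorMatrixRow_NEON above loads 16 signed-byte coefficients, widens them to 16 bits, accumulates the four input-channel products per output channel with saturating adds, and narrows with sqshrun #6. Ignoring the 16-bit intermediate saturation, the per-pixel math is roughly the sketch below (row-major matrix layout inferred from the lane indexing; names illustrative):

#include <stdint.h>

static void color_matrix_pixel(const uint8_t* src, /* B,G,R,A */
                               uint8_t* dst,
                               const int8_t* m) { /* 4 rows of 4 weights */
  for (int ch = 0; ch < 4; ++ch) {
    int acc = src[0] * m[ch * 4 + 0] + /* B weight */
              src[1] * m[ch * 4 + 1] + /* G weight */
              src[2] * m[ch * 4 + 2] + /* R weight */
              src[3] * m[ch * 4 + 3];  /* A weight */
    acc >>= 6;
    dst[ch] = (uint8_t)(acc < 0 ? 0 : (acc > 255 ? 255 : acc));
  }
}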
+ "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "umull v2.8h, v2.8b, v6.8b \n" // multiply R + "umull v3.8h, v3.8b, v7.8b \n" // multiply A + "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B + "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G + "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R + "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Add 2 rows of ARGB pixels together, 8 pixels at a time. -void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v4.8b \n" - "uqadd v1.8b, v1.8b, v5.8b \n" - "uqadd v2.8b, v2.8b, v6.8b \n" - "uqadd v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBAddRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v4.8b \n" + "uqadd v1.8b, v1.8b, v5.8b \n" + "uqadd v2.8b, v2.8b, v6.8b \n" + "uqadd v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Subtract 2 rows of ARGB pixels, 8 pixels at a time. -void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { - asm volatile ( - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - MEMACCESS(1) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqsub v0.8b, v0.8b, v4.8b \n" - "uqsub v1.8b, v1.8b, v5.8b \n" - "uqsub v2.8b, v2.8b, v6.8b \n" - "uqsub v3.8b, v3.8b, v7.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - - : "+r"(src_argb0), // %0 - "+r"(src_argb1), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" - ); +void ARGBSubtractRow_NEON(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { + asm volatile( + // 8 pixel loop. + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "uqsub v0.8b, v0.8b, v4.8b \n" + "uqsub v1.8b, v1.8b, v5.8b \n" + "uqsub v2.8b, v2.8b, v6.8b \n" + "uqsub v3.8b, v3.8b, v7.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_argb0), // %0 + "+r"(src_argb1), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // Adds Sobel X and Sobel Y and stores Sobel into ARGB. @@ -2638,54 +2502,50 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // R = Sobel // G = Sobel // B = Sobel -void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v0.8b, v0.8b, v1.8b \n" // add - "orr v1.8b, v0.8b, v0.8b \n" - "orr v2.8b, v0.8b, v0.8b \n" - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void SobelRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v0.8b, v0.8b, v1.8b \n" // add + "orr v1.8b, v0.8b, v0.8b \n" + "orr v2.8b, v0.8b, v0.8b \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // Adds Sobel X and Sobel Y and stores Sobel into plane. -void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { - asm volatile ( - // 16 pixel loop. - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %w3, %w3, #16 \n" // 16 processed per loop. - "uqadd v0.16b, v0.16b, v1.16b \n" // add - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_y), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1" - ); +void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { + asm volatile( + // 16 pixel loop. + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. + "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "uqadd v0.16b, v0.16b, v1.16b \n" // add + "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_y), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1"); } // Mixes Sobel X, Sobel Y and Sobel into ARGB. @@ -2693,114 +2553,329 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { - asm volatile ( - "movi v3.8b, #255 \n" // alpha - // 8 pixel loop. 
- "1: \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. - MEMACCESS(1) - "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uqadd v1.8b, v0.8b, v2.8b \n" // add - MEMACCESS(2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels - "b.gt 1b \n" - : "+r"(src_sobelx), // %0 - "+r"(src_sobely), // %1 - "+r"(dst_argb), // %2 - "+r"(width) // %3 - : - : "cc", "memory", "v0", "v1", "v2", "v3" - ); +void SobelXYRow_NEON(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { + asm volatile( + "movi v3.8b, #255 \n" // alpha + // 8 pixel loop. + "1: \n" + "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. + "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v1.8b, v0.8b, v2.8b \n" // add + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB + "b.gt 1b \n" + : "+r"(src_sobelx), // %0 + "+r"(src_sobely), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); } // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 -void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%5 \n" // top - MEMACCESS(0) - "ld1 {v1.8b}, [%0],%6 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(1) - "ld1 {v2.8b}, [%1],%5 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%6 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" - MEMACCESS(2) - "ld1 {v2.8b}, [%2],%5 \n" // bottom - MEMACCESS(2) - "ld1 {v3.8b}, [%2],%6 \n" - "subs %w4, %w4, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(3) - "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(src_y2), // %2 - "+r"(dst_sobelx), // %3 - "+r"(width) // %4 - : "r"(2LL), // %5 - "r"(6LL) // %6 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void SobelXRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%5 \n" // top + "ld1 {v1.8b}, [%0],%6 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%1],%5 \n" // center * 2 + "ld1 {v3.8b}, [%1],%6 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%2],%5 \n" // bottom + "ld1 {v3.8b}, [%2],%6 \n" + "subs %w4, %w4, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(src_y2), // %2 + "+r"(dst_sobelx), // %3 + "+r"(width) // %4 + : "r"(2LL), // %5 + "r"(6LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); } // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 -void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0],%4 \n" // left - MEMACCESS(1) - "ld1 {v1.8b}, [%1],%4 \n" - "usubl v0.8h, v0.8b, v1.8b \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0],%4 \n" // center * 2 - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%4 \n" - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "add v0.8h, v0.8h, v1.8h \n" 
- MEMACCESS(0) - "ld1 {v2.8b}, [%0],%5 \n" // right - MEMACCESS(1) - "ld1 {v3.8b}, [%1],%5 \n" - "subs %w3, %w3, #8 \n" // 8 pixels - "usubl v1.8h, v2.8b, v3.8b \n" - "add v0.8h, v0.8h, v1.8h \n" - "abs v0.8h, v0.8h \n" - "uqxtn v0.8b, v0.8h \n" - MEMACCESS(2) - "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely - "b.gt 1b \n" - : "+r"(src_y0), // %0 - "+r"(src_y1), // %1 - "+r"(dst_sobely), // %2 - "+r"(width) // %3 - : "r"(1LL), // %4 - "r"(6LL) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); +void SobelYRow_NEON(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { + asm volatile( + "1: \n" + "ld1 {v0.8b}, [%0],%4 \n" // left + "ld1 {v1.8b}, [%1],%4 \n" + "usubl v0.8h, v0.8b, v1.8b \n" + "ld1 {v2.8b}, [%0],%4 \n" // center * 2 + "ld1 {v3.8b}, [%1],%4 \n" + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "add v0.8h, v0.8h, v1.8h \n" + "ld1 {v2.8b}, [%0],%5 \n" // right + "ld1 {v3.8b}, [%1],%5 \n" + "subs %w3, %w3, #8 \n" // 8 pixels + "usubl v1.8h, v2.8b, v3.8b \n" + "add v0.8h, v0.8h, v1.8h \n" + "abs v0.8h, v0.8h \n" + "uqxtn v0.8b, v0.8h \n" + "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely + "b.gt 1b \n" + : "+r"(src_y0), // %0 + "+r"(src_y1), // %1 + "+r"(dst_sobely), // %2 + "+r"(width) // %3 + : "r"(1LL), // %4 + "r"(6LL) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Caveat - rounds float to half float whereas scaling version truncates. +void HalfFloat1Row_NEON(const uint16_t* src, + uint16_t* dst, + float /*unused*/, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fcvtn v1.4h, v2.4s \n" // 8 half floats + "fcvtn2 v1.8h, v3.4s \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v1", "v2", "v3"); +} + +void HalfFloatRow_NEON(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v2.4s, v1.4h \n" // 8 int's + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent + "fmul v3.4s, v3.4s, %3.s[0] \n" + "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat + "uqshrn2 v1.8h, v3.4s, #13 \n" + "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale * 1.9259299444e-34f) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +void ByteToFloatRow_NEON(const uint8_t* src, + float* dst, + float scale, + int width) { + asm volatile( + "1: \n" + "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes + "subs %w2, %w2, #8 \n" // 8 pixels per loop + "uxtl v1.8h, v1.8b \n" // 8 shorts + "uxtl v2.4s, v1.4h \n" // 8 ints + "uxtl2 v3.4s, v1.8h \n" + "scvtf v2.4s, v2.4s \n" // 8 floats + "scvtf v3.4s, v3.4s \n" + "fmul v2.4s, v2.4s, %3.s[0] \n" // scale + "fmul v3.4s, v3.4s, %3.s[0] \n" + "st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "w"(scale) // %3 + : "cc", "memory", "v1", "v2", "v3"); +} + +float ScaleMaxSamples_NEON(const float* src, + float* dst, + float scale, + int width) { + float fmax; + asm volatile( + "movi v5.4s, #0 \n" 
 // max
+      "movi       v6.4s, #0                      \n"
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"  // scale
+      "fmax       v5.4s, v5.4s, v1.4s            \n"  // max
+      "fmax       v6.4s, v6.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      "fmax       v5.4s, v5.4s, v6.4s            \n"  // max
+      "fmaxv      %s3, v5.4s                     \n"  // max across all lanes
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fmax)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fmax;
+}
+
+float ScaleSumSamples_NEON(const float* src,
+                           float* dst,
+                           float scale,
+                           int width) {
+  float fsum;
+  asm volatile(
+      "movi       v5.4s, #0                      \n"  // zero accumulators
+      "movi       v6.4s, #0                      \n"
+
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v3.4s, v1.4s, %4.s[0]          \n"  // scale
+      "fmul       v4.4s, v2.4s, %4.s[0]          \n"
+      "fmla       v5.4s, v1.4s, v1.4s            \n"  // sum of squares
+      "fmla       v6.4s, v2.4s, v2.4s            \n"
+      "st1        {v3.4s, v4.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      "faddp      v5.4s, v5.4s, v6.4s            \n"
+      "faddp      v5.4s, v5.4s, v5.4s            \n"
+      "faddp      %3.4s, v5.4s, v5.4s            \n"  // sum
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width),                                  // %2
+        "=w"(fsum)                                    // %3
+      : "w"(scale)                                    // %4
+      : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
+  return fsum;
+}
+
+void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.4s, v2.4s}, [%0], #32      \n"  // load 8 samples
+      "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
+      "fmul       v1.4s, v1.4s, %3.s[0]          \n"  // scale
+      "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // scale
+      "st1        {v1.4s, v2.4s}, [%1], #32      \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src),                                    // %0
+        "+r"(dst),                                    // %1
+        "+r"(width)                                   // %2
+      : "w"(scale)                                    // %3
+      : "cc", "memory", "v1", "v2");
+}
+
+// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
+void GaussCol_NEON(const uint16_t* src0,
+                   const uint16_t* src1,
+                   const uint16_t* src2,
+                   const uint16_t* src3,
+                   const uint16_t* src4,
+                   uint32_t* dst,
+                   int width) {
+  asm volatile(
+      "movi       v6.8h, #4                      \n"  // constant 4
+      "movi       v7.8h, #6                      \n"  // constant 6
+
+      "1:                                        \n"
+      "ld1        {v1.8h}, [%0], #16             \n"  // load 8 samples, 5 rows
+      "ld1        {v2.8h}, [%4], #16             \n"
+      "uaddl      v0.4s, v1.4h, v2.4h            \n"  // * 1
+      "uaddl2     v1.4s, v1.8h, v2.8h            \n"  // * 1
+      "ld1        {v2.8h}, [%1], #16             \n"
+      "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "ld1        {v2.8h}, [%2], #16             \n"
+      "umlal      v0.4s, v2.4h, v7.4h            \n"  // * 6
+      "umlal2     v1.4s, v2.8h, v7.8h            \n"  // * 6
+      "ld1        {v2.8h}, [%3], #16             \n"
+      "umlal      v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal2     v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
+      "st1        {v0.4s,v1.4s}, [%5], #32       \n"  // store 8 samples
+      "b.gt       1b                             \n"
+      : "+r"(src0),                                   // %0
+        "+r"(src1),                                   // %1
+        "+r"(src2),                                   // %2
+        "+r"(src3),                                   // %3
+        "+r"(src4),                                   // %4
+        "+r"(dst),                                    // %5
+        "+r"(width)                                   // %6
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v6", "v7");
+}
+
+// filter 5 adjacent samples with 1, 4, 6, 4, 1 coefficients to produce 1 sample.
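// The pair is a separable 5x5 Gaussian: GaussCol_NEON above applies the
// vertical 1-4-6-4-1 taps, GaussRow_NEON below the horizontal ones, and the
// combined weight 16 * 16 = 256 is removed by the rounded shift. A scalar
// sketch (the *_sketch names are illustrative, not libyuv's C fallbacks):
#include <stdint.h>
static void GaussCol_sketch(const uint16_t* s0, const uint16_t* s1,
                            const uint16_t* s2, const uint16_t* s3,
                            const uint16_t* s4, uint32_t* dst, int width) {
  for (int i = 0; i < width; ++i)
    dst[i] = s0[i] + 4 * s1[i] + 6 * s2[i] + 4 * s3[i] + s4[i];
}
static void GaussRow_sketch(const uint32_t* src, uint16_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t sum = src[i] + 4 * src[i + 1] + 6 * src[i + 2] +
                   4 * src[i + 3] + src[i + 4];
    sum = (sum + 128) >> 8;                 // uqrshrn: round, narrow by 256
    dst[i] = sum > 0xffffu ? 0xffff : (uint16_t)sum;
  }
}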
+void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "movi v6.4s, #4 \n" // constant 4 + "movi v7.4s, #6 \n" // constant 6 + + "1: \n" + "ld1 {v0.4s,v1.4s,v2.4s}, [%0], %6 \n" // load 12 source samples + "add v0.4s, v0.4s, v1.4s \n" // * 1 + "add v1.4s, v1.4s, v2.4s \n" // * 1 + "ld1 {v2.4s,v3.4s}, [%2], #32 \n" + "mla v0.4s, v2.4s, v7.4s \n" // * 6 + "mla v1.4s, v3.4s, v7.4s \n" // * 6 + "ld1 {v2.4s,v3.4s}, [%1], #32 \n" + "ld1 {v4.4s,v5.4s}, [%3], #32 \n" + "add v2.4s, v2.4s, v4.4s \n" // add rows for * 4 + "add v3.4s, v3.4s, v5.4s \n" + "mla v0.4s, v2.4s, v6.4s \n" // * 4 + "mla v1.4s, v3.4s, v6.4s \n" // * 4 + "subs %w5, %w5, #8 \n" // 8 processed per loop + "uqrshrn v0.4h, v0.4s, #8 \n" // round and pack + "uqrshrn2 v0.8h, v1.4s, #8 \n" + "st1 {v0.8h}, [%4], #16 \n" // store 8 samples + "b.gt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : "r"(32LL) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/row_win.cc b/libs/libvpx/third_party/libyuv/source/row_win.cc index 2a3da8969f1..5500d7f5a64 100644 --- a/libs/libvpx/third_party/libyuv/source/row_win.cc +++ b/libs/libvpx/third_party/libyuv/source/row_win.cc @@ -28,72 +28,71 @@ extern "C" { #if defined(_M_X64) // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; +#define READYUV422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 \ - xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ - xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ - u_buf += 4; \ - xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ - xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ - y_buf += 8; \ - xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ - a_buf += 8; +#define READYUVA422 \ + xmm0 = _mm_cvtsi32_si128(*(uint32_t*)u_buf); \ + xmm1 = _mm_cvtsi32_si128(*(uint32_t*)(u_buf + offset)); \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ + u_buf += 4; \ + xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ + y_buf += 8; \ + xmm5 = _mm_loadl_epi64((__m128i*)a_buf); \ + a_buf += 8; // Convert 8 pixels: 8 UV and 8 Y. 
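// What YUVTORGB below computes, written out in floating point for clarity.
// BT.601 studio-swing coefficients are assumed here; the macro does the
// same math in 6-bit fixed point, with the constant parts pre-folded into
// the kUVBiasB/G/R entries of yuvconstants:
#include <stdint.h>
static uint8_t ClampByte(float v) {
  return (uint8_t)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}
static void YuvPixel_sketch(uint8_t y, uint8_t u, uint8_t v, uint8_t* bgr) {
  float y1 = 1.164f * ((int)y - 16);                 // kYToRgb term
  bgr[0] = ClampByte(y1 + 2.018f * ((int)u - 128));  // B, kUVToB
  bgr[1] = ClampByte(y1 - 0.391f * ((int)u - 128)    // G, kUVToG
                        - 0.813f * ((int)v - 128));
  bgr[2] = ClampByte(y1 + 1.596f * ((int)v - 128));  // R, kUVToR
}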
-#define YUVTORGB(yuvconstants) \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm2 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ - xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ - xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ - xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ - xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ - xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ - xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ - xmm0 = _mm_adds_epi16(xmm0, xmm4); \ - xmm1 = _mm_adds_epi16(xmm1, xmm4); \ - xmm2 = _mm_adds_epi16(xmm2, xmm4); \ - xmm0 = _mm_srai_epi16(xmm0, 6); \ - xmm1 = _mm_srai_epi16(xmm1, 6); \ - xmm2 = _mm_srai_epi16(xmm2, 6); \ - xmm0 = _mm_packus_epi16(xmm0, xmm0); \ - xmm1 = _mm_packus_epi16(xmm1, xmm1); \ - xmm2 = _mm_packus_epi16(xmm2, xmm2); +#define YUVTORGB(yuvconstants) \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm2 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ + xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ + xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ + xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ + xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ + xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ + xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ + xmm0 = _mm_adds_epi16(xmm0, xmm4); \ + xmm1 = _mm_adds_epi16(xmm1, xmm4); \ + xmm2 = _mm_adds_epi16(xmm2, xmm4); \ + xmm0 = _mm_srai_epi16(xmm0, 6); \ + xmm1 = _mm_srai_epi16(xmm1, 6); \ + xmm2 = _mm_srai_epi16(xmm2, 6); \ + xmm0 = _mm_packus_epi16(xmm0, xmm0); \ + xmm1 = _mm_packus_epi16(xmm1, xmm1); \ + xmm2 = _mm_packus_epi16(xmm2, xmm2); // Store 8 ARGB values. 
-#define STOREARGB \ - xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ - xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ - xmm1 = _mm_loadu_si128(&xmm0); \ - xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ - xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ - _mm_storeu_si128((__m128i *)dst_argb, xmm0); \ - _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ - dst_argb += 32; - +#define STOREARGB \ + xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ + xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); \ + xmm1 = _mm_loadu_si128(&xmm0); \ + xmm0 = _mm_unpacklo_epi16(xmm0, xmm2); \ + xmm1 = _mm_unpackhi_epi16(xmm1, xmm2); \ + _mm_storeu_si128((__m128i*)dst_argb, xmm0); \ + _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ + dst_argb += 32; #if defined(HAS_I422TOARGBROW_SSSE3) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, +void I422ToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4; const __m128i xmm5 = _mm_set1_epi8(-1); - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUV422 YUVTORGB(yuvconstants) @@ -104,15 +103,15 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, #endif #if defined(HAS_I422ALPHATOARGBROW_SSSE3) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, +void I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width) { __m128i xmm0, xmm1, xmm2, xmm4, xmm5; - const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; + const ptrdiff_t offset = (uint8_t*)v_buf - (uint8_t*)u_buf; while (width > 0) { READYUVA422 YUVTORGB(yuvconstants) @@ -127,175 +126,143 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. -static const vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; +static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0, + 13, 65, 33, 0, 13, 65, 33, 0}; // JPeg full range. -static const vec8 kARGBToYJ = { - 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 -}; +static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0}; -static const vec8 kARGBToU = { - 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 -}; +static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0, + 112, -74, -38, 0, 112, -74, -38, 0}; -static const vec8 kARGBToUJ = { - 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 -}; +static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0, + 127, -84, -43, 0, 127, -84, -43, 0}; static const vec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, }; -static const vec8 kARGBToVJ = { - -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 -}; +static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0, + -20, -107, 127, 0, -20, -107, 127, 0}; // vpshufb for vphaddw + vpackuswb packed to shorts. 
static const lvec8 kShufARGBToUV_AVX = { - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, - 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15 -}; + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15, + 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15}; // Constants for BGRA. -static const vec8 kBGRAToY = { - 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 -}; +static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13, + 0, 33, 65, 13, 0, 33, 65, 13}; -static const vec8 kBGRAToU = { - 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 -}; +static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112, + 0, -38, -74, 112, 0, -38, -74, 112}; -static const vec8 kBGRAToV = { - 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 -}; +static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18, + 0, 112, -94, -18, 0, 112, -94, -18}; // Constants for ABGR. -static const vec8 kABGRToY = { - 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 -}; +static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0, + 33, 65, 13, 0, 33, 65, 13, 0}; -static const vec8 kABGRToU = { - -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 -}; +static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0, + -38, -74, 112, 0, -38, -74, 112, 0}; -static const vec8 kABGRToV = { - 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 -}; +static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0, + 112, -94, -18, 0, 112, -94, -18, 0}; // Constants for RGBA. -static const vec8 kRGBAToY = { - 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 -}; +static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33, + 0, 13, 65, 33, 0, 13, 65, 33}; -static const vec8 kRGBAToU = { - 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 -}; +static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38, + 0, 112, -74, -38, 0, 112, -74, -38}; -static const vec8 kRGBAToV = { - 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 -}; +static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112, + 0, -18, -94, 112, 0, -18, -94, 112}; -static const uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; +static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u}; // 7 bit fixed point 0.5. -static const vec16 kAddYJ64 = { - 64, 64, 64, 64, 64, 64, 64, 64 -}; +static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64}; -static const uvec8 kAddUV128 = { - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; +static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; -static const uvec16 kAddUVJ128 = { - 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u -}; +static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; // Shuffle table for converting RGB24 to ARGB. static const uvec8 kShuffleMaskRGB24ToARGB = { - 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u -}; + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u}; // Shuffle table for converting RAW to ARGB. 
-static const uvec8 kShuffleMaskRAWToARGB = { - 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u -}; +static const uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, + 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u}; // Shuffle table for converting RAW to RGB24. First 8. static const uvec8 kShuffleMaskRAWToRGB24_0 = { - 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Middle 8. static const uvec8 kShuffleMaskRAWToRGB24_1 = { - 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleMaskRAWToRGB24_2 = { - 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, - 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u -}; + 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u, + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RGB24. static const uvec8 kShuffleMaskARGBToRGB24 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGB to RAW. static const uvec8 kShuffleMaskARGBToRAW = { - 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u -}; + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u}; // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4 static const uvec8 kShuffleMaskARGBToRGB24_0 = { - 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u -}; + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u}; // YUY2 shuf 16 Y to 32 Y. -static const lvec8 kShuffleYUY2Y = { - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 -}; +static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, + 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4, + 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}; // YUY2 shuf 8 UV to 16 UV. -static const lvec8 kShuffleYUY2UV = { - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15, - 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 -}; +static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, + 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7, + 5, 7, 9, 11, 9, 11, 13, 15, 13, 15}; // UYVY shuf 16 Y to 32 Y. -static const lvec8 kShuffleUYVYY = { - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 -}; +static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, + 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5, + 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}; // UYVY shuf 8 UV to 16 UV. -static const lvec8 kShuffleUYVYUV = { - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14, - 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 -}; +static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, + 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6, + 4, 6, 8, 10, 8, 10, 12, 14, 12, 14}; // NV21 shuf 8 VU to 16 UV. static const lvec8 kShuffleNV21 = { - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, - 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, + 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6, }; // Duplicates gray value 3 times and fills in alpha opaque. 
-__declspec(naked) -void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { +__declspec(naked) void J400ToARGBRow_SSE2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 convertloop: @@ -318,13 +285,14 @@ void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) { #ifdef HAS_J400TOARGBROW_AVX2 // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) -void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_y - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 +__declspec(naked) void J400ToARGBRow_AVX2(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: @@ -348,13 +316,14 @@ void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width) { } #endif // HAS_J400TOARGBROW_AVX2 -__declspec(naked) -void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_rgb24 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb24 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRGB24ToARGB @@ -364,17 +333,17 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -386,14 +355,14 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) { } } -__declspec(naked) -void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, - int width) { +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 pslld xmm5, 24 movdqa xmm4, xmmword ptr kShuffleMaskRAWToARGB @@ -403,17 +372,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, movdqu xmm3, [eax + 32] lea eax, [eax + 48] movdqa xmm2, xmm3 - palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} + palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} pshufb xmm2, xmm4 por xmm2, xmm5 - 
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} + palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} pshufb xmm0, xmm4 movdqu [edx + 32], xmm2 por xmm0, xmm5 pshufb xmm1, xmm4 movdqu [edx], xmm0 por xmm1, xmm5 - palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} + palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} pshufb xmm3, xmm4 movdqu [edx + 16], xmm1 por xmm3, xmm5 @@ -425,11 +394,12 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, } } -__declspec(naked) -void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { +__declspec(naked) void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, + uint8_t* dst_rgb24, + int width) { __asm { - mov eax, [esp + 4] // src_raw - mov edx, [esp + 8] // dst_rgb24 + mov eax, [esp + 4] // src_raw + mov edx, [esp + 8] // dst_rgb24 mov ecx, [esp + 12] // width movdqa xmm3, xmmword ptr kShuffleMaskRAWToRGB24_0 movdqa xmm4, xmmword ptr kShuffleMaskRAWToRGB24_1 @@ -460,9 +430,9 @@ void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) { // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. -__declspec(naked) -void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -470,33 +440,33 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green psllw xmm4, 10 psrlw xmm4, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgr565 + movdqu xmm0, [eax] // fetch 8 pixels of bgr565 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - pand xmm1, xmm3 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits - pmulhuw xmm1, xmm5 // * (256 + 8) - pmulhuw xmm2, xmm5 // * (256 + 8) + pand xmm1, xmm3 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits + pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB - pand xmm0, xmm4 // G in middle 6 bits - pmulhuw xmm0, xmm6 // << 5 * (256 + 4) - por xmm0, xmm7 // AG + por xmm1, xmm2 // RB + pand xmm0, xmm4 // G in middle 6 bits + pmulhuw xmm0, xmm6 // << 5 * (256 + 4) + por xmm0, xmm7 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -516,9 +486,9 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, // v * 256 + v * 8 // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 -__declspec(naked) -void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, - int width) { +__declspec(naked) void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -526,32 +496,32 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, 
uint8* dst_argb, mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green vpsllw ymm4, ymm4, 10 vpsrlw ymm4, ymm4, 5 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_rgb565 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 - vpand ymm1, ymm0, ymm3 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 - vpor ymm1, ymm1, ymm2 // RB - vpand ymm0, ymm0, ymm4 // G in middle 6 bits - vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) - vpor ymm0, ymm0, ymm7 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -567,9 +537,9 @@ void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, #endif // HAS_RGB565TOARGBROW_AVX2 #ifdef HAS_ARGB1555TOARGBROW_AVX2 -__declspec(naked) -void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits vmovd xmm5, eax @@ -577,33 +547,33 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits vmovd xmm6, eax vbroadcastss ymm6, xmm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red vpsllw ymm3, ymm3, 11 - vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha vpsllw ymm7, ymm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 - vpsllw ymm1, ymm0, 1 // R in upper 5 bits - vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits vpand ymm1, ymm1, ymm3 - vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) - vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) vpsllw ymm1, ymm1, 8 
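// Aside on the "* (256 + 8)" comments in these 565/1555 expanders: widening
// a 5-bit channel v to 8 bits is v * 255 / 31, which bit replication
// (v << 3) | (v >> 2) matches exactly; with v parked in the top 5 bits of a
// 16-bit lane, pmulhuw by 0x0108 computes the same value. Scalar sketch:
#include <stdint.h>
static uint8_t Expand5To8_sketch(uint8_t v) {  // v in [0, 31]
  return (uint8_t)((v << 3) | (v >> 2));       // == ((v << 11) * 0x0108) >> 16
}
static uint8_t Expand6To8_sketch(uint8_t g) {  // RGB565 green, [0, 63]
  return (uint8_t)((g << 2) | (g >> 4));       // the 0x2080 multiplier case
}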
- vpor ymm1, ymm1, ymm2 // RB - vpsraw ymm2, ymm0, 8 // A - vpand ymm0, ymm0, ymm4 // G in middle 5 bits - vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) vpand ymm2, ymm2, ymm7 - vpor ymm0, ymm0, ymm2 // AG - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm1, ymm1, 0xd8 vpunpckhbw ymm2, ymm1, ymm0 vpunpcklbw ymm1, ymm1, ymm0 @@ -619,29 +589,29 @@ void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, #endif // HAS_ARGB1555TOARGBROW_AVX2 #ifdef HAS_ARGB4444TOARGBROW_AVX2 -__declspec(naked) -void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f vmovd xmm4, eax vbroadcastss ymm4, xmm4 - vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 - vpand ymm2, ymm0, ymm5 // mask high nibbles - vpand ymm0, ymm0, ymm4 // mask low nibbles + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles vpsrlw ymm3, ymm2, 4 vpsllw ymm1, ymm0, 4 vpor ymm2, ymm2, ymm3 vpor ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm0, ymm0, 0xd8 // mutate for unpack vpermq ymm2, ymm2, 0xd8 vpunpckhbw ymm1, ymm0, ymm2 vpunpcklbw ymm0, ymm0, ymm2 @@ -657,9 +627,9 @@ void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, #endif // HAS_ARGB4444TOARGBROW_AVX2 // 24 instructions -__declspec(naked) -void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x01080108 // generate multiplier to repeat 5 bits movd xmm5, eax @@ -667,36 +637,36 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits movd xmm6, eax pshufd xmm6, xmm6, 0 - pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red + pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red psllw xmm3, 11 - movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green + movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green psrlw xmm4, 6 - pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha + pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha psllw xmm7, 8 - mov eax, [esp + 4] // src_argb1555 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of 1555 + movdqu xmm0, [eax] // fetch 8 pixels of 1555 movdqa xmm1, xmm0 movdqa xmm2, xmm0 - psllw xmm1, 1 // R in upper 5 bits - psllw xmm2, 11 // B in upper 5 bits + psllw xmm1, 1 // R in upper 5 bits + psllw xmm2, 11 // B in upper 5 bits pand xmm1, xmm3 - pmulhuw xmm2, xmm5 // * (256 + 8) - pmulhuw xmm1, xmm5 // * (256 + 8) + pmulhuw xmm2, xmm5 // * (256 + 
8) + pmulhuw xmm1, xmm5 // * (256 + 8) psllw xmm1, 8 - por xmm1, xmm2 // RB + por xmm1, xmm2 // RB movdqa xmm2, xmm0 - pand xmm0, xmm4 // G in middle 5 bits - psraw xmm2, 8 // A - pmulhuw xmm0, xmm6 // << 6 * (256 + 8) + pand xmm0, xmm4 // G in middle 5 bits + psraw xmm2, 8 // A + pmulhuw xmm0, xmm6 // << 6 * (256 + 8) pand xmm2, xmm7 - por xmm0, xmm2 // AG + por xmm0, xmm2 // AG movdqa xmm2, xmm1 punpcklbw xmm1, xmm0 punpckhbw xmm2, xmm0 @@ -710,26 +680,26 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, } // 18 instructions. -__declspec(naked) -void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, - int width) { +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __asm { mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f movd xmm4, eax pshufd xmm4, xmm4, 0 - movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles + movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles pslld xmm5, 4 - mov eax, [esp + 4] // src_argb4444 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax sub edx, eax convertloop: - movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 + movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 movdqa xmm2, xmm0 - pand xmm0, xmm4 // mask low nibbles - pand xmm2, xmm5 // mask high nibbles + pand xmm0, xmm4 // mask low nibbles + pand xmm2, xmm5 // mask high nibbles movdqa xmm1, xmm0 movdqa xmm3, xmm2 psllw xmm1, 4 @@ -748,37 +718,38 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } } -__declspec(naked) -void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -786,37 +757,38 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) 
{ } } -__declspec(naked) -void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW convertloop: - movdqu xmm0, [eax] // fetch 16 pixels of argb + movdqu xmm0, [eax] // fetch 16 pixels of argb movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] lea eax, [eax + 64] - pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB pshufb xmm1, xmm6 pshufb xmm2, xmm6 pshufb xmm3, xmm6 - movdqa xmm4, xmm1 // 4 bytes from 1 for 0 - psrldq xmm1, 4 // 8 bytes from 1 - pslldq xmm4, 12 // 4 bytes from 1 for 0 - movdqa xmm5, xmm2 // 8 bytes from 2 for 1 - por xmm0, xmm4 // 4 bytes from 1 for 0 - pslldq xmm5, 8 // 8 bytes from 2 for 1 + movdqa xmm4, xmm1 // 4 bytes from 1 for 0 + psrldq xmm1, 4 // 8 bytes from 1 + pslldq xmm4, 12 // 4 bytes from 1 for 0 + movdqa xmm5, xmm2 // 8 bytes from 2 for 1 + por xmm0, xmm4 // 4 bytes from 1 for 0 + pslldq xmm5, 8 // 8 bytes from 2 for 1 movdqu [edx], xmm0 // store 0 - por xmm1, xmm5 // 8 bytes from 2 for 1 - psrldq xmm2, 8 // 4 bytes from 2 - pslldq xmm3, 4 // 12 bytes from 3 for 2 - por xmm2, xmm3 // 12 bytes from 3 for 2 - movdqu [edx + 16], xmm1 // store 1 - movdqu [edx + 32], xmm2 // store 2 + por xmm1, xmm5 // 8 bytes from 2 for 1 + psrldq xmm2, 8 // 4 bytes from 2 + pslldq xmm3, 4 // 12 bytes from 3 for 2 + por xmm2, xmm3 // 12 bytes from 3 for 2 + movdqu [edx + 16], xmm1 // store 1 + movdqu [edx + 32], xmm2 // store 2 lea edx, [edx + 48] sub ecx, 16 jg convertloop @@ -824,33 +796,34 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -861,41 +834,42 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, 
uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - movd xmm6, [esp + 12] // dither4 + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 mov ecx, [esp + 16] // width - punpcklbw xmm6, xmm6 // make dither 16 bytes + punpcklbw xmm6, xmm6 // make dither 16 bytes movdqa xmm7, xmm6 punpcklwd xmm6, xmm6 punpckhwd xmm7, xmm7 - pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f psrld xmm3, 27 - pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 psrld xmm4, 26 pslld xmm4, 5 - pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 pslld xmm5, 11 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - paddusb xmm0, xmm6 // add dither - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - pslld xmm0, 8 // R - psrld xmm1, 3 // B - psrld xmm2, 5 // G - psrad xmm0, 16 // R - pand xmm1, xmm3 // B - pand xmm2, xmm4 // G - pand xmm0, xmm5 // R - por xmm1, xmm2 // BG - por xmm0, xmm1 // BGR + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 @@ -907,39 +881,40 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, } #ifdef HAS_ARGBTORGB565DITHERROW_AVX2 -__declspec(naked) -void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, - const uint32 dither4, int width) { +__declspec(naked) void ARGBToRGB565DitherRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + const uint32_t dither4, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb vbroadcastss xmm6, [esp + 12] // dither4 - mov ecx, [esp + 16] // width - vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + mov ecx, [esp + 16] // width + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes vpermq ymm6, ymm6, 0xd8 vpunpcklwd ymm6, ymm6, ymm6 - vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpaddusb ymm0, ymm0, ymm6 // add dither - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, 
ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -950,37 +925,38 @@ void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, #endif // HAS_ARGBTORGB565DITHERROW_AVX2 // TODO(fbarchard): Improve sign extension/packing. -__declspec(naked) -void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0x0000001f + pcmpeqb xmm4, xmm4 // generate mask 0x0000001f psrld xmm4, 27 - movdqa xmm5, xmm4 // generate mask 0x000003e0 + movdqa xmm5, xmm4 // generate mask 0x000003e0 pslld xmm5, 5 - movdqa xmm6, xmm4 // generate mask 0x00007c00 + movdqa xmm6, xmm4 // generate mask 0x00007c00 pslld xmm6, 10 - pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 + pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 pslld xmm7, 15 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb - movdqa xmm1, xmm0 // B - movdqa xmm2, xmm0 // G - movdqa xmm3, xmm0 // R - psrad xmm0, 16 // A - psrld xmm1, 3 // B - psrld xmm2, 6 // G - psrld xmm3, 9 // R - pand xmm0, xmm7 // A - pand xmm1, xmm4 // B - pand xmm2, xmm5 // G - pand xmm3, xmm6 // R - por xmm0, xmm1 // BA - por xmm2, xmm3 // GR - por xmm0, xmm2 // BGRA + movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + movdqa xmm3, xmm0 // R + psrad xmm0, 16 // A + psrld xmm1, 3 // B + psrld xmm2, 6 // G + psrld xmm3, 9 // R + pand xmm0, xmm7 // A + pand xmm1, xmm4 // B + pand xmm2, xmm5 // G + pand xmm3, xmm6 // R + por xmm0, xmm1 // BA + por xmm2, xmm3 // GR + por xmm0, xmm2 // BGRA packssdw xmm0, xmm0 lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 @@ -991,22 +967,23 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } } -__declspec(naked) -void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 psllw xmm4, 12 - movdqa xmm3, xmm4 // generate mask 0x00f000f0 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 psrlw xmm3, 8 convertloop: - movdqu xmm0, [eax] // fetch 4 pixels of argb + movdqu xmm0, [eax] // fetch 4 pixels of argb movdqa xmm1, xmm0 - pand xmm0, xmm3 // low nibble - pand xmm1, xmm4 // high nibble + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble psrld xmm0, 4 psrld xmm1, 8 por xmm0, xmm1 @@ -1021,33 +998,34 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) -void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width - vpcmpeqb ymm3, ymm3, 
ymm3 // generate mask 0x0000001f + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f vpsrld ymm3, ymm3, 27 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm2, ymm0, 5 // G - vpsrld ymm1, ymm0, 3 // B - vpsrld ymm0, ymm0, 8 // R - vpand ymm2, ymm2, ymm4 // G - vpand ymm1, ymm1, ymm3 // B - vpand ymm0, ymm0, ymm5 // R - vpor ymm1, ymm1, ymm2 // BG - vpor ymm0, ymm0, ymm1 // BGR + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1058,36 +1036,37 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) -void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // width vpcmpeqb ymm4, ymm4, ymm4 - vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f - vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 - vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 - vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 vpslld ymm7, ymm7, 15 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpsrld ymm3, ymm0, 9 // R - vpsrld ymm2, ymm0, 6 // G - vpsrld ymm1, ymm0, 3 // B - vpsrad ymm0, ymm0, 16 // A - vpand ymm3, ymm3, ymm6 // R - vpand ymm2, ymm2, ymm5 // G - vpand ymm1, ymm1, ymm4 // B - vpand ymm0, ymm0, ymm7 // A - vpor ymm0, ymm0, ymm1 // BA - vpor ymm2, ymm2, ymm3 // GR - vpor ymm0, ymm0, ymm2 // BGRA + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA vpackssdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1098,27 +1077,28 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) -void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, 
int width) { +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, + uint8_t* dst_rgb, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_rgb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // width - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 convertloop: - vmovdqu ymm0, [eax] // fetch 8 pixels of argb - vpand ymm1, ymm0, ymm4 // high nibble - vpand ymm0, ymm0, ymm3 // low nibble + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpand ymm1, ymm0, ymm4 // high nibble + vpand ymm0, ymm0, ymm3 // low nibble vpsrld ymm1, ymm1, 8 vpsrld ymm0, ymm0, 4 vpor ymm0, ymm0, ymm1 vpackuswb ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] - vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 + vmovdqu [edx], xmm0 // store 8 pixels of ARGB4444 lea edx, [edx + 16] sub ecx, 8 jg convertloop @@ -1129,12 +1109,13 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int width) { #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) -void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToY movdqa xmm5, xmmword ptr kAddY16 @@ -1164,12 +1145,13 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) -void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -1200,17 +1182,16 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYROW_AVX2 // vpermd for vphaddw + vpackuswb vpermd. -static const lvec32 kPermdARGBToY_AVX = { - 0, 4, 1, 5, 2, 6, 3, 7 -}; +static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7}; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) -void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToY vbroadcastf128 ymm5, xmmword ptr kAddY16 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1244,12 +1225,13 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
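// The two luma flavours above as scalar math, straight from the kARGBToY
// and kARGBToYJ tables (7-bit fixed point; names are illustrative):
#include <stdint.h>
static uint8_t RGBToY_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)(((33 * r + 65 * g + 13 * b) >> 7) + 16);  // BT.601, kAddY16
}
static uint8_t RGBToYJ_sketch(uint8_t r, uint8_t g, uint8_t b) {
  return (uint8_t)((38 * r + 75 * g + 15 * b + 64) >> 7);  // full range, kAddYJ64
}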
-__declspec(naked) -void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ARGBToYJRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ vbroadcastf128 ymm4, xmmword ptr kARGBToYJ vbroadcastf128 ymm5, xmmword ptr kAddYJ64 vmovdqu ymm6, ymmword ptr kPermdARGBToY_AVX @@ -1283,12 +1265,13 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void BGRAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kBGRAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1316,12 +1299,13 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void ABGRToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kABGRToY movdqa xmm5, xmmword ptr kAddY16 @@ -1349,12 +1333,13 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { +__declspec(naked) void RGBAToYRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_y */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kRGBAToY movdqa xmm5, xmmword ptr kAddY16 @@ -1382,24 +1367,26 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) { } } -__declspec(naked) -void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1423,9 +1410,9 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 
pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, it's 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1437,11 +1424,11 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1452,24 +1439,26 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUVJ128 movdqa xmm6, xmmword ptr kARGBToVJ movdqa xmm7, xmmword ptr kARGBToUJ - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1493,9 +1482,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, it's 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1510,9 +1499,9 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm1, 8 packsswb xmm0, xmm1 - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1524,24 +1513,26 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) -void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 -
subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1558,9 +1549,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, it's 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1574,9 +1565,9 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw vpaddb ymm0, ymm0, ymm5 // -> unsigned - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1590,24 +1581,26 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width vbroadcastf128 ymm5, xmmword ptr kAddUV128 vbroadcastf128 ymm6, xmmword ptr kARGBToV vbroadcastf128 ymm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 32x2 argb pixels to 16x1 */ + /* step 1 - subsample 32x2 argb pixels to 16x1 */ vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + 64] @@ -1624,9 +1617,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 // mutated by vshufps - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 32 different pixels, its 16 pixels of U and 16 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 32 different pixels, it's 16 pixels of U and 16 of V vpmaddubsw ymm1, ymm0, ymm7 // U vpmaddubsw ymm3, ymm2, ymm7 vpmaddubsw ymm0, ymm0, ymm6 // V @@ -1641,9 +1634,9 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpermq ymm0, ymm0, 0xd8 // For vpacksswb vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw - // step 3 - store 16 U and 16 V values - vextractf128 [edx], ymm0, 0 // U - vextractf128 [edx + edi], ymm0, 1 // V + // step 3 - store 16 U and 16 V values + vextractf128 [edx], ymm0, 0 // U + vextractf128 [edx + edi], ymm0, 1 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -1656,23 +1649,24 @@ void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVJROW_AVX2 -__declspec(naked) -void ARGBToUV444Row_SSSE3(const uint8*
src_argb0, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8_t* src_argb0, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_argb - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src_argb + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kARGBToV movdqa xmm7, xmmword ptr kARGBToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* convert to U and V */ - movdqu xmm0, [eax] // U + /* convert to U and V */ + movdqu xmm0, [eax] // U movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1688,7 +1682,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, paddb xmm0, xmm5 movdqu [edx], xmm0 - movdqu xmm0, [eax] // V + movdqu xmm0, [eax] // V movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] movdqu xmm3, [eax + 48] @@ -1713,24 +1707,26 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) -void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kBGRAToV movdqa xmm7, xmmword ptr kBGRAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1754,9 +1750,9 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, it's 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1768,11 +1764,11 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1783,24 +1779,26 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 +
12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kABGRToV movdqa xmm7, xmmword ptr kABGRToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1824,9 +1822,9 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, it's 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1838,11 +1836,11 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1853,24 +1851,26 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) -void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8_t* src_argb0, + int src_stride_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_argb - mov esi, [esp + 8 + 8] // src_stride_argb + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width movdqa xmm5, xmmword ptr kAddUV128 movdqa xmm6, xmmword ptr kRGBAToV movdqa xmm7, xmmword ptr kRGBAToU - sub edi, edx // stride from u to v + sub edi, edx // stride from u to v convertloop: - /* step 1 - subsample 16x2 argb pixels to 8x1 */ + /* step 1 - subsample 16x2 argb pixels to 8x1 */ movdqu xmm0, [eax] movdqu xmm4, [eax + esi] pavgb xmm0, xmm4 @@ -1894,9 +1894,9 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, shufps xmm4, xmm3, 0xdd pavgb xmm2, xmm4 - // step 2 - convert to U and V - // from here down is very similar to Y code except - // instead of 16 different pixels, its 8 pixels of U and 8 of V + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, it's 8 pixels of U and 8 of V movdqa xmm1, xmm0 movdqa xmm3, xmm2 pmaddubsw xmm0, xmm7 // U @@ -1908,11 +1908,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, psraw xmm0, 8 psraw xmm1, 8 packsswb xmm0, xmm1 - paddb xmm0, xmm5 // -> unsigned + paddb xmm0, xmm5 // -> unsigned - // step 3 - store 8 U and 8 V values - movlps qword ptr [edx], xmm0 // U - movhps qword ptr [edx + edi], xmm0 // V + // step 3 - store 8 U and 8 V values + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V lea edx, [edx + 8] sub ecx, 16 jg convertloop @@ -1925,109 +1925,95 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOYROW_SSSE3 // Read 16 UV from
444 -#define READYUV444_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* U */ \ - __asm vmovdqu xmm1, [esi + edi] /* V */ \ +#define READYUV444_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. -#define READYUV422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUV422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha. -#define READYUVA422_AVX2 __asm { \ - __asm vmovq xmm0, qword ptr [esi] /* U */ \ - __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ +#define READYUVA422_AVX2 \ + __asm { \ + __asm vmovq xmm0, qword ptr [esi] /* U */ \ + __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ __asm lea eax, [eax + 16] \ - __asm vmovdqu xmm5, [ebp] /* A */ \ + __asm vmovdqu xmm5, [ebp] /* A */ \ __asm vpermq ymm5, ymm5, 0xd8 \ - __asm lea ebp, [ebp + 16] \ - } - -// Read 4 UV from 411, upsample to 16 UV. -#define READYUV411_AVX2 __asm { \ - __asm vmovd xmm0, dword ptr [esi] /* U */ \ - __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ - __asm lea esi, [esi + 4] \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ - __asm vpermq ymm4, ymm4, 0xd8 \ - __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea ebp, [ebp + 16]} // Read 8 UV from NV12, upsample to 16 UV. -#define READNV12_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV12_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 UV from NV21, upsample to 16 UV. 
-#define READNV21_AVX2 __asm { \ - __asm vmovdqu xmm0, [esi] /* UV */ \ +#define READNV21_AVX2 \ + __asm { \ + __asm vmovdqu xmm0, [esi] /* UV */ \ __asm lea esi, [esi + 16] \ __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleNV21 \ - __asm vmovdqu xmm4, [eax] /* Y */ \ + __asm vmovdqu xmm4, [eax] /* Y */ \ __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. -#define READYUY2_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* YUY2 */ \ +#define READYUY2_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* YUY2 */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleYUY2Y \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV. -#define READUYVY_AVX2 __asm { \ - __asm vmovdqu ymm4, [eax] /* UYVY */ \ +#define READUYVY_AVX2 \ + __asm { \ + __asm vmovdqu ymm4, [eax] /* UYVY */ \ __asm vpshufb ymm4, ymm4, ymmword ptr kShuffleUYVYY \ - __asm vmovdqu ymm0, [eax] /* UV */ \ + __asm vmovdqu ymm0, [eax] /* UV */ \ __asm vpshufb ymm0, ymm0, ymmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 32] \ - } + __asm lea eax, [eax + 32]} // Convert 16 pixels: 16 UV and 16 Y. -#define YUVTORGB_AVX2(YuvConstants) __asm { \ +#define YUVTORGB_AVX2(YuvConstants) \ + __asm { \ __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ @@ -2036,68 +2022,67 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ __asm vpsubw ymm1, ymm3, ymm1 \ __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ - __asm vpsubw ymm0, ymm3, ymm0 \ - /* Step 2: Find Y contribution to 16 R,G,B values */ \ + __asm vpsubw ymm0, ymm3, ymm0 /* Step 2: Find Y contribution to 16 R,G,B values */ \ __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ - __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ - __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ - __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ + __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ + __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ + __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ __asm vpsraw ymm0, ymm0, 6 \ __asm vpsraw ymm1, ymm1, 6 \ __asm vpsraw ymm2, ymm2, 6 \ - __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ - __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ - __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ + __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ + __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ + __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ } // Store 16 ARGB values. 
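// Editorial sketch, not part of the patch: per pixel, the STOREARGB_AVX2
// macro below weaves the planar B, G, R and alpha vectors into libyuv's
// little-endian ARGB byte order. A scalar equivalent:
static void StoreArgbPixelSketch(uint8_t* dst_argb, int i, uint8_t b,
                                 uint8_t g, uint8_t r, uint8_t a) {
  dst_argb[4 * i + 0] = b;
  dst_argb[4 * i + 1] = g;
  dst_argb[4 * i + 2] = r;
  dst_argb[4 * i + 3] = a;
}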
-#define STOREARGB_AVX2 __asm { \ - __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ +#define STOREARGB_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* BG */ \ __asm vpermq ymm0, ymm0, 0xd8 \ - __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ + __asm vpunpcklbw ymm2, ymm2, ymm5 /* RA */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ - __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ + __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ + __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ __asm vmovdqu 0[edx], ymm1 \ __asm vmovdqu 32[edx], ymm0 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} // Store 16 RGBA values. -#define STORERGBA_AVX2 __asm { \ - __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ +#define STORERGBA_AVX2 \ + __asm { \ + __asm vpunpcklbw ymm1, ymm1, ymm2 /* GR */ \ __asm vpermq ymm1, ymm1, 0xd8 \ - __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ + __asm vpunpcklbw ymm2, ymm5, ymm0 /* AB */ \ __asm vpermq ymm2, ymm2, 0xd8 \ - __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ - __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ + __asm vpunpcklwd ymm0, ymm2, ymm1 /* ABGR first 8 pixels */ \ + __asm vpunpckhwd ymm1, ymm2, ymm1 /* ABGR next 8 pixels */ \ __asm vmovdqu [edx], ymm0 \ __asm vmovdqu [edx + 32], ymm1 \ - __asm lea edx, [edx + 64] \ - } + __asm lea edx, [edx + 64]} #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I422ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2119,21 +2104,21 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I422ALPHATOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB. -__declspec(naked) -void I422AlphaToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2162,25 +2147,25 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_I444TOARGBROW_AVX2 // 16 pixels // 16 UV values with 16 Y producing 16 ARGB (64 bytes). 
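// Editorial sketch, not part of the patch: the YUVTORGB_AVX2 macro defined
// above is a 6-bit fixed-point evaluation of the usual BT.601 studio-swing
// matrix, with the constants supplied through the yuvconstants tables. In
// floating point, the conversion it approximates is:
//   B = 1.164 * (Y - 16) + 2.018 * (U - 128)
//   G = 1.164 * (Y - 16) - 0.391 * (U - 128) - 0.813 * (V - 128)
//   R = 1.164 * (Y - 16) + 1.596 * (V - 128)
// followed by saturation to 0..255, which vpsraw/vpackuswb perform in SIMD.
static uint8_t ClampToByte(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static void YuvToRgbSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  float luma = 1.164f * ((int)y - 16);
  *b = ClampToByte((int)(luma + 2.018f * ((int)u - 128)));
  *g = ClampToByte((int)(luma - 0.391f * ((int)u - 128) -
                         0.813f * ((int)v - 128)));
  *r = ClampToByte((int)(luma + 1.596f * ((int)v - 128)));
}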
-__declspec(naked) -void I444ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV444_AVX2 YUVTORGB_AVX2(ebx) @@ -2198,64 +2183,24 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I444TOARGBROW_AVX2 -#ifdef HAS_I411TOARGBROW_AVX2 -// 16 pixels -// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void I411ToARGBRow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U - mov edi, [esp + 12 + 12] // V - mov edx, [esp + 12 + 16] // abgr - mov ebx, [esp + 12 + 20] // yuvconstants - mov ecx, [esp + 12 + 24] // width - sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha - - convertloop: - READYUV411_AVX2 - YUVTORGB_AVX2(ebx) - STOREARGB_AVX2 - - sub ecx, 16 - jg convertloop - - pop ebx - pop edi - pop esi - vzeroupper - ret - } -} -#endif // HAS_I411TOARGBROW_AVX2 - #ifdef HAS_NV12TOARGBROW_AVX2 // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) -void NV12ToARGBRow_AVX2(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV12_AVX2 @@ -2276,21 +2221,21 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. // 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
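// Editorial sketch, not part of the patch: NV21 differs from NV12 only in the
// byte order of the interleaved chroma plane (V first), which is why READNV21
// needs the kShuffleNV21 swizzle while READNV12 does not. For chroma pair i,
// shared by two horizontally adjacent pixels:
static void ReadNV21PairSketch(const uint8_t* vu_buf, int i,
                               uint8_t* u, uint8_t* v) {
  *v = vu_buf[2 * i + 0];  // V leads in NV21
  *u = vu_buf[2 * i + 1];  // in NV12 the order would be U then V
}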
-__declspec(naked) -void NV21ToARGBRow_AVX2(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_AVX2( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READNV21_AVX2 @@ -2311,18 +2256,18 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_YUY2TOARGBROW_AVX2 // 16 pixels. // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_AVX2( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUY2_AVX2 @@ -2342,18 +2287,18 @@ void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, #ifdef HAS_UYVYTOARGBROW_AVX2 // 16 pixels. // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). -__declspec(naked) -void UYVYToARGBRow_AVX2(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_AVX2( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READUYVY_AVX2 @@ -2373,25 +2318,25 @@ void UYVYToARGBRow_AVX2(const uint8* src_uyvy, #ifdef HAS_I422TORGBAROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). 
-__declspec(naked) -void I422ToRGBARow_AVX2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_AVX2( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha convertloop: READYUV422_AVX2 @@ -2415,100 +2360,83 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, // Allows a conversion with half size scaling. // Read 8 UV from 444. -#define READYUV444 __asm { \ +#define READYUV444 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. -#define READYUV422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUV422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. -#define READYUVA422 __asm { \ - __asm movd xmm0, [esi] /* U */ \ - __asm movd xmm1, [esi + edi] /* V */ \ +#define READYUVA422 \ + __asm { \ + __asm movd xmm0, [esi] /* U */ \ + __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] /* Y */ \ + __asm punpcklbw xmm0, xmm1 /* UV */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm movq xmm4, qword ptr [eax] /* Y */ \ __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ - __asm movq xmm5, qword ptr [ebp] /* A */ \ - __asm lea ebp, [ebp + 8] \ - } - -// Read 2 UV from 411, upsample to 8 UV. -// drmemory fails with memory fault if pinsrw used. libyuv bug: 525 -// __asm pinsrw xmm0, [esi], 0 /* U */ -// __asm pinsrw xmm1, [esi + edi], 0 /* V */ -#define READYUV411_EBX __asm { \ - __asm movzx ebx, word ptr [esi] /* U */ \ - __asm movd xmm0, ebx \ - __asm movzx ebx, word ptr [esi + edi] /* V */ \ - __asm movd xmm1, ebx \ - __asm lea esi, [esi + 2] \ - __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ - __asm movq xmm4, qword ptr [eax] \ - __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm movq xmm5, qword ptr [ebp] /* A */ \ + __asm lea ebp, [ebp + 8]} // Read 4 UV from NV12, upsample to 8 UV. 
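// Editorial sketch, not part of the patch: in the 4:2:2 and NV12 readers
// around this point, one U/V sample covers two adjacent pixels, so the
// punpcklwd "UVUV (upsample)" step simply duplicates each chroma pair. A
// scalar equivalent for chroma index i:
static void UpsampleUV422Sketch(const uint8_t* u_buf, const uint8_t* v_buf,
                                int i, uint8_t uv_out[4]) {
  uv_out[0] = u_buf[i];  // pixel 2*i
  uv_out[1] = v_buf[i];
  uv_out[2] = u_buf[i];  // pixel 2*i + 1 reuses the same sample
  uv_out[3] = v_buf[i];
}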
-#define READNV12 __asm { \ +#define READNV12 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ - __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 VU from NV21, upsample to 8 UV. -#define READNV21 __asm { \ +#define READNV21 \ + __asm { \ __asm movq xmm0, qword ptr [esi] /* UV */ \ __asm lea esi, [esi + 8] \ __asm pshufb xmm0, xmmword ptr kShuffleNV21 \ __asm movq xmm4, qword ptr [eax] \ __asm punpcklbw xmm4, xmm4 \ - __asm lea eax, [eax + 8] \ - } + __asm lea eax, [eax + 8]} // Read 4 YUY2 with 8 Y and upsample 4 UV to 8 UV. -#define READYUY2 __asm { \ - __asm movdqu xmm4, [eax] /* YUY2 */ \ +#define READYUY2 \ + __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Read 4 UYVY with 8 Y and upsample 4 UV to 8 UV. -#define READUYVY __asm { \ - __asm movdqu xmm4, [eax] /* UYVY */ \ +#define READUYVY \ + __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ - __asm movdqu xmm0, [eax] /* UV */ \ + __asm movdqu xmm0, [eax] /* UV */ \ __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ - __asm lea eax, [eax + 16] \ - } + __asm lea eax, [eax + 16]} // Convert 8 pixels: 8 UV and 8 Y. -#define YUVTORGB(YuvConstants) __asm { \ +#define YUVTORGB(YuvConstants) \ + __asm { \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm movdqa xmm3, xmm0 \ @@ -2522,129 +2450,125 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm psubw xmm2, xmm3 \ __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ - __asm paddsw xmm0, xmm4 /* B += Y */ \ - __asm paddsw xmm1, xmm4 /* G += Y */ \ - __asm paddsw xmm2, xmm4 /* R += Y */ \ + __asm paddsw xmm0, xmm4 /* B += Y */ \ + __asm paddsw xmm1, xmm4 /* G += Y */ \ + __asm paddsw xmm2, xmm4 /* R += Y */ \ __asm psraw xmm0, 6 \ __asm psraw xmm1, 6 \ __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. -#define STOREARGB __asm { \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm5 /* RA */ \ +#define STOREARGB \ + __asm { \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm5 /* RA */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm0 \ __asm movdqu 16[edx], xmm1 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 BGRA values. 
-#define STOREBGRA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm0 /* GB */ \ - __asm punpcklbw xmm5, xmm2 /* AR */ \ +#define STOREBGRA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm0 /* GB */ \ + __asm punpcklbw xmm5, xmm2 /* AR */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGBA values. -#define STORERGBA __asm { \ - __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ - __asm punpcklbw xmm1, xmm2 /* GR */ \ - __asm punpcklbw xmm5, xmm0 /* AB */ \ +#define STORERGBA \ + __asm { \ + __asm pcmpeqb xmm5, xmm5 /* generate 0xffffffff for alpha */ \ + __asm punpcklbw xmm1, xmm2 /* GR */ \ + __asm punpcklbw xmm5, xmm0 /* AB */ \ __asm movdqa xmm0, xmm5 \ - __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ - __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ + __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ + __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ __asm movdqu 0[edx], xmm5 \ __asm movdqu 16[edx], xmm0 \ - __asm lea edx, [edx + 32] \ - } + __asm lea edx, [edx + 32]} // Store 8 RGB24 values. -#define STORERGB24 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB24 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB24 */ \ - __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ - __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ - __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ - __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ - __asm lea edx, [edx + 24] \ - } + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB24 */ \ + __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ + __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ + __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ + __asm lea edx, [edx + 24]} // Store 8 RGB565 values. 
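// Editorial sketch, not part of the patch: the STORERGB565 macro below keeps
// the top 5, 6 and 5 bits of B, G and R, matching the masks 0x0000001f,
// 0x000007e0 and 0xfffff800 built in the I422ToRGB565 setup code. Per pixel:
static uint16_t PackRGB565Sketch(uint8_t b, uint8_t g, uint8_t r) {
  return (uint16_t)((b >> 3) | ((uint16_t)(g >> 2) << 5) |
                    ((uint16_t)(r >> 3) << 11));
}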
-#define STORERGB565 __asm { \ - /* Weave into RRGB */ \ - __asm punpcklbw xmm0, xmm1 /* BG */ \ - __asm punpcklbw xmm2, xmm2 /* RR */ \ +#define STORERGB565 \ + __asm {/* Weave into RRGB */ \ + __asm punpcklbw xmm0, xmm1 /* BG */ \ + __asm punpcklbw xmm2, xmm2 /* RR */ \ __asm movdqa xmm1, xmm0 \ - __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ - __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ \ - /* RRGB -> RGB565 */ \ - __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ - __asm movdqa xmm2, xmm0 /* G */ \ - __asm pslld xmm0, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm0, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm0, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm0, xmm3 /* BGR */ \ - __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ - __asm movdqa xmm2, xmm1 /* G */ \ - __asm pslld xmm1, 8 /* R */ \ - __asm psrld xmm3, 3 /* B */ \ - __asm psrld xmm2, 5 /* G */ \ - __asm psrad xmm1, 16 /* R */ \ - __asm pand xmm3, xmm5 /* B */ \ - __asm pand xmm2, xmm6 /* G */ \ - __asm pand xmm1, xmm7 /* R */ \ - __asm por xmm3, xmm2 /* BG */ \ - __asm por xmm1, xmm3 /* BGR */ \ + __asm punpcklwd xmm0, xmm2 /* BGRR first 4 pixels */ \ + __asm punpckhwd xmm1, xmm2 /* BGRR next 4 pixels */ /* RRGB -> RGB565 */ \ + __asm movdqa xmm3, xmm0 /* B first 4 pixels of argb */ \ + __asm movdqa xmm2, xmm0 /* G */ \ + __asm pslld xmm0, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm0, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm0, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm0, xmm3 /* BGR */ \ + __asm movdqa xmm3, xmm1 /* B next 4 pixels of argb */ \ + __asm movdqa xmm2, xmm1 /* G */ \ + __asm pslld xmm1, 8 /* R */ \ + __asm psrld xmm3, 3 /* B */ \ + __asm psrld xmm2, 5 /* G */ \ + __asm psrad xmm1, 16 /* R */ \ + __asm pand xmm3, xmm5 /* B */ \ + __asm pand xmm2, xmm6 /* G */ \ + __asm pand xmm1, xmm7 /* R */ \ + __asm por xmm3, xmm2 /* BG */ \ + __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ - __asm lea edx, [edx + 16] \ - } + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm lea edx, [edx + 16]} // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I444ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I444ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV444 @@ -2663,19 +2587,19 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
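// Editorial sketch, not part of the patch: RGB24 keeps libyuv's B, G, R byte
// order but drops the alpha byte, which is what the two pshufb tables in
// STORERGB24 above pack down to 3 bytes per pixel. Scalar equivalent:
static void StoreRGB24PixelSketch(uint8_t* dst_rgb24, int i, uint8_t b,
                                  uint8_t g, uint8_t r) {
  dst_rgb24[3 * i + 0] = b;
  dst_rgb24[3 * i + 1] = g;
  dst_rgb24[3 * i + 2] = r;
}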
-__declspec(naked) -void I422ToRGB24Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB24Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgb24, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -2701,30 +2625,30 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) -void I422ToRGB565Row_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb565_buf, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGB565Row_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* rgb565_buf, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate mask 0x0000001f + pcmpeqb xmm5, xmm5 // generate mask 0x0000001f psrld xmm5, 27 - pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 + pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 psrld xmm6, 26 pslld xmm6, 5 - pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 + pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 pslld xmm7, 11 convertloop: @@ -2744,25 +2668,25 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void I422ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUV422 @@ -2781,21 +2705,21 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB. 
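// Editorial sketch, not part of the patch: the Alpha variants below differ
// from plain I422ToARGB only in where the A channel comes from. READYUVA422
// loads it from a fourth plane (a_buf, via ebp) instead of the all-0xff
// constant that pcmpeqb xmm5 otherwise provides; per row that amounts to:
static void ApplyAlphaPlaneSketch(const uint8_t* a_buf, uint8_t* dst_argb,
                                  int width) {
  for (int i = 0; i < width; ++i) {
    dst_argb[4 * i + 3] = a_buf[i];  // replaces the implicit 0xff alpha
  }
}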
-__declspec(naked) -void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - const uint8* a_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422AlphaToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + const uint8_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U + mov eax, [esp + 16 + 4] // Y + mov esi, [esp + 16 + 8] // U mov edi, [esp + 16 + 12] // V mov ebp, [esp + 16 + 16] // A mov edx, [esp + 16 + 20] // argb @@ -2819,63 +2743,23 @@ void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, } } -// 8 pixels. -// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -// Similar to I420 but duplicate UV once more. -__declspec(naked) -void I411ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { - __asm { - push esi - push edi - push ebx - push ebp - mov eax, [esp + 16 + 4] // Y - mov esi, [esp + 16 + 8] // U - mov edi, [esp + 16 + 12] // V - mov edx, [esp + 16 + 16] // abgr - mov ebp, [esp + 16 + 20] // yuvconstants - mov ecx, [esp + 16 + 24] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - - convertloop: - READYUV411_EBX - YUVTORGB(ebp) - STOREARGB - - sub ecx, 8 - jg convertloop - - pop ebp - pop ebx - pop edi - pop esi - ret - } -} - // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV12ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* uv_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV12ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // UV + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV12 @@ -2893,21 +2777,21 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) -void NV21ToARGBRow_SSSE3(const uint8* y_buf, - const uint8* vu_buf, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void NV21ToARGBRow_SSSE3( + const uint8_t* y_buf, + const uint8_t* vu_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push ebx - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // VU + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // VU mov edx, [esp + 8 + 12] // argb mov ebx, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READNV21 @@ -2925,18 +2809,18 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). 
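// Editorial sketch, not part of the patch: YUY2 packs each pixel pair as
// [Y0 U Y1 V] while UYVY packs [U Y0 V Y1]; the READYUY2/READUYVY macros used
// by the rows below differ only in their pshufb tables. Scalar extraction for
// pixel pair i:
static void ReadYUY2PairSketch(const uint8_t* src_yuy2, int i, uint8_t* y0,
                               uint8_t* y1, uint8_t* u, uint8_t* v) {
  *y0 = src_yuy2[4 * i + 0];
  *u = src_yuy2[4 * i + 1];
  *y1 = src_yuy2[4 * i + 2];
  *v = src_yuy2[4 * i + 3];
}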
-__declspec(naked) -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void YUY2ToARGBRow_SSSE3( + const uint8_t* src_yuy2, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // yuy2 - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READYUY2 @@ -2953,18 +2837,18 @@ void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, // 8 pixels. // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). -__declspec(naked) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void UYVYToARGBRow_SSSE3( + const uint8_t* src_uyvy, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { __asm { push ebx - mov eax, [esp + 4 + 4] // uyvy - mov edx, [esp + 4 + 8] // argb + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb mov ebx, [esp + 4 + 12] // yuvconstants mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha convertloop: READUYVY @@ -2979,19 +2863,19 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, } } -__declspec(naked) -void I422ToRGBARow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* dst_rgba, - const struct YuvConstants* yuvconstants, - int width) { +__declspec(naked) void I422ToRGBARow_SSSE3( + const uint8_t* y_buf, + const uint8_t* u_buf, + const uint8_t* v_buf, + uint8_t* dst_rgba, + const struct YuvConstants* yuvconstants, + int width) { __asm { push esi push edi push ebx - mov eax, [esp + 12 + 4] // Y - mov esi, [esp + 12 + 8] // U + mov eax, [esp + 12 + 4] // Y + mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb mov ebx, [esp + 12 + 20] // yuvconstants @@ -3016,39 +2900,38 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). -__declspec(naked) -void I400ToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax pshufd xmm2, xmm2,0 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) movd xmm3, eax pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contribution to 8 G values. 
G = (y - 16) * 1.164 movq xmm0, qword ptr [eax] lea eax, [eax + 8] - punpcklbw xmm0, xmm0 // Y.Y + punpcklbw xmm0, xmm0 // Y.Y pmulhuw xmm0, xmm2 psubusw xmm0, xmm3 psrlw xmm0, 6 - packuswb xmm0, xmm0 // G + packuswb xmm0, xmm0 // G - // Step 2: Weave into ARGB - punpcklbw xmm0, xmm0 // GG + // Step 2: Weave into ARGB + punpcklbw xmm0, xmm0 // GG movdqa xmm1, xmm0 - punpcklwd xmm0, xmm0 // BGRA first 4 pixels - punpckhwd xmm1, xmm1 // BGRA next 4 pixels + punpcklwd xmm0, xmm0 // BGRA first 4 pixels + punpckhwd xmm1, xmm1 // BGRA next 4 pixels por xmm0, xmm4 por xmm1, xmm4 movdqu [edx], xmm0 @@ -3064,41 +2947,40 @@ void I400ToARGBRow_SSE2(const uint8* y_buf, #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) -void I400ToARGBRow_AVX2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* rgb_buf, + int width) { __asm { - mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) + mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax vbroadcastss ymm2, xmm2 - mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) + mov eax, 0x04880488 // 0488 = 1160 = round(1.164 * 64 * 16) vmovd xmm3, eax vbroadcastss ymm3, xmm3 - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xff000000 vpslld ymm4, ymm4, 24 - mov eax, [esp + 4] // Y - mov edx, [esp + 8] // rgb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // Y + mov edx, [esp + 8] // rgb + mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 vmovdqu xmm0, [eax] lea eax, [eax + 16] - vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates - vpunpcklbw ymm0, ymm0, ymm0 // Y.Y + vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates + vpunpcklbw ymm0, ymm0, ymm0 // Y.Y vpmulhuw ymm0, ymm0, ymm2 vpsubusw ymm0, ymm0, ymm3 vpsrlw ymm0, ymm0, 6 - vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 + vpackuswb ymm0, ymm0, ymm0 // G. still mutated: 3120 - // TODO(fbarchard): Weave alpha with unpack. - // Step 2: Weave into ARGB - vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates + // TODO(fbarchard): Weave alpha with unpack. + // Step 2: Weave into ARGB + vpunpcklbw ymm1, ymm0, ymm0 // GG - mutates vpermq ymm1, ymm1, 0xd8 - vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels - vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels + vpunpcklwd ymm0, ymm1, ymm1 // GGGG first 8 pixels + vpunpckhwd ymm1, ymm1, ymm1 // GGGG next 8 pixels vpor ymm0, ymm0, ymm4 vpor ymm1, ymm1, ymm4 vmovdqu [edx], ymm0 @@ -3114,16 +2996,16 @@ void I400ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. -static const uvec8 kShuffleMirror = { - 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; // TODO(fbarchard): Replace lea with -16 offset.
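// Editorial sketch, not part of the patch: the MirrorRow_* kernels below
// reverse a row of bytes; kShuffleMirror simply performs 16 of these moves
// per pshufb. Scalar equivalent:
static void MirrorRowSketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];
  }
}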
-__declspec(naked) -void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_SSSE3(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width movdqa xmm5, xmmword ptr kShuffleMirror @@ -3140,11 +3022,12 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) -void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void MirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vbroadcastf128 ymm5, xmmword ptr kShuffleMirror @@ -3164,17 +3047,17 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorUV = { - 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u -}; +static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -__declspec(naked) -void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void MirrorUVRow_SSSE3(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src - mov edx, [esp + 4 + 8] // dst_u + mov eax, [esp + 4 + 4] // src + mov edx, [esp + 4 + 8] // dst_u mov edi, [esp + 4 + 12] // dst_v mov ecx, [esp + 4 + 16] // width movdqa xmm1, xmmword ptr kShuffleMirrorUV @@ -3198,11 +3081,12 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif // HAS_MIRRORUVROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) -void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width lea eax, [eax - 16 + ecx * 4] // last 4 pixels. @@ -3221,15 +3105,14 @@ void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBMIRRORROW_AVX2 // Shuffle table for reversing the bytes. 
-static const ulvec32 kARGBShuffleMirror_AVX2 = { - 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u -}; +static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; -__declspec(naked) -void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst mov ecx, [esp + 12] // width vmovdqu ymm5, ymmword ptr kARGBShuffleMirror_AVX2 @@ -3246,16 +3129,17 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) -void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_SSE2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3265,10 +3149,10 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, lea eax, [eax + 32] movdqa xmm2, xmm0 movdqa xmm3, xmm1 - pand xmm0, xmm5 // even bytes + pand xmm0, xmm5 // even bytes pand xmm1, xmm5 packuswb xmm0, xmm1 - psrlw xmm2, 8 // odd bytes + psrlw xmm2, 8 // odd bytes psrlw xmm3, 8 packuswb xmm2, xmm3 movdqu [edx], xmm0 @@ -3285,16 +3169,17 @@ void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) -void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, - int width) { +__declspec(naked) void SplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_uv - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_uv + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3302,9 +3187,9 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm2, ymm0, 8 // odd bytes + vpsrlw ymm2, ymm0, 8 // odd bytes vpsrlw ymm3, ymm1, 8 - vpand ymm0, ymm0, ymm5 // even bytes + vpand ymm0, ymm0, ymm5 // even bytes vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 vpackuswb ymm2, ymm2, ymm3 @@ -3324,24 +3209,25 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) -void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_SSE2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // 
dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - movdqu xmm0, [eax] // read 16 U's + movdqu xmm0, [eax] // read 16 U's movdqu xmm1, [eax + edx] // and 16 V's lea eax, [eax + 16] movdqa xmm2, xmm0 - punpcklbw xmm0, xmm1 // first 8 UV pairs - punpckhbw xmm2, xmm1 // next 8 UV pairs + punpcklbw xmm0, xmm1 // first 8 UV pairs + punpckhbw xmm2, xmm1 // next 8 UV pairs movdqu [edi], xmm0 movdqu [edi + 16], xmm2 lea edi, [edi + 32] @@ -3355,24 +3241,25 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) -void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, - int width) { +__declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_uv, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_u - mov edx, [esp + 4 + 8] // src_v - mov edi, [esp + 4 + 12] // dst_uv - mov ecx, [esp + 4 + 16] // width + mov eax, [esp + 4 + 4] // src_u + mov edx, [esp + 4 + 8] // src_v + mov edi, [esp + 4 + 12] // dst_uv + mov ecx, [esp + 4 + 16] // width sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's + vmovdqu ymm0, [eax] // read 32 U's + vmovdqu ymm1, [eax + edx] // and 32 V's lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 + vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 + vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 + vextractf128 [edi], ymm2, 0 // bytes 0..15 vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 @@ -3388,13 +3275,14 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_AVX2 #ifdef HAS_COPYROW_SSE2 -// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { +// CopyRow copies 'width' bytes using a 16 byte load/store, 32 bytes at a time. +__declspec(naked) void CopyRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width test eax, 15 jne convertloopu test edx, 15 @@ -3426,13 +3314,14 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_SSE2 #ifdef HAS_COPYROW_AVX -// CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) -void CopyRow_AVX(const uint8* src, uint8* dst, int count) { +// CopyRow copies 'width' bytes using a 32 byte load/store, 64 bytes at a time. +__declspec(naked) void CopyRow_AVX(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] @@ -3451,14 +3340,15 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1.
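// [Editor's note] A scalar reference for the UV split/merge rows earlier in
// this section: the SIMD code isolates even bytes with a 0x00ff mask and odd
// bytes with a word shift; per byte, that is simply de-interleaving and
// re-interleaving (a sketch, not libyuv's C fallback):
static void SplitUVRow_Sketch(const uint8_t* src_uv, uint8_t* dst_u,
                              uint8_t* dst_v, int width) {
  for (int i = 0; i < width; ++i) {
    dst_u[i] = src_uv[2 * i];      // even bytes are U
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V
  }
}
static void MergeUVRow_Sketch(const uint8_t* src_u, const uint8_t* src_v,
                              uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i] = src_u[i];      // interleave back into UV pairs
    dst_uv[2 * i + 1] = src_v[i];
  }
}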
-__declspec(naked) -void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { +__declspec(naked) void CopyRow_ERMS(const uint8_t* src, + uint8_t* dst, + int width) { __asm { mov eax, esi mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // width rep movsb mov edi, edx mov esi, eax @@ -3468,15 +3358,16 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3504,14 +3395,15 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vmovdqu ymm1, [eax] @@ -3533,11 +3425,12 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { +__declspec(naked) void ARGBExtractAlphaRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_a + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a mov ecx, [esp + 12] // width extractloop: @@ -3558,17 +3451,54 @@ void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) { } #endif // HAS_ARGBEXTRACTALPHAROW_SSE2 +#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2 +// width in pixels +__declspec(naked) void ARGBExtractAlphaRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_a, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_a + mov ecx, [esp + 12] // width + vmovdqa ymm4, ymmword ptr kPermdARGBToY_AVX + + extractloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + vpsrld ymm0, ymm0, 24 + vpsrld ymm1, ymm1, 24 + vmovdqu ymm2, [eax + 64] + vmovdqu ymm3, [eax + 96] + lea eax, [eax + 128] + vpackssdw ymm0, ymm0, ymm1 // mutates + vpsrld ymm2, ymm2, 24 + vpsrld ymm3, ymm3, 24 + vpackssdw ymm2, ymm2, ymm3 // mutates + vpackuswb ymm0, ymm0, ymm2 // mutates + vpermd ymm0, ymm4, ymm0 // unmutate + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg extractloop + + vzeroupper + ret + } +} +#endif // HAS_ARGBEXTRACTALPHAROW_AVX2 + #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void 
ARGBCopyYToAlphaRow_SSE2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - pcmpeqb xmm0, xmm0 // generate mask 0xff000000 + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width + pcmpeqb xmm0, xmm0 // generate mask 0xff000000 pslld xmm0, 24 - pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff + pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff psrld xmm1, 8 convertloop: @@ -3598,14 +3528,15 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) -void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, + uint8_t* dst, + int width) { __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // width vpcmpeqb ymm0, ymm0, ymm0 - vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff + vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff convertloop: vpmovzxbd ymm1, qword ptr [eax] @@ -3628,17 +3559,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -// Write 'count' bytes using an 8 bit value repeated. -// Count should be multiple of 4. -__declspec(naked) -void SetRow_X86(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +// width should be multiple of 4. +__declspec(naked) void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { __asm { - movzx eax, byte ptr [esp + 8] // v8 + movzx eax, byte ptr [esp + 8] // v8 mov edx, 0x01010101 // Duplicate byte to all bytes. - mul edx // overwrites edx with upper part of result. + mul edx // overwrites edx with upper part of result. mov edx, edi - mov edi, [esp + 4] // dst - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov ecx, [esp + 12] // width shr ecx, 2 rep stosd mov edi, edx @@ -3646,28 +3576,28 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { } } -// Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) -void SetRow_ERMS(uint8* dst, uint8 v8, int count) { +// Write 'width' bytes using an 8 bit value repeated. +__declspec(naked) void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v8 - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // width rep stosb mov edi, edx ret } } -// Write 'count' 32 bit values. -__declspec(naked) -void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { +// Write 'width' 32 bit values. 
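// [Editor's note] Scalar sketches of the set rows here: SetRow_X86 above
// spreads the byte across a dword by multiplying with 0x01010101 and then
// stores 4 bytes per iteration (rep stosd); ARGBSetRow below stores the
// 32-bit value directly. Hypothetical helpers, not libyuv's C fallbacks:
static void SetRow_Sketch(uint8_t* dst, uint8_t v8, int width) {
  uint32_t v32 = v8 * 0x01010101u;       // duplicate byte to all four bytes
  for (int i = 0; i < width / 4; ++i) {  // width assumed a multiple of 4
    ((uint32_t*)dst)[i] = v32;
  }
}
static void ARGBSetRow_Sketch(uint8_t* dst_argb, uint32_t v32, int width) {
  for (int i = 0; i < width; ++i) {
    ((uint32_t*)dst_argb)[i] = v32;  // one ARGB pixel per store
  }
}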
+__declspec(naked) void ARGBSetRow_X86(uint8_t* dst_argb, + uint32_t v32, + int width) { __asm { mov edx, edi - mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 - mov ecx, [esp + 12] // count + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // width rep stosd mov edi, edx ret @@ -3676,12 +3606,13 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) -void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { +__declspec(naked) void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 @@ -3689,9 +3620,9 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // even bytes are Y + vpand ymm0, ymm0, ymm5 // even bytes are Y vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3702,18 +3633,20 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) { } } -__declspec(naked) -void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3723,18 +3656,18 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. 
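// [Editor's note] Each vpackuswb above interleaves its result per 128-bit
// lane (hence the "mutates" notes); the vpermq 0xd8 shuffles below put the
// quadwords back in sequential order before the U and V halves are stored.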
vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3746,16 +3679,17 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3763,18 +3697,18 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV + vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3785,21 +3719,21 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) -void UYVYToYRow_AVX2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // odd bytes are Y + vpsrlw ymm0, ymm0, 8 // odd bytes are Y vpsrlw ymm1, ymm1, 8 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. 
vpermq ymm0, ymm0, 0xd8 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -3810,18 +3744,20 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_AVX2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3831,18 +3767,18 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, vpavgb ymm0, ymm0, [eax + esi] vpavgb ymm1, ymm1, [eax + esi + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3854,16 +3790,17 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_AVX2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff vpsrlw ymm5, ymm5, 8 sub edi, edx @@ -3871,18 +3808,18 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpand ymm0, ymm0, ymm5 // UYVY -> UVUV + vpand ymm0, ymm0, ymm5 // UYVY -> UVUV vpand ymm1, ymm1, ymm5 - vpackuswb ymm0, ymm0, ymm1 // mutates. + vpackuswb ymm0, ymm0, ymm1 // mutates. vpermq ymm0, ymm0, 0xd8 vpand ymm1, ymm0, ymm5 // U - vpsrlw ymm0, ymm0, 8 // V + vpsrlw ymm0, ymm0, 8 // V vpackuswb ymm1, ymm1, ymm1 // mutates. vpackuswb ymm0, ymm0, ymm0 // mutates. 
vpermq ymm1, ymm1, 0xd8 vpermq ymm0, ymm0, 0xd8 vextractf128 [edx], ymm1, 0 // U - vextractf128 [edx + edi], ymm0, 0 // V + vextractf128 [edx + edi], ymm0, 0 // V lea edx, [edx + 16] sub ecx, 32 jg convertloop @@ -3895,21 +3832,21 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) -void YUY2ToYRow_SSE2(const uint8* src_yuy2, - uint8* dst_y, int width) { - __asm { - mov eax, [esp + 4] // src_yuy2 - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff +__declspec(naked) void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_y, + int width) { + __asm { + mov eax, [esp + 4] // src_yuy2 + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // even bytes are Y + pand xmm0, xmm5 // even bytes are Y pand xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -3920,18 +3857,20 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) -void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2, + int stride_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3943,13 +3882,13 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3963,16 +3902,17 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) -void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -3980,13 +3920,13 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm0, 8 // YUYV -> UVUV psrlw xmm1, 8 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -3999,19 +3939,19 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 
} } -__declspec(naked) -void UYVYToYRow_SSE2(const uint8* src_uyvy, - uint8* dst_y, int width) { +__declspec(naked) void UYVYToYRow_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_y, + int width) { __asm { - mov eax, [esp + 4] // src_uyvy - mov edx, [esp + 8] // dst_y - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_uyvy + mov edx, [esp + 8] // dst_y + mov ecx, [esp + 12] // width convertloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // odd bytes are Y + psrlw xmm0, 8 // odd bytes are Y psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4022,18 +3962,20 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) -void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUVRow_SSE2(const uint8_t* src_uyvy, + int stride_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_yuy2 - mov esi, [esp + 8 + 8] // stride_yuy2 - mov edx, [esp + 8 + 12] // dst_u - mov edi, [esp + 8 + 16] // dst_v - mov ecx, [esp + 8 + 20] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 8 + 4] // src_yuy2 + mov esi, [esp + 8 + 8] // stride_yuy2 + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4045,13 +3987,13 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, lea eax, [eax + 32] pavgb xmm0, xmm2 pavgb xmm1, xmm3 - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4065,16 +4007,17 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) -void UYVYToUV422Row_SSE2(const uint8* src_uyvy, - uint8* dst_u, uint8* dst_v, int width) { +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8_t* src_uyvy, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { __asm { push edi - mov eax, [esp + 4 + 4] // src_yuy2 - mov edx, [esp + 4 + 8] // dst_u - mov edi, [esp + 4 + 12] // dst_v - mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff psrlw xmm5, 8 sub edi, edx @@ -4082,13 +4025,13 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pand xmm0, xmm5 // UYVY -> UVUV + pand xmm0, xmm5 // UYVY -> UVUV pand xmm1, xmm5 packuswb xmm0, xmm1 movdqa xmm1, xmm0 pand xmm0, xmm5 // U packuswb xmm0, xmm0 - psrlw xmm1, 8 // V + psrlw xmm1, 8 // V packuswb xmm1, xmm1 movq qword ptr [edx], xmm0 movq qword ptr [edx + edi], xmm1 @@ -4108,13 +4051,15 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_SSSE3(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { __asm { push esi push edi - pcmpeqb xmm5, xmm5 // generate mask 
0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. movd xmm6, eax @@ -4123,8 +4068,8 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. movd xmm7, eax pshufd xmm7, xmm7, 0x00 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4132,17 +4077,17 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, sub edx, esi sub edi, esi - // 8 pixel loop. + // 8 pixel loop. convertloop8: - movq xmm0, qword ptr [esi] // alpha + movq xmm0, qword ptr [esi] // alpha punpcklbw xmm0, xmm0 - pxor xmm0, xmm5 // a, 255-a + pxor xmm0, xmm5 // a, 255-a movq xmm1, qword ptr [eax + esi] // src0 movq xmm2, qword ptr [edx + esi] // src1 punpcklbw xmm1, xmm2 - psubb xmm1, xmm6 // bias src0/1 - 128 + psubb xmm1, xmm6 // bias src0/1 - 128 pmaddubsw xmm0, xmm1 - paddw xmm0, xmm7 // unbias result - 32768 and round. + paddw xmm0, xmm7 // unbias result - 32768 and round. psrlw xmm0, 8 packuswb xmm0, xmm0 movq qword ptr [edi + esi], xmm0 @@ -4163,13 +4108,15 @@ void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, // =((A2*C2)+(B2*(255-C2))+255)/256 // signed version of math // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 -__declspec(naked) -void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, - const uint8* alpha, uint8* dst, int width) { +__declspec(naked) void BlendPlaneRow_AVX2(const uint8_t* src0, + const uint8_t* src1, + const uint8_t* alpha, + uint8_t* dst, + int width) { __asm { push esi push edi - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff00ff00 vpsllw ymm5, ymm5, 8 mov eax, 0x80808080 // 128 for biasing image to signed. vmovd xmm6, eax @@ -4177,8 +4124,8 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, mov eax, 0x807f807f // 32768 + 127 for unbias and round. vmovd xmm7, eax vbroadcastss ymm7, xmm7 - mov eax, [esp + 8 + 4] // src0 - mov edx, [esp + 8 + 8] // src1 + mov eax, [esp + 8 + 4] // src0 + mov edx, [esp + 8 + 8] // src1 mov esi, [esp + 8 + 12] // alpha mov edi, [esp + 8 + 16] // dst mov ecx, [esp + 8 + 20] // width @@ -4186,23 +4133,23 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, sub edx, esi sub edi, esi - // 32 pixel loop. + // 32 pixel loop. convertloop32: - vmovdqu ymm0, [esi] // alpha - vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 - vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 - vpxor ymm3, ymm3, ymm5 // a, 255-a - vpxor ymm0, ymm0, ymm5 // a, 255-a + vmovdqu ymm0, [esi] // alpha + vpunpckhbw ymm3, ymm0, ymm0 // 8..15, 24..31 + vpunpcklbw ymm0, ymm0, ymm0 // 0..7, 16..23 + vpxor ymm3, ymm3, ymm5 // a, 255-a + vpxor ymm0, ymm0, ymm5 // a, 255-a vmovdqu ymm1, [eax + esi] // src0 vmovdqu ymm2, [edx + esi] // src1 vpunpckhbw ymm4, ymm1, ymm2 vpunpcklbw ymm1, ymm1, ymm2 - vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 - vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 + vpsubb ymm4, ymm4, ymm6 // bias src0/1 - 128 + vpsubb ymm1, ymm1, ymm6 // bias src0/1 - 128 vpmaddubsw ymm3, ymm3, ymm4 vpmaddubsw ymm0, ymm0, ymm1 - vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. - vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. + vpaddw ymm3, ymm3, ymm7 // unbias result - 32768 and round. + vpaddw ymm0, ymm0, ymm7 // unbias result - 32768 and round. 
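// [Editor's note] The bias arithmetic above exists because vpmaddubsw
// multiplies unsigned bytes (the alpha pair a, 255-a) by signed bytes:
// subtracting 128 from the sources keeps them in signed range, and the
// 0x807f add restores the bias and rounds before the shift down to 8 bits.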
vpsrlw ymm3, ymm3, 8 vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm3 @@ -4221,52 +4168,51 @@ void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, #ifdef HAS_ARGBBLENDROW_SSSE3 // Shuffle table for isolating alpha. -static const uvec8 kShuffleAlpha = { - 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, - 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 -}; +static const uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80}; // Blend 8 pixels at a time. -__declspec(naked) -void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pcmpeqb xmm7, xmm7 // generate constant 0x0001 + pcmpeqb xmm7, xmm7 // generate constant 0x0001 psrlw xmm7, 15 - pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff + pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff psrlw xmm6, 8 - pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 + pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 psllw xmm5, 8 - pcmpeqb xmm4, xmm4 // generate mask 0xff000000 + pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 sub ecx, 4 - jl convertloop4b // less than 4 pixels? + jl convertloop4b // less than 4 pixels? - // 4 pixel loop. + // 4 pixel loop. convertloop4: - movdqu xmm3, [eax] // src argb + movdqu xmm3, [eax] // src argb lea eax, [eax + 16] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movdqu xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movdqu xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movdqu xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movdqu xmm1, [esi] // _a_g lea esi, [esi + 16] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4276,26 +4222,26 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. 
convertloop1: - movd xmm3, [eax] // src argb + movd xmm3, [eax] // src argb lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, xmmword ptr kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g + movdqa xmm0, xmm3 // src argb + pxor xmm3, xmm4 // ~alpha + movd xmm2, [esi] // _r_b + pshufb xmm3, xmmword ptr kShuffleAlpha // alpha + pand xmm2, xmm6 // _r_b + paddw xmm3, xmm7 // 256 - alpha + pmullw xmm2, xmm3 // _r_b * alpha + movd xmm1, [esi] // _a_g lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb + psrlw xmm1, 8 // _a_g + por xmm0, xmm4 // set alpha to 255 + pmullw xmm1, xmm3 // _a_g * alpha + psrlw xmm2, 8 // _r_b convert to 8 bits again + paddusb xmm0, xmm2 // + src argb + pand xmm1, xmm5 // a_g_ convert to 8 bits again + paddusb xmm0, xmm1 // + src argb movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4311,41 +4257,42 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = { - 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, }; static const uvec8 kShuffleAlpha1 = { - 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) -void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width - pcmpeqb xmm3, xmm3 // generate mask 0xff000000 + pcmpeqb xmm3, xmm3 // generate mask 0xff000000 pslld xmm3, 24 movdqa xmm4, xmmword ptr kShuffleAlpha0 movdqa xmm5, xmmword ptr kShuffleAlpha1 convertloop: - movdqu xmm0, [eax] // read 4 pixels - pshufb xmm0, xmm4 // isolate first 2 alphas - movdqu xmm1, [eax] // read 4 pixels - punpcklbw xmm1, xmm1 // first 2 pixel rgbs - pmulhuw xmm0, xmm1 // rgb * a - movdqu xmm1, [eax] // read 4 pixels - pshufb xmm1, xmm5 // isolate next 2 alphas - movdqu xmm2, [eax] // read 4 pixels - punpckhbw xmm2, xmm2 // next 2 pixel rgbs - pmulhuw xmm1, xmm2 // rgb * a - movdqu xmm2, [eax] // mask original alpha + movdqu xmm0, [eax] // read 4 pixels + pshufb xmm0, xmm4 // isolate first 2 alphas + movdqu xmm1, [eax] // read 4 pixels + punpcklbw xmm1, xmm1 // first 2 pixel rgbs + pmulhuw xmm0, xmm1 // rgb * a + movdqu xmm1, [eax] // read 4 pixels + pshufb xmm1, xmm5 // isolate next 2 alphas + movdqu xmm2, [eax] // read 4 pixels + punpckhbw xmm2, xmm2 // next 2 pixel rgbs + pmulhuw xmm1, xmm2 // rgb * a + movdqu xmm2, [eax] // mask original alpha lea eax, [eax + 16] pand xmm2, xmm3 psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 - por xmm0, xmm2 // copy original alpha + por xmm0, xmm2 // copy original alpha movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4358,22 +4305,23 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 
int width) { #ifdef HAS_ARGBATTENUATEROW_AVX2 // Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = { - 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u -}; -__declspec(naked) -void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { - __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kShuffleAlpha_AVX2 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 vpslld ymm5, ymm5, 24 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpshufb ymm2, ymm0, ymm4 // low 4 alphas @@ -4398,40 +4346,40 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -__declspec(naked) -void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width lea ebx, fixed_invtbl8 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels movzx esi, byte ptr [eax + 3] // first alpha movzx edi, byte ptr [eax + 7] // second alpha - punpcklbw xmm0, xmm0 // first 2 + punpcklbw xmm0, xmm0 // first 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm0, xmm2 // rgb * a + pmulhuw xmm0, xmm2 // rgb * a - movdqu xmm1, [eax] // read 4 pixels + movdqu xmm1, [eax] // read 4 pixels movzx esi, byte ptr [eax + 11] // third alpha movzx edi, byte ptr [eax + 15] // forth alpha - punpckhbw xmm1, xmm1 // next 2 + punpckhbw xmm1, xmm1 // next 2 movd xmm2, dword ptr [ebx + esi * 4] movd xmm3, dword ptr [ebx + edi * 4] - pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words - pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words + pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words + pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words movlhps xmm2, xmm3 - pmulhuw xmm1, xmm2 // rgb * a + pmulhuw xmm1, xmm2 // rgb * a lea eax, [eax + 16] packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -4450,25 +4398,24 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBUNATTENUATEROW_AVX2 // Shuffle table duplicating alpha. static const uvec8 kUnattenShuffleAlpha_AVX2 = { - 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u -}; + 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u}; // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. 
// USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] // src_argb0 - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width sub edx, eax vbroadcastf128 ymm4, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - vmovdqu ymm6, [eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. - vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a @@ -4488,50 +4435,50 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, ret } } -#else // USE_GATHER -__declspec(naked) -void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, - int width) { +#else // USE_GATHER +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov edx, [esp + 12 + 8] // dst_argb + mov eax, [esp + 12 + 4] // src_argb + mov edx, [esp + 12 + 8] // dst_argb mov ecx, [esp + 12 + 12] // width sub edx, eax lea ebx, fixed_invtbl8 vbroadcastf128 ymm5, xmmword ptr kUnattenShuffleAlpha_AVX2 convertloop: - // replace VPGATHER - movzx esi, byte ptr [eax + 3] // alpha0 - movzx edi, byte ptr [eax + 7] // alpha1 + // replace VPGATHER + movzx esi, byte ptr [eax + 3] // alpha0 + movzx edi, byte ptr [eax + 7] // alpha1 vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a0] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a1] - movzx esi, byte ptr [eax + 11] // alpha2 - movzx edi, byte ptr [eax + 15] // alpha3 - vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] + movzx esi, byte ptr [eax + 11] // alpha2 + movzx edi, byte ptr [eax + 15] // alpha3 + vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a2] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a3] - movzx esi, byte ptr [eax + 19] // alpha4 - movzx edi, byte ptr [eax + 23] // alpha5 - vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] + movzx esi, byte ptr [eax + 19] // alpha4 + movzx edi, byte ptr [eax + 23] // alpha5 + vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] vmovd xmm0, dword ptr [ebx + esi * 4] // [1,a4] vmovd xmm1, dword ptr [ebx + edi * 4] // [1,a5] - movzx esi, byte ptr [eax + 27] // alpha6 - movzx edi, byte ptr [eax + 31] // alpha7 - vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] + movzx esi, byte ptr [eax + 27] // alpha6 + movzx edi, byte ptr [eax + 31] // alpha7 + vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] vmovd xmm2, dword ptr [ebx + esi * 4] // [1,a6] vmovd xmm3, dword ptr [ebx + edi * 4] // [1,a7] - vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] - vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] - vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] - vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] + vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] + vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] + vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] + vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] // end of VPGATHER - vmovdqu ymm6, 
[eax] // read 8 pixels. + vmovdqu ymm6, [eax] // read 8 pixels. vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a @@ -4540,7 +4487,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas vpmulhuw ymm0, ymm0, ymm2 // rgb * ia vpmulhuw ymm1, ymm1, ymm3 // rgb * ia - vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpackuswb ymm0, ymm0, ymm1 // unmutated. vmovdqu [eax + edx], ymm0 lea eax, [eax + 32] sub ecx, 8 @@ -4558,12 +4505,13 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) -void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* width */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* width */ movdqa xmm4, xmmword ptr kARGBToYJ movdqa xmm5, xmmword ptr kAddYJ64 @@ -4575,20 +4523,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { phaddw xmm0, xmm1 paddw xmm0, xmm5 // Add .5 for rounding. psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 G bytes + packuswb xmm0, xmm0 // 8 G bytes movdqu xmm2, [eax] // A movdqu xmm3, [eax + 16] lea eax, [eax + 32] psrld xmm2, 24 psrld xmm3, 24 packuswb xmm2, xmm3 - packuswb xmm2, xmm2 // 8 A bytes - movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA - punpcklbw xmm0, xmm0 // 8 GG words - punpcklbw xmm3, xmm2 // 8 GA words + packuswb xmm2, xmm2 // 8 A bytes + movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA + punpcklbw xmm0, xmm0 // 8 GG words + punpcklbw xmm3, xmm2 // 8 GA words movdqa xmm1, xmm0 - punpcklwd xmm0, xmm3 // GGGA first 4 - punpckhwd xmm1, xmm3 // GGGA next 4 + punpcklwd xmm0, xmm3 // GGGA first 4 + punpckhwd xmm1, xmm3 // GGGA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm1 lea edx, [edx + 32] @@ -4604,24 +4552,20 @@ void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 // Constant for ARGB color to sepia tone. -static const vec8 kARGBToSepiaB = { - 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 -}; +static const vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0, + 17, 68, 35, 0, 17, 68, 35, 0}; -static const vec8 kARGBToSepiaG = { - 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 -}; +static const vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0, + 22, 88, 45, 0, 22, 88, 45, 0}; -static const vec8 kARGBToSepiaR = { - 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 -}; +static const vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0, + 24, 98, 50, 0, 24, 98, 50, 0}; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
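// [Editor's note] A scalar reference for the sepia transform below, using
// the same coefficients as the kARGBToSepia tables (a sketch; alpha passes
// through unchanged):
static void ARGBSepiaRow_Sketch(uint8_t* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    uint8_t* p = dst_argb + 4 * i;  // BGRA byte order
    int b = p[0], g = p[1], r = p[2];
    int sb = (b * 17 + g * 68 + r * 35) >> 7;
    int sg = (b * 22 + g * 88 + r * 45) >> 7;
    int sr = (b * 24 + g * 98 + r * 50) >> 7;
    p[0] = (uint8_t)(sb > 255 ? 255 : sb);  // packuswb saturation
    p[1] = (uint8_t)(sg > 255 ? 255 : sg);
    p[2] = (uint8_t)(sr > 255 ? 255 : sr);
  }
}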
-__declspec(naked) -void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { __asm { - mov eax, [esp + 4] /* dst_argb */ - mov ecx, [esp + 8] /* width */ + mov eax, [esp + 4] /* dst_argb */ + mov ecx, [esp + 8] /* width */ movdqa xmm2, xmmword ptr kARGBToSepiaB movdqa xmm3, xmmword ptr kARGBToSepiaG movdqa xmm4, xmmword ptr kARGBToSepiaR @@ -4633,32 +4577,32 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 psrld xmm1, 24 packuswb xmm6, xmm1 - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm5, xmm6 // 8 RA values - movdqa xmm1, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm5 // BGRA first 4 - punpckhwd xmm1, xmm5 // BGRA next 4 + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm5, xmm6 // 8 RA values + movdqa xmm1, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm5 // BGRA first 4 + punpckhwd xmm1, xmm5 // BGRA next 4 movdqu [eax], xmm0 movdqu [eax + 16], xmm1 lea eax, [eax + 32] @@ -4674,19 +4618,20 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
-__declspec(naked) -void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const int8* matrix_argb, int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* matrix_argb */ +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const int8_t* matrix_argb, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* matrix_argb */ movdqu xmm5, [ecx] pshufd xmm2, xmm5, 0x00 pshufd xmm3, xmm5, 0x55 pshufd xmm4, xmm5, 0xaa pshufd xmm5, xmm5, 0xff - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ convertloop: movdqu xmm0, [eax] // B @@ -4697,31 +4642,31 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 - phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A - packuswb xmm1, xmm1 // 8 R values - packuswb xmm6, xmm6 // 8 A values - punpcklbw xmm1, xmm6 // 8 RA values - movdqa xmm6, xmm0 // Weave BG, RA together - punpcklwd xmm0, xmm1 // BGRA first 4 - punpckhwd xmm6, xmm1 // BGRA next 4 + phaddsw xmm6, xmm7 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A + packuswb xmm1, xmm1 // 8 R values + packuswb xmm6, xmm6 // 8 A values + punpcklbw xmm1, xmm6 // 8 RA values + movdqa xmm6, xmm0 // Weave BG, RA together + punpcklwd xmm0, xmm1 // BGRA first 4 + punpckhwd xmm6, xmm1 // BGRA next 4 movdqu [edx], xmm0 movdqu [edx + 16], xmm6 lea eax, [eax + 32] @@ -4735,15 +4680,17 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). 
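// [Editor's note] A scalar reference for the quantize row below: each color
// channel is scaled, snapped to an interval, and offset, while alpha is kept
// via the 0xff000000 mask (a sketch mirroring the pmulhuw/pmullw/paddw
// steps):
static void ARGBQuantizeRow_Sketch(uint8_t* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int i = 0; i < width * 4; ++i) {
    if ((i & 3) == 3) continue;  // skip the alpha byte
    int v = ((dst_argb[i] * scale) >> 16) * interval_size + interval_offset;
    dst_argb[i] = (uint8_t)(v > 255 ? 255 : v);
  }
}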
-__declspec(naked) -void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, - int interval_offset, int width) { - __asm { - mov eax, [esp + 4] /* dst_argb */ - movd xmm2, [esp + 8] /* scale */ - movd xmm3, [esp + 12] /* interval_size */ - movd xmm4, [esp + 16] /* interval_offset */ - mov ecx, [esp + 20] /* width */ +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, + int scale, + int interval_size, + int interval_offset, + int width) { + __asm { + mov eax, [esp + 4] /* dst_argb */ + movd xmm2, [esp + 8] /* scale */ + movd xmm3, [esp + 12] /* interval_size */ + movd xmm4, [esp + 16] /* interval_offset */ + mov ecx, [esp + 20] /* width */ pshuflw xmm2, xmm2, 040h pshufd xmm2, xmm2, 044h pshuflw xmm3, xmm3, 040h @@ -4756,16 +4703,16 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 // * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4780,25 +4727,26 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. -__declspec(naked) -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { +__declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + uint32_t value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 convertloop: - movdqu xmm0, [eax] // read 4 pixels + movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4814,28 +4762,29 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 
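// [Editor's note] A scalar reference for the multiply row below. punpcklbw
// with itself turns each byte v into the word v * 0x0101, so pmulhuw
// computes (s0 * 257 * s1) >> 16, a close fixed-point approximation of
// s0 * s1 / 255 (a sketch, not libyuv's C fallback):
static void ARGBMultiplyRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    dst[i] = (uint8_t)((src0[i] * 0x0101 * src1[i]) >> 16);
  }
}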
-__declspec(naked) -void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width pxor xmm5, xmm5 // constant 0 convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 - movdqu xmm2, [esi] // read 4 pixels from src_argb1 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4853,13 +4802,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) -void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4867,11 +4817,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop49 convertloop4: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4882,11 +4832,11 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, jl convertloop19 convertloop1: - movd xmm0, [eax] // read 1 pixels from src_argb0 + movd xmm0, [eax] // read 1 pixel from src_argb0 lea eax, [eax + 4] - movd xmm1, [esi] // read 1 pixels from src_argb1 + movd xmm1, [esi] // read 1 pixel from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb0 + src_argb1 + paddusb xmm0, xmm1 // src_argb0 + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -4901,22 +4851,23 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time.
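// [Editor's note] The add and subtract rows in this section are plain
// per-byte saturating arithmetic (paddusb/psubusb); a scalar sketch of both:
static void ARGBAddRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                              uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int v = src0[i] + src1[i];
    dst[i] = (uint8_t)(v > 255 ? 255 : v);  // paddusb clamps at 255
  }
}
static void ARGBSubtractRow_Sketch(const uint8_t* src0, const uint8_t* src1,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int v = src0[i] - src1[i];
    dst[i] = (uint8_t)(v < 0 ? 0 : v);  // psubusb clamps at 0
  }
}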
-__declspec(naked) -void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - movdqu xmm0, [eax] // read 4 pixels from src_argb0 + movdqu xmm0, [eax] // read 4 pixels from src_argb0 lea eax, [eax + 16] - movdqu xmm1, [esi] // read 4 pixels from src_argb1 + movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - psubusb xmm0, xmm1 // src_argb0 - src_argb1 + psubusb xmm0, xmm1 // src_argb0 - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -4930,28 +4881,29 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: - vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 + vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 lea esi, [esi + 32] - vpunpcklbw ymm0, ymm1, ymm1 // low 4 - vpunpckhbw ymm1, ymm1, ymm1 // high 4 - vpunpcklbw ymm2, ymm3, ymm5 // low 4 - vpunpckhbw ymm3, ymm3, ymm5 // high 4 - vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 - vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 + vpunpcklbw ymm0, ymm1, ymm1 // low 4 + vpunpckhbw ymm1, ymm1, ymm1 // high 4 + vpunpcklbw ymm2, ymm3, ymm5 // low 4 + vpunpckhbw ymm3, ymm3, ymm5 // high 4 + vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 + vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 vpackuswb ymm0, ymm0, ymm1 vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4967,20 +4919,21 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. 
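The AVX2 add row that follows folds the second load straight into vpaddusb and ends with vzeroupper to avoid AVX-to-SSE transition penalties. An intrinsics model (editorial sketch assuming AVX2 and a width that is a multiple of 8 pixels; the real code handles ragged widths through its _Any_ dispatch variants):

#include <immintrin.h>
#include <stdint.h>

static void ARGBAddRow_AVX2_Model(const uint8_t* src0, const uint8_t* src1,
                                  uint8_t* dst, int width) {
  int i;
  for (i = 0; i < width; i += 8) {  // 8 ARGB pixels = 32 bytes per iteration
    __m256i a = _mm256_loadu_si256((const __m256i*)(src0 + i * 4));
    __m256i b = _mm256_loadu_si256((const __m256i*)(src1 + i * 4));
    _mm256_storeu_si256((__m256i*)(dst + i * 4), _mm256_adds_epu8(a, b));
  }
  _mm256_zeroupper();  // mirror the vzeroupper in the asm epilogue
}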
-__declspec(naked) -void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 + vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -4996,20 +4949,21 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) -void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, - uint8* dst_argb, int width) { +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb0, + const uint8_t* src_argb1, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb0 - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb0 + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width convertloop: - vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 + vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 lea eax, [eax + 32] - vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 + vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 lea esi, [esi + 32] vmovdqu [edx], ymm0 lea edx, [edx + 32] @@ -5028,14 +4982,16 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) -void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, - const uint8* src_y2, uint8* dst_sobelx, int width) { +__declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + const uint8_t* src_y2, + uint8_t* dst_sobelx, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5045,17 +5001,17 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5063,7 +5019,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, 
xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5084,13 +5040,14 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) -void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, - uint8* dst_sobely, int width) { +__declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, + const uint8_t* src_y1, + uint8_t* dst_sobely, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5098,17 +5055,17 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5116,7 +5073,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, paddw xmm0, xmm2 paddw xmm0, xmm1 paddw xmm0, xmm1 - pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw + pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). 
SSSE3 could use pabsw psubw xmm1, xmm0 pmaxsw xmm0, xmm1 packuswb xmm0, xmm0 @@ -5137,36 +5094,37 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) -void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 - pslld xmm5, 24 // 0xff000000 + pcmpeqb xmm5, xmm5 // alpha 255 + pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely - movdqa xmm2, xmm0 // GG - punpcklbw xmm2, xmm0 // First 8 - punpckhbw xmm0, xmm0 // Next 8 - movdqa xmm1, xmm2 // GGGG - punpcklwd xmm1, xmm2 // First 4 - punpckhwd xmm2, xmm2 // Next 4 - por xmm1, xmm5 // GGGA + paddusb xmm0, xmm1 // sobel = sobelx + sobely + movdqa xmm2, xmm0 // GG + punpcklbw xmm2, xmm0 // First 8 + punpckhbw xmm0, xmm0 // Next 8 + movdqa xmm1, xmm2 // GGGG + punpcklwd xmm1, xmm2 // First 4 + punpckhwd xmm2, xmm2 // Next 4 + por xmm1, xmm5 // GGGA por xmm2, xmm5 - movdqa xmm3, xmm0 // GGGG - punpcklwd xmm3, xmm0 // Next 4 - punpckhwd xmm0, xmm0 // Last 4 - por xmm3, xmm5 // GGGA + movdqa xmm3, xmm0 // GGGG + punpcklwd xmm3, xmm0 // Next 4 + punpckhwd xmm0, xmm0 // Last 4 + por xmm3, xmm5 // GGGA por xmm0, xmm5 movdqu [edx], xmm1 movdqu [edx + 16], xmm2 @@ -5184,22 +5142,23 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 
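Taken together, the Sobel rows above implement the classic 3x3 operator: an X pass and a Y pass with 1/2/1 weights, absolute values via the pxor/psubw/pmaxsw sequence, then a saturating add of the two magnitudes. A per-pixel scalar model (editorial sketch; the row functions receive neighbor rows as separate pointers, and signs are irrelevant under the absolute value):

#include <stdint.h>

static uint8_t SobelPixel_Model(const uint8_t* y0, const uint8_t* y1,
                                const uint8_t* y2, int i) {
  int gx = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) + (y2[i] - y2[i + 2]);
  int gy = (y0[i] - y2[i]) + 2 * (y0[i + 1] - y2[i + 1]) + (y0[i + 2] - y2[i + 2]);
  int s = (gx < 0 ? -gx : gx) + (gy < 0 ? -gy : gy);
  return (uint8_t)(s > 255 ? 255 : s);  // packuswb / paddusb saturation
}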
-__declspec(naked) -void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_y, int width) { +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_y, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] - paddusb xmm0, xmm1 // sobel = sobelx + sobely + paddusb xmm0, xmm1 // sobel = sobelx + sobely movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -5217,36 +5176,37 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) -void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { +__declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, + const uint8_t* src_sobely, + uint8_t* dst_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax - pcmpeqb xmm5, xmm5 // alpha 255 + pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx - movdqu xmm1, [eax + esi] // read 16 pixels src_sobely + movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 - paddusb xmm2, xmm1 // sobel = sobelx + sobely - movdqa xmm3, xmm0 // XA + paddusb xmm2, xmm1 // sobel = sobelx + sobely + movdqa xmm3, xmm0 // XA punpcklbw xmm3, xmm5 punpckhbw xmm0, xmm5 - movdqa xmm4, xmm1 // YS + movdqa xmm4, xmm1 // YS punpcklbw xmm4, xmm2 punpckhbw xmm1, xmm2 - movdqa xmm6, xmm4 // YSXA - punpcklwd xmm6, xmm3 // First 4 - punpckhwd xmm4, xmm3 // Next 4 - movdqa xmm7, xmm1 // YSXA - punpcklwd xmm7, xmm0 // Next 4 - punpckhwd xmm1, xmm0 // Last 4 + movdqa xmm6, xmm4 // YSXA + punpcklwd xmm6, xmm3 // First 4 + punpckhwd xmm4, xmm3 // Next 4 + movdqa xmm7, xmm1 // YSXA + punpcklwd xmm7, xmm0 // Next 4 + punpckhwd xmm1, xmm0 // Last 4 movdqu [edx], xmm6 movdqu [edx + 16], xmm4 movdqu [edx + 32], xmm7 @@ -5275,8 +5235,11 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // count is number of averaged pixels to produce. // Does 4 pixels at a time. // This function requires alignment on accumulation buffer pointers. -void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, - int width, int area, uint8* dst, +void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, + const int32_t* botleft, + int width, + int area, + uint8_t* dst, int count) { __asm { mov eax, topleft // eax topleft @@ -5294,18 +5257,18 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, cmp area, 128 // 128 pixels will not overflow 15 bits. 
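  // Editorial note on the branch above: the small-block path at "s4" packs
  // the box sums into 16-bit lanes (packssdw) and appears to scale them by
  // a 0.16 fixed-point reciprocal of the area, so each sum must fit a
  // signed 16-bit lane; area <= 128 bounds a byte-channel sum by
  // 128 * 255 = 32640 < 32767. In scalar terms the reciprocal built just
  // below computes roughly
  //   avg = (sum * ((65536 + area - 1) / area)) >> 16;
  // while larger areas fall through to the float path at "l4".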
ja l4 - pshufd xmm5, xmm5, 0 // area - pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 + pshufd xmm5, xmm5, 0 // area + pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 psrld xmm6, 16 cvtdq2ps xmm6, xmm6 - addps xmm5, xmm6 // (65536.0 + area - 1) - mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area - cvtps2dq xmm5, xmm5 // 0.16 fixed point - packssdw xmm5, xmm5 // 16 bit shorts + addps xmm5, xmm6 // (65536.0 + area - 1) + mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area + cvtps2dq xmm5, xmm5 // 0.16 fixed point + packssdw xmm5, xmm5 // 16 bit shorts - // 4 pixel loop small blocks. + // 4 pixel loop small blocks. s4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5345,9 +5308,9 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, jmp l4b - // 4 pixel loop + // 4 pixel loop l4: - // top left + // top left movdqu xmm0, [eax] movdqu xmm1, [eax + 16] movdqu xmm2, [eax + 32] @@ -5373,7 +5336,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, paddd xmm3, [esi + edx * 4 + 48] lea esi, [esi + 64] - cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area + cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area cvtdq2ps xmm1, xmm1 mulps xmm0, xmm4 mulps xmm1, xmm4 @@ -5397,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5422,8 +5385,10 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value. -void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, - const int32* previous_cumsum, int width) { +void ComputeCumulativeSumRow_SSE2(const uint8_t* row, + int32_t* cumsum, + const int32_t* previous_cumsum, + int width) { __asm { mov eax, row mov edx, cumsum @@ -5437,7 +5402,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. lea eax, [eax + 16] @@ -5483,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. lea eax, [eax + 4] @@ -5505,10 +5470,11 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. 
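The affine row below walks the source along (u, v) with a per-pixel step (du, dv), turning each coordinate pair into a byte offset of x * 4 + y * stride; the asm does four pixels per iteration by packing (x, y) pairs into shorts and using pmaddwd against (4, stride). A scalar model (editorial sketch, hypothetical helper name):

#include <stdint.h>
#include <string.h>

static void ARGBAffineRow_Model(const uint8_t* src, int stride, uint8_t* dst,
                                const float* uv_dudv, int width) {
  float u = uv_dudv[0], v = uv_dudv[1];
  int i;
  for (i = 0; i < width; ++i) {
    int off = (int)u * 4 + (int)v * stride;  // truncation, like cvttps2dq
    memcpy(dst + i * 4, src + off, 4);       // copy one ARGB pixel
    u += uv_dudv[2];                         // du
    v += uv_dudv[3];                         // dv
  }
}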
-__declspec(naked) -LIBYUV_API -void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, - uint8* dst_argb, const float* uv_dudv, int width) { +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, + int src_argb_stride, + uint8_t* dst_argb, + const float* uv_dudv, + int width) { __asm { push esi push edi @@ -5519,46 +5485,46 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width - shl esi, 16 // 4, stride + shl esi, 16 // 4, stride add esi, 4 movd xmm5, esi sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride - movdqa xmm0, xmm2 // x0, y0, x1, y1 + movdqa xmm0, xmm2 // x0, y0, x1, y1 addps xmm0, xmm7 movlhps xmm2, xmm0 movdqa xmm4, xmm7 - addps xmm4, xmm4 // dudv *= 2 - movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 addps xmm3, xmm4 - addps xmm4, xmm4 // dudv *= 4 + addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: - cvttps2dq xmm0, xmm2 // x, y float to int first 2 - cvttps2dq xmm1, xmm3 // x, y float to int next 2 - packssdw xmm0, xmm1 // x, y as 8 shorts - pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 - punpckldq xmm1, xmm6 // combine pixel 0 and 1 - addps xmm2, xmm4 // x, y += dx, dy first 2 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + addps xmm2, xmm4 // x, y += dx, dy first 2 movq qword ptr [edx], xmm1 movd esi, xmm0 pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 movd xmm6, [eax + esi] // read pixel 2 movd xmm0, [eax + edi] // read pixel 3 - punpckldq xmm6, xmm0 // combine pixel 2 and 3 - addps xmm3, xmm4 // x, y += dx, dy next 2 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + addps xmm3, xmm4 // x, y += dx, dy next 2 movq qword ptr 8[edx], xmm6 lea edx, [edx + 16] sub ecx, 4 @@ -5568,12 +5534,12 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: - cvttps2dq xmm0, xmm2 // x, y float to int - packssdw xmm0, xmm0 // x, y as shorts - pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride - addps xmm2, xmm7 // x, y += dx, dy + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel movd [edx], xmm0 @@ -5590,15 +5556,16 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) -void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, 
[esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5607,7 +5574,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5634,14 +5601,14 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: vmovdqu ymm0, [esi] vpavgb ymm0, ymm0, [esi + edx] @@ -5651,7 +5618,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: rep movsb @@ -5666,25 +5633,26 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, // Bilinear filter 16x2 -> 16x1 // TODO(fbarchard): Consider allowing 256 using memcpy. -__declspec(naked) -void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, - int source_y_fraction) { +__declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. movd xmm0, eax // high fraction 0..255 neg eax @@ -5703,7 +5671,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5720,7 +5688,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop jmp xloop99 - // Blend 50 / 50. + // Blend 50 / 50. xloop50: movdqu xmm0, [esi] movdqu xmm1, [esi + edx] @@ -5731,7 +5699,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, jg xloop50 jmp xloop99 - // Blend 100 / 0 - Copy row unchanged. + // Blend 100 / 0 - Copy row unchanged. xloop100: movdqu xmm0, [esi] movdqu [esi + edi], xmm0 @@ -5747,15 +5715,16 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
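The shuffle rows that follow all reduce to one idea: a 4-byte control vector maps destination channel j to source channel shuffler[j] within each pixel, which pshufb (or vpshufb) applies 16 or 32 bytes at a time. A scalar model (editorial sketch, hypothetical helper name):

#include <stdint.h>

static void ARGBShuffleRow_Model(const uint8_t* src, uint8_t* dst,
                                 const uint8_t* shuffler, int width) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      dst[i * 4 + j] = src[i * 4 + shuffler[j]];
    }
  }
}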
-__declspec(naked) -void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] - mov ecx, [esp + 16] // width + mov ecx, [esp + 16] // width wloop: movdqu xmm0, [eax] @@ -5773,15 +5742,16 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // shuffler - vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. - mov ecx, [esp + 16] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // shuffler + vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. + mov ecx, [esp + 16] // width wloop: vmovdqu ymm0, [eax] @@ -5801,152 +5771,36 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) -void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { - __asm { - push ebx - push esi - mov eax, [esp + 8 + 4] // src_argb - mov edx, [esp + 8 + 8] // dst_argb - mov esi, [esp + 8 + 12] // shuffler - mov ecx, [esp + 8 + 16] // width - pxor xmm5, xmm5 - - mov ebx, [esi] // shuffler - cmp ebx, 0x03000102 - je shuf_3012 - cmp ebx, 0x00010203 - je shuf_0123 - cmp ebx, 0x00030201 - je shuf_0321 - cmp ebx, 0x02010003 - je shuf_2103 - - // TODO(fbarchard): Use one source pointer and 3 offsets. 
- shuf_any1: - movzx ebx, byte ptr [esi] - movzx ebx, byte ptr [eax + ebx] - mov [edx], bl - movzx ebx, byte ptr [esi + 1] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 1], bl - movzx ebx, byte ptr [esi + 2] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 2], bl - movzx ebx, byte ptr [esi + 3] - movzx ebx, byte ptr [eax + ebx] - mov [edx + 3], bl - lea eax, [eax + 4] - lea edx, [edx + 4] - sub ecx, 1 - jg shuf_any1 - jmp shuf99 - - shuf_0123: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB - pshuflw xmm0, xmm0, 01Bh - pshufhw xmm1, xmm1, 01Bh - pshuflw xmm1, xmm1, 01Bh - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0123 - jmp shuf99 - - shuf_0321: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB - pshuflw xmm0, xmm0, 039h - pshufhw xmm1, xmm1, 039h - pshuflw xmm1, xmm1, 039h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_0321 - jmp shuf99 - - shuf_2103: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA - pshuflw xmm0, xmm0, 093h - pshufhw xmm1, xmm1, 093h - pshuflw xmm1, xmm1, 093h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_2103 - jmp shuf99 - - shuf_3012: - movdqu xmm0, [eax] - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm5 - punpckhbw xmm1, xmm5 - pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB - pshuflw xmm0, xmm0, 0C6h - pshufhw xmm1, xmm1, 0C6h - pshuflw xmm1, xmm1, 0C6h - packuswb xmm0, xmm1 - movdqu [edx], xmm0 - lea edx, [edx + 16] - sub ecx, 4 - jg shuf_3012 - - shuf99: - pop esi - pop ebx - ret - } -} - // YUY2 - Macro-pixel = 2 image pixels // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... 
// UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) -void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV + punpcklbw xmm0, xmm2 // YUYV punpckhbw xmm1, xmm2 movdqu [edi], xmm0 movdqu [edi + 16], xmm1 @@ -5960,30 +5814,30 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) -void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_frame, + int width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqu xmm0, [eax] // Y + punpcklbw xmm2, xmm3 // UV + movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -5998,22 +5852,22 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) -void ARGBPolynomialRow_SSE2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* src_argb */ - mov edx, [esp + 4 + 8] /* dst_argb */ - mov esi, [esp + 4 + 12] /* poly */ - mov ecx, [esp + 4 + 16] /* width */ + mov eax, [esp + 4 + 4] /* src_argb */ + mov edx, [esp + 4 + 8] /* dst_argb */ + mov esi, [esp + 4 + 12] /* poly */ + mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
convertloop: -// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel -// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6057,25 +5911,25 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) -void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { - __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ - mov ecx, [esp + 12] /* poly */ - vbroadcastf128 ymm4, [ecx] // C0 +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const float* poly, + int width) { + __asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ + mov ecx, [esp + 12] /* poly */ + vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 vbroadcastf128 ymm6, [ecx + 32] // C2 vbroadcastf128 ymm7, [ecx + 48] // C3 - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ // 2 pixel loop. convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels lea eax, [eax + 8] - vcvtdq2ps ymm0, ymm0 // X 8 floats + vcvtdq2ps ymm0, ymm0 // X 8 floats vmulps ymm2, ymm0, ymm0 // X * X vmulps ymm3, ymm0, ymm7 // C3 * X vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X @@ -6095,16 +5949,125 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 +#ifdef HAS_HALFFLOATROW_SSE2 +static float kExpBias = 1.9259299444e-34f; +__declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + mulss xmm4, kExpBias + pshufd xmm4, xmm4, 0 + pxor xmm5, xmm5 + sub edx, eax + + // 8 pixel loop. + convertloop: + movdqu xmm2, xmmword ptr [eax] // 8 shorts + add eax, 16 + movdqa xmm3, xmm2 + punpcklwd xmm2, xmm5 + cvtdq2ps xmm2, xmm2 // convert 8 ints to floats + punpckhwd xmm3, xmm5 + cvtdq2ps xmm3, xmm3 + mulps xmm2, xmm4 + mulps xmm3, xmm4 + psrld xmm2, 13 + psrld xmm3, 13 + packssdw xmm2, xmm3 + movdqu [eax + edx - 16], xmm2 + sub ecx, 8 + jg convertloop + ret + } +} +#endif // HAS_HALFFLOATROW_SSE2 + +#ifdef HAS_HALFFLOATROW_AVX2 +__declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + movd xmm4, dword ptr [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + + vmulss xmm4, xmm4, kExpBias + vbroadcastss ymm4, xmm4 + vpxor ymm5, ymm5, ymm5 + sub edx, eax + + // 16 pixel loop. + convertloop: + vmovdqu ymm2, [eax] // 16 shorts + add eax, 32 + vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints + vpunpcklwd ymm2, ymm2, ymm5 + vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats + vcvtdq2ps ymm2, ymm2 + vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. 
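  // Editorial note on kExpBias above: an IEEE float is 1 sign, 8 exponent,
  // 23 mantissa bits (bias 127), while a half float is 1/5/10 (bias 15).
  // Multiplying by scale * 2^-112 (1.9259299444e-34f) re-biases the
  // exponent into the half-float range, so shifting the float's bit
  // pattern right by 13 discards the extra mantissa bits and yields the
  // half encoding directly, truncating rather than rounding. The F16C
  // path further below reaches the same result with vcvtps2ph
  // (mode 3 = truncate) and no bias.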
+ vmulps ymm2, ymm2, ymm4 + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm2, ymm2, 13 + vpackssdw ymm2, ymm2, ymm3 + vmovdqu [eax + edx - 32], ymm2 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_AVX2 + +#ifdef HAS_HALFFLOATROW_F16C +__declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, + uint16_t* dst, + float scale, + int width) { + __asm { + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ + vbroadcastss ymm4, [esp + 12] /* scale */ + mov ecx, [esp + 16] /* width */ + sub edx, eax + + // 16 pixel loop. + convertloop: + vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints + vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts + add eax, 32 + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm3, ymm3 + vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 + vmulps ymm3, ymm3, ymm4 + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm3, ymm3, 3 + vmovdqu [eax + edx + 32], xmm2 + vmovdqu [eax + edx + 32 + 16], xmm3 + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_HALFFLOATROW_F16C + #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -__declspec(naked) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { +__declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6131,13 +6094,14 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -__declspec(naked) -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { +__declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, + const uint8_t* table_argb, + int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. convertloop: @@ -6162,27 +6126,28 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -__declspec(naked) -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, - int width, - const uint8* luma, uint32 lumacoeff) { +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, + uint8_t* dst_argb, + int width, + const uint8_t* luma, + uint32_t lumacoeff) { __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ - mov ecx, [esp + 8 + 12] /* width */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff pshufd xmm2, xmm2, 0 pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. 
convertloop: - movdqu xmm0, xmmword ptr [eax] // generate luma ptr + movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3 phaddw xmm0, xmm0 pand xmm0, xmm4 // mask out low bits diff --git a/libs/libvpx/third_party/libyuv/source/scale.cc b/libs/libvpx/third_party/libyuv/source/scale.cc index 36e3fe52813..2cfa1c6cb1c 100644 --- a/libs/libvpx/third_party/libyuv/source/scale.cc +++ b/libs/libvpx/third_party/libyuv/source/scale.cc @@ -33,17 +33,25 @@ static __inline int Abs(int v) { // This is an optimized version for scaling down a plane to 1/2 of // its original size. -static void ScalePlaneDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); + void (*ScaleRowDown2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_C + : ScaleRowDown2Box_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -51,46 +59,63 @@ static void ScalePlaneDown2(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : - ScaleRowDown2Box_Any_NEON); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON + : ScaleRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : - (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : - ScaleRowDown2Box_NEON); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_NEON + : ScaleRowDown2Box_NEON); } } #endif #if defined(HAS_SCALEROWDOWN2_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 : - ScaleRowDown2Box_Any_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSSE3 + : ScaleRowDown2Box_Any_SSSE3); if (IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSSE3 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 : - ScaleRowDown2Box_SSSE3); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_SSSE3 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_SSSE3 + : ScaleRowDown2Box_SSSE3); } } #endif #if defined(HAS_SCALEROWDOWN2_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : - ScaleRowDown2Box_Any_AVX2); + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_AVX2 + : (filtering == kFilterLinear ? 
ScaleRowDown2Linear_Any_AVX2 + : ScaleRowDown2Box_Any_AVX2); if (IS_ALIGNED(dst_width, 32)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : - ScaleRowDown2Box_AVX2); + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_AVX2 + : ScaleRowDown2Box_AVX2); } } #endif -#if defined(HAS_SCALEROWDOWN2_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_DSPR2 : ScaleRowDown2_DSPR2; +#if defined(HAS_SCALEROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_MSA + : ScaleRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_MSA + : (filtering == kFilterLinear + ? ScaleRowDown2Linear_MSA + : ScaleRowDown2Box_MSA); + } } #endif @@ -105,18 +130,25 @@ static void ScalePlaneDown2(int src_width, int src_height, } } -static void ScalePlaneDown2_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown2_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_16_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C : - ScaleRowDown2Box_16_C); + void (*ScaleRowDown2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = + filtering == kFilterNone + ? ScaleRowDown2_16_C + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_C + : ScaleRowDown2Box_16_C); int row_stride = src_stride << 1; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride; // Point to odd rows. src_stride = 0; @@ -124,23 +156,17 @@ static void ScalePlaneDown2_16(int src_width, int src_height, #if defined(HAS_SCALEROWDOWN2_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_16_NEON : - ScaleRowDown2_16_NEON; + ScaleRowDown2 = + filtering ? ScaleRowDown2Box_16_NEON : ScaleRowDown2_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN2_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_16_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 : - ScaleRowDown2Box_16_SSE2); - } -#endif -#if defined(HAS_SCALEROWDOWN2_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_ptr, 4) && - IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown2 = filtering ? - ScaleRowDown2Box_16_DSPR2 : ScaleRowDown2_16_DSPR2; + ScaleRowDown2 = + filtering == kFilterNone + ? ScaleRowDown2_16_SSE2 + : (filtering == kFilterLinear ? ScaleRowDown2Linear_16_SSE2 + : ScaleRowDown2Box_16_SSE2); } #endif @@ -159,24 +185,30 @@ static void ScalePlaneDown2_16(int src_width, int src_height, // This is an optimized version for scaling down a plane to 1/4 of // its original size. 
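Before the 1/4 version below, note the dispatch pattern shared by all of these ScalePlane* functions, shown here in miniature (condensed from the surrounding code, not a new addition): start from the portable C row function, upgrade to the "_Any_" SIMD variant that tolerates any width, and bind the full-width kernel only when the destination width meets its alignment requirement.

  void (*ScaleRowDown)(const uint8_t* src, ptrdiff_t stride, uint8_t* dst,
                       int dst_width) = ScaleRowDown2_C;
#if defined(HAS_SCALEROWDOWN2_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleRowDown = ScaleRowDown2_Any_SSSE3;  // handles ragged widths
    if (IS_ALIGNED(dst_width, 16)) {
      ScaleRowDown = ScaleRowDown2_SSSE3;    // full 16-pixel blocks only
    }
  }
#endif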
-static void ScalePlaneDown4(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown4(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } @@ -184,8 +216,8 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_SSSE3 : ScaleRowDown4_Any_SSSE3; if (IS_ALIGNED(dst_width, 8)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSSE3 : ScaleRowDown4_SSSE3; } @@ -193,19 +225,20 @@ static void ScalePlaneDown4(int src_width, int src_height, #endif #if defined(HAS_SCALEROWDOWN4_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; if (IS_ALIGNED(dst_width, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; } } #endif -#if defined(HAS_SCALEROWDOWN4_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_DSPR2 : ScaleRowDown4_DSPR2; +#if defined(HAS_SCALEROWDOWN4_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_Any_MSA : ScaleRowDown4_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_MSA : ScaleRowDown4_MSA; + } } #endif @@ -219,38 +252,36 @@ static void ScalePlaneDown4(int src_width, int src_height, } } -static void ScalePlaneDown4_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown4_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown4)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) = + void (*ScaleRowDown4)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width) = filtering ? ScaleRowDown4Box_16_C : ScaleRowDown4_16_C; int row_stride = src_stride << 2; + (void)src_width; + (void)src_height; if (!filtering) { src_ptr += src_stride * 2; // Point to row 2. 
src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_16_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_NEON : - ScaleRowDown4_16_NEON; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_NEON : ScaleRowDown4_16_NEON; } #endif #if defined(HAS_SCALEROWDOWN4_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_16_SSE2 : - ScaleRowDown4_16_SSE2; - } -#endif -#if defined(HAS_SCALEROWDOWN4_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(row_stride, 4) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - ScaleRowDown4 = filtering ? - ScaleRowDown4Box_16_DSPR2 : ScaleRowDown4_16_DSPR2; + ScaleRowDown4 = + filtering ? ScaleRowDown4Box_16_SSE2 : ScaleRowDown4_16_SSE2; } #endif @@ -265,18 +296,23 @@ static void ScalePlaneDown4_16(int src_width, int src_height, } // Scale plane down, 3/4 - -static void ScalePlaneDown34(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown34(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_C; @@ -305,6 +341,26 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEROWDOWN34_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_Any_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_MSA; + } + if (dst_width % 48 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MSA; + ScaleRowDown34_1 = ScaleRowDown34_MSA; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MSA; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MSA; + } + } + } +#endif #if defined(HAS_SCALEROWDOWN34_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { @@ -325,19 +381,6 @@ static void ScalePlaneDown34(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -346,8 +389,7 @@ static void ScalePlaneDown34(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -363,17 +405,23 @@ static void ScalePlaneDown34(int src_width, int src_height, } } -static void ScalePlaneDown34_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown34_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown34_0)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown34_1)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown34_0)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown34_1)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown34_0 = ScaleRowDown34_16_C; @@ -404,19 +452,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN34_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 24 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_16_DSPR2; - } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_16_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); @@ -425,8 +460,7 @@ static void ScalePlaneDown34_16(int src_width, int src_height, ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); src_ptr += src_stride; dst_ptr += dst_stride; - ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, - dst_ptr, dst_width); + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, dst_ptr, dst_width); src_ptr += src_stride * 2; dst_ptr += dst_stride; } @@ -442,7 +476,6 @@ static void ScalePlaneDown34_16(int src_width, int src_height, } } - // Scale plane, 3/8 // This is an optimized version for scaling down a plane to 3/8 // of its original size. @@ -458,18 +491,24 @@ static void ScalePlaneDown34_16(int src_width, int src_height, // ggghhhii // Boxes are 3x3, 2x3, 3x2 and 2x2 -static void ScalePlaneDown38(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +static void ScalePlaneDown38(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 
0 : src_stride; assert(dst_width % 3 == 0); + (void)src_width; + (void)src_height; if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; @@ -517,16 +556,23 @@ static void ScalePlaneDown38(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { +#if defined(HAS_SCALEROWDOWN38_MSA) + if (TestCpuFlag(kCpuHasMSA)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_Any_MSA; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_MSA; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MSA; + ScaleRowDown38_2 = ScaleRowDown38_MSA; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MSA; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MSA; + } } } #endif @@ -554,17 +600,23 @@ static void ScalePlaneDown38(int src_width, int src_height, } } -static void ScalePlaneDown38_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +static void ScalePlaneDown38_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int y; - void (*ScaleRowDown38_3)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); - void (*ScaleRowDown38_2)(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width); + void (*ScaleRowDown38_3)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); + void (*ScaleRowDown38_2)(const uint16_t* src_ptr, ptrdiff_t src_stride, + uint16_t* dst_ptr, int dst_width); const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; + (void)src_width; + (void)src_height; assert(dst_width % 3 == 0); if (!filtering) { ScaleRowDown38_3 = ScaleRowDown38_16_C; @@ -595,19 +647,6 @@ static void ScalePlaneDown38_16(int src_width, int src_height, } } #endif -#if defined(HAS_SCALEROWDOWN38_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && (dst_width % 12 == 0) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { - if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_16_DSPR2; - } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_16_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_16_DSPR2; - } - } -#endif for (y = 0; y < dst_height - 2; y += 3) { ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); @@ -634,8 +673,8 @@ static void ScalePlaneDown38_16(int src_width, int src_height, #define MIN1(x) ((x) < 1 ? 
1 : (x)) -static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels(int iboxwidth, const uint16_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -644,8 +683,8 @@ static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { return sum; } -static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { - uint32 sum = 0u; +static __inline uint32_t SumPixels_16(int iboxwidth, const uint32_t* src_ptr) { + uint32_t sum = 0u; int x; assert(iboxwidth > 0); for (x = 0; x < iboxwidth; ++x) { @@ -654,8 +693,12 @@ static __inline uint32 SumPixels_16(int iboxwidth, const uint32* src_ptr) { return sum; } -static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols2_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -666,13 +709,18 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, int ix = x >> 16; x += dx; boxwidth = MIN1((x >> 16) - ix); - *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + *dst_ptr++ = + SumPixels(boxwidth, src_ptr + ix) * scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols2_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int i; int scaletbl[2]; int minboxwidth = dx >> 16; @@ -684,22 +732,32 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, x += dx; boxwidth = MIN1((x >> 16) - ix); *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaletbl[boxwidth - minboxwidth] >> 16; + scaletbl[boxwidth - minboxwidth] >> + 16; } } -static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols0_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int scaleval = 65536 / boxheight; int i; + (void)dx; src_ptr += (x >> 16); for (i = 0; i < dst_width; ++i) { *dst_ptr++ = src_ptr[i] * scaleval >> 16; } } -static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) { +static void ScaleAddCols1_C(int dst_width, + int boxheight, + int x, + int dx, + const uint16_t* src_ptr, + uint8_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -710,8 +768,12 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, } } -static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) { +static void ScaleAddCols1_16_C(int dst_width, + int boxheight, + int x, + int dx, + const uint32_t* src_ptr, + uint16_t* dst_ptr) { int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; @@ -728,10 +790,14 @@ static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, // one pixel of destination using fixed point (16.16) to step // through source, sampling a box of pixel with simple // averaging. 
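[Editor's note] The comment above summarizes the box filter: each destination pixel sums a boxwidth x boxheight window of source pixels and normalizes. A minimal standalone sketch of that idea follows, with hypothetical names and plain division in place of libyuv's precomputed 65536/area reciprocal table (ScaleAddCols2_C above):

#include <stdint.h>

/* Box-average one output row. src_sums holds per-column sums already
 * accumulated over boxheight source rows (see ScaleAddRow in the
 * hunks below); x and dx are 16.16 fixed point. Illustrative sketch,
 * not the optimized libyuv path. */
static void BoxAverageRow(const uint16_t* src_sums, uint8_t* dst,
                          int dst_width, int boxheight, int x, int dx) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    int ix = x >> 16;            /* left column of this box */
    int boxwidth;
    uint32_t sum = 0;
    int k;
    x += dx;
    boxwidth = (x >> 16) - ix;   /* columns covered by this box */
    if (boxwidth < 1) {
      boxwidth = 1;
    }
    for (k = 0; k < boxwidth; ++k) {
      sum += src_sums[ix + k];
    }
    *dst++ = (uint8_t)(sum / (uint32_t)(boxwidth * boxheight));
  }
}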
-static void ScalePlaneBox(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneBox(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -739,18 +805,18 @@ static void ScalePlaneBox(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint16. + // Allocate a row buffer of uint16_t. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint16* src_ptr, uint8* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C: - ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); - void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = - ScaleAddRow_C; + const uint16_t* src_ptr, uint8_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_C + : ((dx != 0x10000) ? ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8_t* src_ptr, uint16_t* dst_ptr, + int src_width) = ScaleAddRow_C; #if defined(HAS_SCALEADDROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleAddRow = ScaleAddRow_Any_SSE2; @@ -775,11 +841,19 @@ static void ScalePlaneBox(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEADDROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleAddRow = ScaleAddRow_Any_MSA; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_MSA; + } + } +#endif for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint8* src = src_ptr + iy * src_stride; + const uint8_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -787,20 +861,24 @@ static void ScalePlaneBox(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row16, 0, src_width * 2); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint16 *)(row16), src_width); + ScaleAddRow(src, (uint16_t*)(row16), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16_t*)(row16), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row16); } } -static void ScalePlaneBox_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneBox_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -808,17 +886,17 @@ static void ScalePlaneBox_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height << 16); - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, + &dx, &dy); src_width = Abs(src_width); { - // Allocate a row buffer of uint32. + // Allocate a row buffer of uint32_t. 
align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, - const uint32* src_ptr, uint16* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_16_C: ScaleAddCols1_16_C; - void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = - ScaleAddRow_16_C; + const uint32_t* src_ptr, uint16_t* dst_ptr) = + (dx & 0xffff) ? ScaleAddCols2_16_C : ScaleAddCols1_16_C; + void (*ScaleAddRow)(const uint16_t* src_ptr, uint32_t* dst_ptr, + int src_width) = ScaleAddRow_16_C; #if defined(HAS_SCALEADDROW_16_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { @@ -829,7 +907,7 @@ static void ScalePlaneBox_16(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int boxheight; int iy = y >> 16; - const uint16* src = src_ptr + iy * src_stride; + const uint16_t* src = src_ptr + iy * src_stride; y += dy; if (y > max_y) { y = max_y; @@ -837,10 +915,10 @@ static void ScalePlaneBox_16(int src_width, int src_height, boxheight = MIN1((y >> 16) - iy); memset(row32, 0, src_width * 4); for (k = 0; k < boxheight; ++k) { - ScaleAddRow(src, (uint32 *)(row32), src_width); + ScaleAddRow(src, (uint32_t*)(row32), src_width); src += src_stride; } - ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint32_t*)(row32), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row32); @@ -848,10 +926,14 @@ static void ScalePlaneBox_16(int src_width, int src_height, } // Scale plane down with bilinear interpolation. -void ScalePlaneBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -864,14 +946,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_C : ScaleFilterCols_C; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -898,16 +980,15 @@ void ScalePlaneBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif - #if defined(HAS_SCALEFILTERCOLS_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { ScaleFilterCols = ScaleFilterCols_SSSE3; @@ -920,6 +1001,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } #endif if (y > max_y) { y = max_y; @@ -927,7 +1016,7 @@ void ScalePlaneBilinearDown(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { @@ -944,10 +1033,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, free_aligned_buffer_64(row); } -void ScalePlaneBilinearDown_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearDown_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -960,14 +1053,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, const int max_y = (src_height - 1) << 16; int j; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleFilterCols64_16_C : ScaleFilterCols_16_C; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1002,15 +1095,6 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(src_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif - #if defined(HAS_SCALEFILTERCOLS_16_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -1023,13 +1107,13 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; if (filtering == kFilterLinear) { ScaleFilterCols(dst_ptr, src, dst_width, x, dx); } else { int yf = (y >> 8) & 255; - InterpolateRow((uint16*)row, src, src_stride, src_width, yf); - ScaleFilterCols(dst_ptr, (uint16*)row, dst_width, x, dx); + InterpolateRow((uint16_t*)row, src, src_stride, src_width, yf); + ScaleFilterCols(dst_ptr, (uint16_t*)row, dst_width, x, dx); } dst_ptr += dst_stride; y += dy; @@ -1041,10 +1125,14 @@ void ScalePlaneBilinearDown_16(int src_width, int src_height, } // Scale up down with bilinear interpolation. -void ScalePlaneBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, +void ScalePlaneBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1053,14 +1141,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleFilterCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, + int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_C : ScaleCols_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -1087,14 +1175,6 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_C; @@ -1111,6 +1191,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 16)) { + ScaleFilterCols = ScaleFilterCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; @@ -1126,13 +1214,13 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } { int yi = y >> 16; - const uint8* src = src_ptr + yi * src_stride; + const uint8_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -1172,10 +1260,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, } } -void ScalePlaneBilinearUp_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr, +void ScalePlaneBilinearUp_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr, enum FilterMode filtering) { int j; // Initial source x/y coordinate and step values as 16.16 fixed point. @@ -1184,14 +1276,14 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, int dx = 0; int dy = 0; const int max_y = (src_height - 1) << 16; - void (*InterpolateRow)(uint16* dst_ptr, const uint16* src_ptr, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; - void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint16_t * dst_ptr, const uint16_t* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; + void (*ScaleFilterCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, + int dst_width, int x, int dx) = filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); #if defined(HAS_INTERPOLATEROW_16_SSE2) @@ -1226,14 +1318,6 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } -#endif if (filtering && src_width >= 32768) { ScaleFilterCols = ScaleFilterCols64_16_C; @@ -1257,13 +1341,13 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, } { int yi = y >> 16; - const uint16* src = src_ptr + yi * src_stride; + const uint16_t* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 4); - uint16* rowptr = (uint16*)row; + uint16_t* rowptr = (uint16_t*)row; int rowstride = kRowSize; int lasty = yi; @@ -1308,20 +1392,24 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScalePlaneSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr) { +static void ScalePlaneSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { int i; - void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = ScaleCols_C; + void (*ScaleCols)(uint8_t * dst_ptr, const uint8_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_C; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1340,20 +1428,24 @@ static void ScalePlaneSimple(int src_width, int src_height, } } -static void ScalePlaneSimple_16(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_ptr, uint16* dst_ptr) { +static void ScalePlaneSimple_16(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_ptr, + uint16_t* dst_ptr) { int i; - void (*ScaleCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = ScaleCols_16_C; + void (*ScaleCols)(uint16_t * dst_ptr, const uint16_t* src_ptr, int dst_width, + int x, int dx) = ScaleCols_16_C; // Initial source x/y coordinate and step values as 16.16 fixed point. 
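[Editor's note] Throughout these scalers, x/y/dx/dy are 16.16 fixed point: 65536 encodes 1.0 source pixel. ScaleSlope (and FixedDiv, used by the dispatcher below) derive the steps from the size ratio; a simplified sketch of the core relation, ignoring the per-filter-mode centering ScaleSlope also performs:

#include <stdint.h>

/* num/div as a 16.16 fixed-point step (hypothetical helper name). */
static int FixedDivSketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

/* Example: stepping through a 640-wide source to make 480 outputs. */
static void StepExample(void) {
  int dx = FixedDivSketch(640, 480);  /* 87381 ~= 1.3333 * 65536 */
  int x = 0;
  int i;
  for (i = 0; i < 480; ++i) {
    int src_col = x >> 16;            /* integer source column */
    int frac = (x >> 8) & 255;        /* 8-bit fraction for filtering */
    (void)src_col;
    (void)frac;
    x += dx;
  }
}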
int x = 0; int y = 0; int dx = 0; int dy = 0; - ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (src_width * 2 == dst_width && x < 0x8000) { @@ -1366,8 +1458,7 @@ static void ScalePlaneSimple_16(int src_width, int src_height, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } @@ -1377,14 +1468,18 @@ static void ScalePlaneSimple_16(int src_width, int src_height, // This function dispatches to a specialized scaler based on scale factor. LIBYUV_API -void ScalePlane(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, +void ScalePlane(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1403,46 +1498,42 @@ void ScalePlane(const uint8* src, int src_stride, if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 - ScalePlaneDown34(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (2 * dst_width == src_width && 2 * dst_height == src_height) { // optimized, 1/2 - ScalePlaneDown2(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } // 3/8 rounded up for odd sized chroma height. 
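[Editor's note] ScalePlane selects its specialized paths with exact integer ratio tests, avoiding floating point; the hunk below also tightens the 3/8 height check from the rounded form (dst_height == (src_height * 3 + 7) / 8) to the exact ratio (8 * dst_height == 3 * src_height). Each check reduces to a cross-multiplication, e.g.:

/* Sketch: dst/src == a/b exactly iff b * dst == a * src. */
static int IsRatio(int dst, int src, int a, int b) {
  return b * dst == a * src;
}
/* 1/2: IsRatio(dst_w, src_w, 1, 2) && IsRatio(dst_h, src_h, 1, 2)
 * 3/4: IsRatio(dst_w, src_w, 3, 4) && IsRatio(dst_h, src_h, 3, 4)
 * 3/8: IsRatio(dst_w, src_w, 3, 8) && IsRatio(dst_h, src_h, 3, 8) */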
- if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 - ScalePlaneDown38(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 - ScalePlaneDown4(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, filtering); return; } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1455,19 +1546,23 @@ void ScalePlane(const uint8* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } LIBYUV_API -void ScalePlane_16(const uint16* src, int src_stride, - int src_width, int src_height, - uint16* dst, int dst_stride, - int dst_width, int dst_height, - enum FilterMode filtering) { +void ScalePlane_16(const uint16_t* src, + int src_stride, + int src_width, + int src_height, + uint16_t* dst, + int dst_stride, + int dst_width, + int dst_height, + enum FilterMode filtering) { // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, filtering); + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, + filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1483,19 +1578,16 @@ void ScalePlane_16(const uint16* src, int src_stride, CopyPlane_16(src, src_stride, dst, dst_stride, dst_width, dst_height); return; } - if (dst_width == src_width) { + if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); // Arbitrary scale vertically, but unscaled vertically. - ScalePlaneVertical_16(src_height, - dst_width, dst_height, - src_stride, dst_stride, src, dst, - 0, 0, dy, 1, filtering); + ScalePlaneVertical_16(src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst, 0, 0, dy, 1, filtering); return; } if (dst_width <= Abs(src_width) && dst_height <= src_height) { // Scale down. - if (4 * dst_width == 3 * src_width && - 4 * dst_height == 3 * src_height) { + if (4 * dst_width == 3 * src_width && 4 * dst_height == 3 * src_height) { // optimized, 3/4 ScalePlaneDown34_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1508,15 +1600,14 @@ void ScalePlane_16(const uint16* src, int src_stride, return; } // 3/8 rounded up for odd sized chroma height. 
- if (8 * dst_width == 3 * src_width && - dst_height == ((src_height * 3 + 7) / 8)) { + if (8 * dst_width == 3 * src_width && 8 * dst_height == 3 * src_height) { // optimized, 3/8 ScalePlaneDown38_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4_16(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1524,8 +1615,8 @@ void ScalePlane_16(const uint16* src, int src_stride, } } if (filtering == kFilterBox && dst_height * 2 < src_height) { - ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneBox_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); return; } if (filtering && dst_height > src_height) { @@ -1538,132 +1629,110 @@ void ScalePlane_16(const uint16* src, int src_stride, src_stride, dst_stride, src, dst, filtering); return; } - ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst); + ScalePlaneSimple_16(src_width, src_height, dst_width, dst_height, src_stride, + dst_stride, src, dst); } // Scale an I420 image. // This function in turn calls a scaling function for each plane. LIBYUV_API -int I420Scale(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, int dst_stride_y, - uint8* dst_u, int dst_stride_u, - uint8* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } LIBYUV_API -int I420Scale_16(const uint16* src_y, int src_stride_y, - const uint16* src_u, int src_stride_u, - const uint16* src_v, int src_stride_v, - int src_width, int 
src_height, - uint16* dst_y, int dst_stride_y, - uint16* dst_u, int dst_stride_u, - uint16* dst_v, int dst_stride_v, - int dst_width, int dst_height, +int I420Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, enum FilterMode filtering) { int src_halfwidth = SUBSAMPLE(src_width, 1, 1); int src_halfheight = SUBSAMPLE(src_height, 1, 1); int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } - ScalePlane_16(src_y, src_stride_y, src_width, src_height, - dst_y, dst_stride_y, dst_width, dst_height, - filtering); - ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, - dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, - filtering); - ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, - dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, - filtering); + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_halfwidth, src_halfheight, dst_u, + dst_stride_u, dst_halfwidth, dst_halfheight, filtering); + ScalePlane_16(src_v, src_stride_v, src_halfwidth, src_halfheight, dst_v, + dst_stride_v, dst_halfwidth, dst_halfheight, filtering); return 0; } // Deprecated api LIBYUV_API -int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, - int src_stride_y, int src_stride_u, int src_stride_v, - int src_width, int src_height, - uint8* dst_y, uint8* dst_u, uint8* dst_v, - int dst_stride_y, int dst_stride_u, int dst_stride_v, - int dst_width, int dst_height, +int Scale(const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + int src_stride_y, + int src_stride_u, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + uint8_t* dst_u, + uint8_t* dst_v, + int dst_stride_y, + int dst_stride_u, + int dst_stride_v, + int dst_width, + int dst_height, LIBYUV_BOOL interpolate) { - return I420Scale(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - src_width, src_height, - dst_y, dst_stride_y, - dst_u, dst_stride_u, - dst_v, dst_stride_v, - dst_width, dst_height, - interpolate ? kFilterBox : kFilterNone); -} - -// Deprecated api -LIBYUV_API -int ScaleOffset(const uint8* src, int src_width, int src_height, - uint8* dst, int dst_width, int dst_height, int dst_yoffset, - LIBYUV_BOOL interpolate) { - // Chroma requires offset to multiple of 2. 
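[Editor's note] The deprecated ScaleOffset wrapper deleted below located the three I420 planes inside one contiguous buffer; half dimensions round up, matching SUBSAMPLE(v, 1, 1) == (v + 1) >> 1. Its plane addressing, restated as a standalone sketch:

#include <stdint.h>
#include <stddef.h>

/* I420 layout: full-size Y, then (w+1)/2 x (h+1)/2 U and V planes. */
static void I420Pointers(const uint8_t* buf, int width, int height,
                         const uint8_t** y, const uint8_t** u,
                         const uint8_t** v) {
  size_t halfwidth = (size_t)((width + 1) >> 1);
  size_t halfheight = (size_t)((height + 1) >> 1);
  *y = buf;
  *u = buf + (size_t)width * (size_t)height;
  *v = *u + halfwidth * halfheight;
}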
- int dst_yoffset_even = dst_yoffset & ~1; - int src_halfwidth = SUBSAMPLE(src_width, 1, 1); - int src_halfheight = SUBSAMPLE(src_height, 1, 1); - int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); - int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); - int aheight = dst_height - dst_yoffset_even * 2; // actual output height - const uint8* src_y = src; - const uint8* src_u = src + src_width * src_height; - const uint8* src_v = src + src_width * src_height + - src_halfwidth * src_halfheight; - uint8* dst_y = dst + dst_yoffset_even * dst_width; - uint8* dst_u = dst + dst_width * dst_height + - (dst_yoffset_even >> 1) * dst_halfwidth; - uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + - (dst_yoffset_even >> 1) * dst_halfwidth; - if (!src || src_width <= 0 || src_height <= 0 || - !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset_even < 0 || - dst_yoffset_even >= dst_height) { - return -1; - } - return I420Scale(src_y, src_width, - src_u, src_halfwidth, - src_v, src_halfwidth, - src_width, src_height, - dst_y, dst_width, - dst_u, dst_halfwidth, - dst_v, dst_halfwidth, - dst_width, aheight, - interpolate ? kFilterBox : kFilterNone); + return I420Scale(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, src_width, src_height, dst_y, dst_stride_y, + dst_u, dst_stride_u, dst_v, dst_stride_v, dst_width, + dst_height, interpolate ? kFilterBox : kFilterNone); } #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/scale_any.cc b/libs/libvpx/third_party/libyuv/source/scale_any.cc index ed76a9e4c04..53ad1364049 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_any.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_any.cc @@ -20,184 +20,429 @@ extern "C" { // Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols #define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - int dst_width, int x, int dx) { \ - int n = dst_width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ - } \ - TERP_C(dst_ptr + n * BPP, src_ptr, \ - dst_width & MASK, x + n * dx, dx); \ - } + void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, int dst_width, int x, \ + int dx) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, r, x + n * dx, dx); \ + } #ifdef HAS_SCALEFILTERCOLS_NEON CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) #endif +#ifdef HAS_SCALEFILTERCOLS_MSA +CANY(ScaleFilterCols_Any_MSA, ScaleFilterCols_MSA, ScaleFilterCols_C, 1, 15) +#endif #ifdef HAS_SCALEARGBCOLS_NEON CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) #endif +#ifdef HAS_SCALEARGBCOLS_MSA +CANY(ScaleARGBCols_Any_MSA, ScaleARGBCols_MSA, ScaleARGBCols_C, 4, 3) +#endif #ifdef HAS_SCALEARGBFILTERCOLS_NEON -CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, - ScaleARGBFilterCols_C, 4, 3) +CANY(ScaleARGBFilterCols_Any_NEON, + ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_MSA +CANY(ScaleARGBFilterCols_Any_MSA, + ScaleARGBFilterCols_MSA, + ScaleARGBFilterCols_C, + 4, + 7) #endif #undef CANY // Fixed scale down. 
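[Editor's note] The CANY wrapper above and the SDANY/SDODD definitions below generate the "Any" kernels: the SIMD routine handles the largest prefix its width constraint allows, and the C routine finishes the remainder. Expanded as an ordinary function with hypothetical names (note that in the macros FACTOR is pasted textually, so a ratio like 4 / 3 evaluates as n * 4 / 3 inside the expansion; this sketch takes a plain integer factor):

#include <stdint.h>
#include <stddef.h>

typedef void (*ScaleRowFn)(const uint8_t* src, ptrdiff_t src_stride,
                           uint8_t* dst, int dst_width);

/* Sketch of the SDANY expansion; modulo is used because, as the new
 * comment notes, MASK + 1 need not be a power of two. */
static void ScaleRowDownAny(ScaleRowFn simd, ScaleRowFn c_row,
                            int factor, int bpp, int mask,
                            const uint8_t* src, ptrdiff_t stride,
                            uint8_t* dst, int dst_width) {
  int r = (int)((unsigned int)dst_width % (unsigned int)(mask + 1));
  int n = dst_width - r;          /* SIMD-sized prefix */
  if (n > 0) {
    simd(src, stride, dst, n);
  }
  c_row(src + (ptrdiff_t)(n * factor) * bpp, stride, dst + n * bpp, r);
}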
+// Mask may be non-power of 2, so use MOD #define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); /* NOLINT */ \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } // Fixed scale down for odd source width. Used by I420Blend subsampling. // Since dst_width is (width + 1) / 2, this function scales one less pixel // and copies the last pixel. #define SDODD(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ - dst_ptr + n * BPP, r); \ - } + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, \ + int dst_width) { \ + int r = (int)((unsigned int)(dst_width - 1) % (MASK + 1)); /* NOLINT */ \ + int n = (dst_width - 1) - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r + 1); \ + } #ifdef HAS_SCALEROWDOWN2_SSSE3 SDANY(ScaleRowDown2_Any_SSSE3, ScaleRowDown2_SSSE3, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_SSSE3, ScaleRowDown2Linear_SSSE3, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_SSSE3, ScaleRowDown2Box_SSSE3, ScaleRowDown2Box_C, - 2, 1, 15) -SDODD(ScaleRowDown2Box_Odd_SSSE3, ScaleRowDown2Box_SSSE3, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSSE3, + ScaleRowDown2Linear_SSSE3, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_SSSE3, + ScaleRowDown2Box_SSSE3, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN2_AVX2 SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) -SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, - ScaleRowDown2Linear_C, 2, 1, 31) -SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, - 2, 1, 31) -SDODD(ScaleRowDown2Box_Odd_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_Odd_C, - 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, + ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_C, + 2, + 1, + 31) +SDODD(ScaleRowDown2Box_Odd_AVX2, + ScaleRowDown2Box_AVX2, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN2_NEON SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) -SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, - ScaleRowDown2Linear_C, 2, 1, 15) -SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_C, 2, 1, 15) 
-SDODD(ScaleRowDown2Box_Odd_NEON, ScaleRowDown2Box_NEON, - ScaleRowDown2Box_Odd_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, + ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, + 2, + 1, + 15) +SDANY(ScaleRowDown2Box_Any_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, + 2, + 1, + 15) +SDODD(ScaleRowDown2Box_Odd_NEON, + ScaleRowDown2Box_NEON, + ScaleRowDown2Box_Odd_C, + 2, + 1, + 15) +#endif +#ifdef HAS_SCALEROWDOWN2_MSA +SDANY(ScaleRowDown2_Any_MSA, ScaleRowDown2_MSA, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_MSA, + ScaleRowDown2Linear_MSA, + ScaleRowDown2Linear_C, + 2, + 1, + 31) +SDANY(ScaleRowDown2Box_Any_MSA, + ScaleRowDown2Box_MSA, + ScaleRowDown2Box_C, + 2, + 1, + 31) #endif #ifdef HAS_SCALEROWDOWN4_SSSE3 SDANY(ScaleRowDown4_Any_SSSE3, ScaleRowDown4_SSSE3, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_SSSE3, ScaleRowDown4Box_SSSE3, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSSE3, + ScaleRowDown4Box_SSSE3, + ScaleRowDown4Box_C, + 4, + 1, + 7) #endif #ifdef HAS_SCALEROWDOWN4_AVX2 SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) -SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, - 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, + ScaleRowDown4Box_AVX2, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN4_NEON SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) -SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, - 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, + ScaleRowDown4Box_NEON, + ScaleRowDown4Box_C, + 4, + 1, + 7) +#endif +#ifdef HAS_SCALEROWDOWN4_MSA +SDANY(ScaleRowDown4_Any_MSA, ScaleRowDown4_MSA, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_MSA, + ScaleRowDown4Box_MSA, + ScaleRowDown4Box_C, + 4, + 1, + 15) #endif #ifdef HAS_SCALEROWDOWN34_SSSE3 -SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_SSSE3, + ScaleRowDown34_SSSE3, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, + ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, + ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) #endif #ifdef HAS_SCALEROWDOWN34_NEON -SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, - ScaleRowDown34_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, - ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) -SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON, - ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_Any_NEON, + ScaleRowDown34_NEON, + ScaleRowDown34_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, + ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, + ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 23) +#endif +#ifdef HAS_SCALEROWDOWN34_MSA +SDANY(ScaleRowDown34_Any_MSA, + ScaleRowDown34_MSA, + ScaleRowDown34_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_0_Box_Any_MSA, + ScaleRowDown34_0_Box_MSA, + ScaleRowDown34_0_Box_C, + 4 / 3, + 1, + 47) +SDANY(ScaleRowDown34_1_Box_Any_MSA, + ScaleRowDown34_1_Box_MSA, + ScaleRowDown34_1_Box_C, + 4 / 3, + 1, + 47) #endif #ifdef HAS_SCALEROWDOWN38_SSSE3 
-SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) -SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_Any_SSSE3, + ScaleRowDown38_SSSE3, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, + ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, + ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 5) #endif #ifdef HAS_SCALEROWDOWN38_NEON -SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, - ScaleRowDown38_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, - ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) -SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, - ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_Any_NEON, + ScaleRowDown38_NEON, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, + ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, + ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) +#endif +#ifdef HAS_SCALEROWDOWN38_MSA +SDANY(ScaleRowDown38_Any_MSA, + ScaleRowDown38_MSA, + ScaleRowDown38_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_3_Box_Any_MSA, + ScaleRowDown38_3_Box_MSA, + ScaleRowDown38_3_Box_C, + 8 / 3, + 1, + 11) +SDANY(ScaleRowDown38_2_Box_Any_MSA, + ScaleRowDown38_2_Box_MSA, + ScaleRowDown38_2_Box_C, + 8 / 3, + 1, + 11) #endif #ifdef HAS_SCALEARGBROWDOWN2_SSE2 -SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, - ScaleARGBRowDown2_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, - ScaleARGBRowDown2Linear_C, 2, 4, 3) -SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, - ScaleARGBRowDown2Box_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2_Any_SSE2, + ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, + ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, + ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWN2_NEON -SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, - ScaleARGBRowDown2_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, - ScaleARGBRowDown2Linear_C, 2, 4, 7) -SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, - ScaleARGBRowDown2Box_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2_Any_NEON, + ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, + ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, + ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, + 2, + 4, + 7) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_MSA +SDANY(ScaleARGBRowDown2_Any_MSA, + ScaleARGBRowDown2_MSA, + ScaleARGBRowDown2_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Linear_Any_MSA, + ScaleARGBRowDown2Linear_MSA, + ScaleARGBRowDown2Linear_C, + 2, + 4, + 3) +SDANY(ScaleARGBRowDown2Box_Any_MSA, + ScaleARGBRowDown2Box_MSA, + ScaleARGBRowDown2Box_C, + 2, + 4, + 3) #endif #undef SDANY // Scale down by even scale factor. 
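[Editor's note] SDAANY (next hunk) wraps the stepping kernels, which take an extra src_stepx; the C behavior they fall back to is point sampling every src_stepx-th pixel. A sketch for ARGB treated as 32-bit pixels (the real kernels operate on uint8_t with BPP 4, and the box variants average a 2x2 neighborhood at each sample instead):

#include <stdint.h>
#include <stddef.h>

/* Take every src_stepx-th ARGB pixel from one row. Hypothetical sketch. */
static void ArgbRowDownEven(const uint32_t* src, int src_stepx,
                            uint32_t* dst, int dst_width) {
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst[i] = src[(ptrdiff_t)i * src_stepx];
  }
}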
-#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ - void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ - uint8* dst_ptr, int dst_width) { \ - int r = (int)((unsigned int)dst_width % (MASK + 1)); \ - int n = dst_width - r; \ - if (n > 0) { \ - SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ - } \ - SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ - src_stepx, dst_ptr + n * BPP, r); \ - } +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8_t* dst_ptr, int dst_width) { \ + int r = dst_width & MASK; \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, src_stepx, \ + dst_ptr + n * BPP, r); \ + } #ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 -SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_SSE2, + ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, + ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif #ifdef HAS_SCALEARGBROWDOWNEVEN_NEON -SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, - ScaleARGBRowDownEven_C, 4, 3) -SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, - ScaleARGBRowDownEvenBox_C, 4, 3) +SDAANY(ScaleARGBRowDownEven_Any_NEON, + ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, + ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, + 4, + 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_MSA +SDAANY(ScaleARGBRowDownEven_Any_MSA, + ScaleARGBRowDownEven_MSA, + ScaleARGBRowDownEven_C, + 4, + 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_MSA, + ScaleARGBRowDownEvenBox_MSA, + ScaleARGBRowDownEvenBox_C, + 4, + 3) #endif // Add rows box filter scale down. 
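[Editor's note] SAANY (below) wraps the row accumulators the box scaler uses: each call widens one row of 8-bit pixels and adds it into a uint16_t sum row, which ScaleAddCols later normalizes. The C behavior, sketched:

#include <stdint.h>

/* Accumulate one source row into the running column sums. */
static void ScaleAddRowSketch(const uint8_t* src, uint16_t* dst,
                              int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst[x] = (uint16_t)(dst[x] + src[x]);
  }
}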
-#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ - void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ - int n = src_width & ~MASK; \ - if (n > 0) { \ - SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ - } \ - SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ - } +#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } #ifdef HAS_SCALEADDROW_SSE2 SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) @@ -208,14 +453,12 @@ SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) #ifdef HAS_SCALEADDROW_NEON SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) #endif +#ifdef HAS_SCALEADDROW_MSA +SAANY(ScaleAddRow_Any_MSA, ScaleAddRow_MSA, ScaleAddRow_C, 15) +#endif #undef SAANY #ifdef __cplusplus } // extern "C" } // namespace libyuv #endif - - - - - diff --git a/libs/libvpx/third_party/libyuv/source/scale_argb.cc b/libs/libvpx/third_party/libyuv/source/scale_argb.cc index 17f51ae9bf8..53a22e8b41e 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_argb.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_argb.cc @@ -30,20 +30,31 @@ static __inline int Abs(int v) { // ScaleARGB ARGB, 1/2 // This is an optimized version for scaling down a ARGB to 1/2 of // its original size. -static void ScaleARGBDown2(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDown2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = - filtering == kFilterNone ? ScaleARGBRowDown2_C : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C : - ScaleARGBRowDown2Box_C); - assert(dx == 65536 * 2); // Test scale factor of 2. + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + filtering == kFilterNone + ? ScaleARGBRowDown2_C + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C + : ScaleARGBRowDown2Box_C); + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 2); // Test scale factor of 2. assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2. // Advance to odd row, even column. if (filtering == kFilterBilinear) { @@ -54,25 +65,49 @@ static void ScaleARGBDown2(int src_width, int src_height, #if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : - ScaleARGBRowDown2Box_Any_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 + : ScaleARGBRowDown2Box_Any_SSE2); if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : - (filtering == kFilterLinear ? 
ScaleARGBRowDown2Linear_SSE2 : - ScaleARGBRowDown2Box_SSE2); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_SSE2 + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 + : ScaleARGBRowDown2Box_SSE2); } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : - ScaleARGBRowDown2Box_Any_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON + : ScaleARGBRowDown2Box_Any_NEON); if (IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : - ScaleARGBRowDown2Box_NEON); + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_NEON + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON + : ScaleARGBRowDown2Box_NEON); + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWN2_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_Any_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_MSA + : ScaleARGBRowDown2Box_Any_MSA); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = + filtering == kFilterNone + ? ScaleARGBRowDown2_MSA + : (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_MSA + : ScaleARGBRowDown2Box_MSA); } } #endif @@ -90,21 +125,32 @@ static void ScaleARGBDown2(int src_width, int src_height, // ScaleARGB ARGB, 1/4 // This is an optimized version for scaling down a ARGB to 1/4 of // its original size. -static void ScaleARGBDown4Box(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBDown4Box(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); - void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C; + void (*ScaleARGBRowDown2)(const uint8_t* src_argb, ptrdiff_t src_stride, + uint8_t* dst_argb, int dst_width) = + ScaleARGBRowDown2Box_C; // Advance to odd row, even column. src_argb += (y >> 16) * src_stride + (x >> 16) * 4; - assert(dx == 65536 * 4); // Test scale factor of 4. + (void)src_width; + (void)src_height; + (void)dx; + assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
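[Editor's note] ScaleARGBDown4Box (this hunk) gets an exact 1/4 by running the 1/2 box kernel twice through a two-row scratch buffer: two half-width rows from the source, then one more pass across those. The underlying 2x2 box average per ARGB channel, as a rounded-average sketch (the exact rounding in ScaleARGBRowDown2Box_C is an assumption here):

#include <stdint.h>
#include <stddef.h>

/* Average each 2x2 block of ARGB bytes into one output pixel. */
static void ArgbRowDown2BoxSketch(const uint8_t* src, ptrdiff_t src_stride,
                                  uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int c;
    for (c = 0; c < 4; ++c) {   /* B, G, R, A */
      int sum = src[c] + src[c + 4] +
                src[src_stride + c] + src[src_stride + c + 4];
      dst[c] = (uint8_t)((sum + 2) >> 2);
    }
    src += 8;                   /* two source pixels consumed */
    dst += 4;                   /* one destination pixel produced */
  }
}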
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { @@ -125,8 +171,8 @@ static void ScaleARGBDown4Box(int src_width, int src_height, for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); - ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, - row + kRowSize, dst_width * 2); + ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, row + kRowSize, + dst_width * 2); ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width); src_argb += row_stride; dst_argb += dst_stride; @@ -137,38 +183,57 @@ static void ScaleARGBDown4Box(int src_width, int src_height, // ScaleARGB ARGB Even // This is an optimized version for scaling down a ARGB to even // multiple of its original size. -static void ScaleARGBDownEven(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBDownEven(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; int col_step = dx >> 16; int row_stride = (dy >> 16) * src_stride; - void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride, - int src_step, uint8* dst_argb, int dst_width) = + void (*ScaleARGBRowDownEven)(const uint8_t* src_argb, ptrdiff_t src_stride, + int src_step, uint8_t* dst_argb, int dst_width) = filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C; + (void)src_width; + (void)src_height; assert(IS_ALIGNED(src_width, 2)); assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : - ScaleARGBRowDownEven_Any_SSE2; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 + : ScaleARGBRowDownEven_Any_SSE2; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : - ScaleARGBRowDownEven_SSE2; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_SSE2 : ScaleARGBRowDownEven_SSE2; } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON : - ScaleARGBRowDownEven_Any_NEON; + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON + : ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; + } + } +#endif +#if defined(HAS_SCALEARGBROWDOWNEVEN_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_MSA + : ScaleARGBRowDownEven_Any_MSA; if (IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : - ScaleARGBRowDownEven_NEON; + ScaleARGBRowDownEven = + filtering ? ScaleARGBRowDownEvenBox_MSA : ScaleARGBRowDownEven_MSA; } } #endif @@ -184,25 +249,32 @@ static void ScaleARGBDownEven(int src_width, int src_height, } // Scale ARGB down with bilinear interpolation. 
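[Editor's note] ScaleARGBBilinearDown (next hunk) filters vertically with InterpolateRow before sampling columns; the blend weight is the middle 8 bits of the 16.16 y coordinate, (y >> 8) & 255, as the callers in this file compute. A sketch in the spirit of InterpolateRow_C (the rounding term is an assumption):

#include <stdint.h>
#include <stddef.h>

/* Blend two adjacent source rows: weight y_fraction/256 on the lower
 * row, the remainder on the upper row. */
static void InterpRowSketch(uint8_t* dst, const uint8_t* src,
                            ptrdiff_t src_stride, int width,
                            int y_fraction) {  /* 0..255 */
  const uint8_t* src1 = src + src_stride;
  int w1 = y_fraction;
  int w0 = 256 - w1;
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * w0 + src1[x] * w1 + 128) >> 8);
  }
}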
-static void ScaleARGBBilinearDown(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearDown(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C; - int64 xlast = x + (int64)(dst_width - 1) * dx; - int64 xl = (dx >= 0) ? x : xlast; - int64 xr = (dx >= 0) ? xlast : x; + int64_t xlast = x + (int64_t)(dst_width - 1) * dx; + int64_t xl = (dx >= 0) ? x : xlast; + int64_t xr = (dx >= 0) ? xlast : x; int clip_src_width; - xl = (xl >> 16) & ~3; // Left edge aligned. - xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. + xl = (xl >> 16) & ~3; // Left edge aligned. + xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels. xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel. if (xr > src_width) { xr = src_width; @@ -234,12 +306,11 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(clip_src_width, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(clip_src_width, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif @@ -255,6 +326,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } #endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. @@ -267,7 +346,7 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; if (filtering == kFilterLinear) { ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx); } else { @@ -286,18 +365,25 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, } // Scale ARGB up with bilinear interpolation. 
-static void ScaleARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy, +static void ScaleARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; const int max_y = (src_height - 1) << 16; #if defined(HAS_INTERPOLATEROW_SSSE3) @@ -324,15 +410,17 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -347,6 +435,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -359,6 +455,14 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -375,13 +479,13 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, { int yi = y >> 16; - const uint8* src = src_argb + yi * src_stride; + const uint8_t* src = src_argb + yi * src_stride; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -423,24 +527,27 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, #ifdef YUVSCALEUP // Scale YUV to ARGB up with bilinear interpolation. 
-static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, - int dst_width, int dst_height, +static void ScaleYUVToARGBBilinearUp(int src_width, + int src_height, + int dst_width, + int dst_height, int src_stride_y, int src_stride_u, int src_stride_v, int dst_stride_argb, - const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_argb, - int x, int dx, int y, int dy, + const uint8_t* src_y, + const uint8_t* src_u, + const uint8_t* src_v, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy, enum FilterMode filtering) { int j; - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; + void (*I422ToARGBRow)(const uint8_t* y_buf, const uint8_t* u_buf, + const uint8_t* v_buf, uint8_t* rgb_buf, int width) = + I422ToARGBRow_C; #if defined(HAS_I422TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; @@ -465,19 +572,18 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_I422TOARGBROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && IS_ALIGNED(src_width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && - IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_DSPR2; +#if defined(HAS_I422TOARGBROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + I422ToARGBRow = I422ToARGBRow_Any_MSA; + if (IS_ALIGNED(src_width, 8)) { + I422ToARGBRow = I422ToARGBRow_MSA; + } } #endif - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; #if defined(HAS_INTERPOLATEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { InterpolateRow = InterpolateRow_Any_SSSE3; @@ -502,19 +608,21 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + InterpolateRow = InterpolateRow_MSA; + } } #endif - void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBFilterCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C; if (src_width >= 32768) { - ScaleARGBFilterCols = filtering ? - ScaleARGBFilterCols64_C : ScaleARGBCols64_C; + ScaleARGBFilterCols = + filtering ? 
ScaleARGBFilterCols64_C : ScaleARGBCols64_C; } #if defined(HAS_SCALEARGBFILTERCOLS_SSSE3) if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { @@ -529,6 +637,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, } } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_MSA) + if (filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_MSA; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_MSA; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; @@ -541,6 +657,14 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (!filtering && TestCpuFlag(kCpuHasMSA)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBCols_MSA; + } + } #endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; @@ -558,9 +682,9 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate. int yi = y >> 16; int uv_yi = yi >> kYShift; - const uint8* src_row_y = src_y + yi * src_stride_y; - const uint8* src_row_u = src_u + uv_yi * src_stride_u; - const uint8* src_row_v = src_v + uv_yi * src_stride_v; + const uint8_t* src_row_y = src_y + yi * src_stride_y; + const uint8_t* src_row_u = src_u + uv_yi * src_stride_u; + const uint8_t* src_row_v = src_v + uv_yi * src_stride_v; // Allocate 2 rows of ARGB. const int kRowSize = (dst_width * 4 + 31) & ~31; @@ -569,7 +693,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // Allocate 1 row of ARGB for source conversion. align_buffer_64(argb_row, src_width * 4); - uint8* rowptr = row; + uint8_t* rowptr = row; int rowstride = kRowSize; int lasty = yi; @@ -635,15 +759,23 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, // of x and dx is the integer part of the source position and // the lower 16 bits are the fixed decimal part. -static void ScaleARGBSimple(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int dx, int y, int dy) { +static void ScaleARGBSimple(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int dx, + int y, + int dy) { int j; - void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) = + void (*ScaleARGBCols)(uint8_t * dst_argb, const uint8_t* src_argb, + int dst_width, int x, int dx) = (src_width >= 32768) ? 
ScaleARGBCols64_C : ScaleARGBCols_C; + (void)src_height; #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBCols = ScaleARGBCols_SSE2; @@ -656,6 +788,14 @@ static void ScaleARGBSimple(int src_width, int src_height, ScaleARGBCols = ScaleARGBCols_NEON; } } +#endif +#if defined(HAS_SCALEARGBCOLS_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ScaleARGBCols = ScaleARGBCols_Any_MSA; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBCols = ScaleARGBCols_MSA; + } + } #endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; @@ -667,8 +807,8 @@ static void ScaleARGBSimple(int src_width, int src_height, } for (j = 0; j < dst_height; ++j) { - ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride, dst_width, x, + dx); dst_argb += dst_stride; y += dy; } @@ -677,11 +817,18 @@ static void ScaleARGBSimple(int src_width, int src_height, // ScaleARGB an ARGB image. // This function in turn calls a scaling function // suitable for handling the desired resolutions. -static void ScaleARGB(const uint8* src, int src_stride, - int src_width, int src_height, - uint8* dst, int dst_stride, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +static void ScaleARGB(const uint8_t* src, + int src_stride, + int src_width, + int src_height, + uint8_t* dst, + int dst_stride, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; @@ -690,8 +837,7 @@ static void ScaleARGB(const uint8* src, int src_stride, int dy = 0; // ARGB does not support box filter yet, but allow the user to pass it. // Simplify filtering when possible. - filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, + filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height, filtering); // Negative src_height means invert the image. @@ -700,17 +846,17 @@ static void ScaleARGB(const uint8* src, int src_stride, src = src + (src_height - 1) * src_stride; src_stride = -src_stride; } - ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, - &x, &y, &dx, &dy); + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, + &dx, &dy); src_width = Abs(src_width); if (clip_x) { - int64 clipf = (int64)(clip_x) * dx; + int64_t clipf = (int64_t)(clip_x)*dx; x += (clipf & 0xffff); src += (clipf >> 16) * 4; dst += clip_x * 4; } if (clip_y) { - int64 clipf = (int64)(clip_y) * dy; + int64_t clipf = (int64_t)(clip_y)*dy; y += (clipf & 0xffff); src += (clipf >> 16) * src_stride; dst += clip_y * dst_stride; @@ -725,24 +871,20 @@ static void ScaleARGB(const uint8* src, int src_stride, if (!(dx & 0x10000) && !(dy & 0x10000)) { if (dx == 0x20000) { // Optimized 1/2 downsample. - ScaleARGBDown2(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDown2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (dx == 0x40000 && filtering == kFilterBox) { // Optimized 1/4 box downsample.
- ScaleARGBDown4Box(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy); + ScaleARGBDown4Box(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy); return; } - ScaleARGBDownEven(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBDownEven(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } // Optimized odd scale down. i.e. 3, 5, 7, 9x. @@ -759,96 +901,105 @@ static void ScaleARGB(const uint8* src, int src_stride, } if (dx == 0x10000 && (x & 0xffff) == 0) { // Arbitrary scale vertically, but unscaled horizontally. - ScalePlaneVertical(src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, y, dy, 4, filtering); + ScalePlaneVertical(src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, y, dy, 4, filtering); return; } if (filtering && dy < 65536) { - ScaleARGBBilinearUp(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBBilinearUp(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } if (filtering) { - ScaleARGBBilinearDown(src_width, src_height, - clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy, filtering); + ScaleARGBBilinearDown(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst, x, dx, y, dy, + filtering); return; } - ScaleARGBSimple(src_width, src_height, clip_width, clip_height, - src_stride, dst_stride, src, dst, - x, dx, y, dy); + ScaleARGBSimple(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst, x, dx, y, dy); } LIBYUV_API -int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int ARGBScaleClip(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - !dst_argb || dst_width <= 0 || dst_height <= 0 || - clip_x < 0 || clip_y < 0 || + if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || + dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || clip_width > 32768 || clip_height > 32768 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, clip_width, + clip_height, filtering); return 0; } // Scale an ARGB image.
LIBYUV_API -int ARGBScale(const uint8* src_argb, int src_stride_argb, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - int dst_width, int dst_height, +int ARGBScale(const uint8_t* src_argb, + int src_stride_argb, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + int dst_width, + int dst_height, enum FilterMode filtering) { - if (!src_argb || src_width == 0 || src_height == 0 || - src_width > 32768 || src_height > 32768 || - !dst_argb || dst_width <= 0 || dst_height <= 0) { + if (!src_argb || src_width == 0 || src_height == 0 || src_width > 32768 || + src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } - ScaleARGB(src_argb, src_stride_argb, src_width, src_height, - dst_argb, dst_stride_argb, dst_width, dst_height, - 0, 0, dst_width, dst_height, filtering); + ScaleARGB(src_argb, src_stride_argb, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, 0, 0, dst_width, dst_height, + filtering); return 0; } // Scale with YUV conversion to ARGB and clipping. LIBYUV_API -int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint32 src_fourcc, - int src_width, int src_height, - uint8* dst_argb, int dst_stride_argb, - uint32 dst_fourcc, - int dst_width, int dst_height, - int clip_x, int clip_y, int clip_width, int clip_height, +int YUVToARGBScaleClip(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint32_t src_fourcc, + int src_width, + int src_height, + uint8_t* dst_argb, + int dst_stride_argb, + uint32_t dst_fourcc, + int dst_width, + int dst_height, + int clip_x, + int clip_y, + int clip_width, + int clip_height, enum FilterMode filtering) { - uint8* argb_buffer = (uint8*)malloc(src_width * src_height * 4); + uint8_t* argb_buffer = (uint8_t*)malloc(src_width * src_height * 4); int r; - I420ToARGB(src_y, src_stride_y, - src_u, src_stride_u, - src_v, src_stride_v, - argb_buffer, src_width * 4, - src_width, src_height); - - r = ARGBScaleClip(argb_buffer, src_width * 4, - src_width, src_height, - dst_argb, dst_stride_argb, - dst_width, dst_height, - clip_x, clip_y, clip_width, clip_height, - filtering); + (void)src_fourcc; // TODO(fbarchard): implement and/or assert. 
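For context while reading this hunk, a hypothetical caller of the ARGBScale() entry point shown above; the dimensions, the tightly packed strides (width * 4 bytes), and the bilinear filter choice are illustrative assumptions by the editor, not part of the change:

#include <stdint.h>
#include <stdlib.h>
#include "libyuv/scale_argb.h" /* pulls in FilterMode via libyuv/scale.h */

int main(void) {
  enum { kSrcW = 64, kSrcH = 64, kDstW = 32, kDstH = 32 };
  uint8_t* src = (uint8_t*)calloc((size_t)kSrcW * kSrcH * 4, 1); /* ARGB */
  uint8_t* dst = (uint8_t*)malloc((size_t)kDstW * kDstH * 4);
  int r = -1;
  if (src && dst) {
    /* Strides are in bytes; halve a 64x64 image to 32x32 with filtering. */
    r = ARGBScale(src, kSrcW * 4, kSrcW, kSrcH, dst, kDstW * 4, kDstW, kDstH,
                  kFilterBilinear);
  }
  free(src);
  free(dst);
  return r; /* 0 on success, -1 per the argument checks shown above */
}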
+ (void)dst_fourcc; + I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, + argb_buffer, src_width * 4, src_width, src_height); + + r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, src_height, dst_argb, + dst_stride_argb, dst_width, dst_height, clip_x, clip_y, + clip_width, clip_height, filtering); free(argb_buffer); return r; } diff --git a/libs/libvpx/third_party/libyuv/source/scale_common.cc b/libs/libvpx/third_party/libyuv/source/scale_common.cc index 3507aa4d9ff..b28d7da41fc 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_common.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_common.cc @@ -28,9 +28,12 @@ static __inline int Abs(int v) { } // CPU agnostic row functions -void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown2_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -42,9 +45,12 @@ void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[1]; dst[1] = src_ptr[3]; @@ -56,10 +62,13 @@ void ScaleRowDown2_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; +void ScaleRowDown2Linear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -71,10 +80,13 @@ void ScaleRowDown2Linear_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { - const uint16* s = src_ptr; +void ScaleRowDown2Linear_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + 1) >> 1; dst[1] = (s[2] + s[3] + 1) >> 1; @@ -86,10 +98,12 @@ void ScaleRowDown2Linear_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown2Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -103,10 +117,12 @@ void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown2Box_Odd_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; dst_width -= 1; for (x = 0; x < dst_width - 1; x += 2) { @@ -125,10 +141,12 @@ void 
ScaleRowDown2Box_Odd_C(const uint8* src_ptr, ptrdiff_t src_stride, dst[0] = (s[0] + t[0] + 1) >> 1; } -void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown2Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (s[0] + s[1] + t[0] + t[1] + 2) >> 2; @@ -142,9 +160,12 @@ void ScaleRowDown2Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -156,9 +177,12 @@ void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src_ptr[2]; dst[1] = src_ptr[6]; @@ -170,81 +194,88 @@ void ScaleRowDown4_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown4Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + 
src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown4Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown4Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { intptr_t stride = src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; dst[1] = (src_ptr[4] + src_ptr[5] + src_ptr[6] + src_ptr[7] + - src_ptr[stride + 4] + src_ptr[stride + 5] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7] + - src_ptr[stride * 3 + 4] + src_ptr[stride * 3 + 5] + - src_ptr[stride * 3 + 6] + src_ptr[stride * 3 + 7] + - 8) >> 4; + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride + 6] + + src_ptr[stride + 7] + src_ptr[stride * 2 + 4] + + src_ptr[stride * 2 + 5] + src_ptr[stride * 2 + 6] + + src_ptr[stride * 2 + 7] + src_ptr[stride * 3 + 4] + + src_ptr[stride * 3 + 5] + src_ptr[stride * 3 + 6] + + src_ptr[stride * 3 + 7] + 8) >> + 4; dst += 2; src_ptr += 8; } if (dst_width & 1) { dst[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[3] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride + 3] + - src_ptr[stride * 2 + 0] + src_ptr[stride * 2 + 1] + - src_ptr[stride * 2 + 2] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 3 + 0] + src_ptr[stride * 3 + 1] + - src_ptr[stride * 3 + 2] + src_ptr[stride * 3 + 3] + - 8) >> 4; + src_ptr[stride + 0] + src_ptr[stride + 1] + src_ptr[stride + 2] + + src_ptr[stride + 3] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2] + + src_ptr[stride * 2 + 3] + src_ptr[stride * 3 + 0] + + src_ptr[stride * 3 + 1] + src_ptr[stride * 3 + 2] + + src_ptr[stride * 3 + 3] + 8) >> + 4; } } -void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown34_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -255,9 +286,12 @@ void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown34_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { int x; + (void)src_stride; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x 
+= 3) { dst[0] = src_ptr[0]; @@ -269,19 +303,21 @@ void ScaleRowDown34_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 0 and 1 together, 3 : 1 -void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown34_0_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -291,19 +327,21 @@ void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown34_0_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 * 3 + b0 + 2) >> 2; d[1] = (a1 * 3 + b1 + 2) >> 2; d[2] = (a2 * 3 + b2 + 2) >> 2; @@ -314,19 +352,21 @@ void ScaleRowDown34_0_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Filter rows 1 and 2 together, 1 : 1 -void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - const uint8* s = src_ptr; - const uint8* t = src_ptr + src_stride; +void ScaleRowDown34_1_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint8 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint8 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint8 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint8 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint8 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint8 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint8_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint8_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint8_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint8_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint8_t b1 = (t[1] * 1 + t[2] * 1 + 1) 
>> 1; + uint8_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -336,19 +376,21 @@ void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* d, int dst_width) { - const uint16* s = src_ptr; - const uint16* t = src_ptr + src_stride; +void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* d, + int dst_width) { + const uint16_t* s = src_ptr; + const uint16_t* t = src_ptr + src_stride; int x; assert((dst_width % 3 == 0) && (dst_width > 0)); for (x = 0; x < dst_width; x += 3) { - uint16 a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; - uint16 a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; - uint16 a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; - uint16 b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; - uint16 b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; - uint16 b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; + uint16_t a0 = (s[0] * 3 + s[1] * 1 + 2) >> 2; + uint16_t a1 = (s[1] * 1 + s[2] * 1 + 1) >> 1; + uint16_t a2 = (s[2] * 1 + s[3] * 3 + 2) >> 2; + uint16_t b0 = (t[0] * 3 + t[1] * 1 + 2) >> 2; + uint16_t b1 = (t[1] * 1 + t[2] * 1 + 1) >> 1; + uint16_t b2 = (t[2] * 1 + t[3] * 3 + 2) >> 2; d[0] = (a0 + b0 + 1) >> 1; d[1] = (a1 + b1 + 1) >> 1; d[2] = (a2 + b2 + 1) >> 1; @@ -359,8 +401,11 @@ void ScaleRowDown34_1_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // Scales a single row of pixels using point sampling. -void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -374,8 +419,11 @@ void ScaleCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[0] = src_ptr[x >> 16]; @@ -390,9 +438,14 @@ void ScaleCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -403,9 +456,14 @@ void ScaleColsUp2_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleColsUp2_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst_ptr[1] = dst_ptr[0] = src_ptr[0]; src_ptr += 1; @@ -418,16 +476,19 @@ void ScaleColsUp2_16_C(uint16* dst_ptr, const uint16* src_ptr, // (1-f)a + fb can be replaced with a + f(b-a) #if defined(__arm__) || defined(__aarch64__) -#define BLENDER(a, b, f) (uint8)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) #else -// inteluses 7 bit math with rounding. 
-#define BLENDER(a, b, f) (uint8)((int)(a) + \ - (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) +// Intel uses 7 bit math with rounding. +#define BLENDER(a, b, f) \ + (uint8_t)((int)(a) + (((int)((f) >> 9) * ((int)(b) - (int)(a)) + 0x40) >> 7)) #endif -void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -450,12 +511,15 @@ void ScaleFilterCols_C(uint8* dst_ptr, const uint8* src_ptr, } } -void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); +void ScaleFilterCols64_C(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -468,7 +532,7 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -476,12 +540,15 @@ void ScaleFilterCols64_C(uint8* dst_ptr, const uint8* src_ptr, } #undef BLENDER -// Same as 8 bit arm blender but return is cast to uint16 -#define BLENDER(a, b, f) (uint16)((int)(a) + \ - ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) +// Same as 8 bit arm blender but return is cast to uint16_t +#define BLENDER(a, b, f) \ + (uint16_t)((int)(a) + ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) -void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x, + int dx) { int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; @@ -504,12 +571,15 @@ void ScaleFilterCols_16_C(uint16* dst_ptr, const uint16* src_ptr, } } -void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); +void ScaleFilterCols64_16_C(uint16_t* dst_ptr, + const uint16_t* src_ptr, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -522,7 +592,7 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, dst_ptr += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int a = src_ptr[xi]; int b = src_ptr[xi + 1]; dst_ptr[0] = BLENDER(a, b, x & 0xffff); @@ -530,9 +600,12 @@ void ScaleFilterCols64_16_C(uint16* dst_ptr, const uint16* src_ptr, } #undef BLENDER -void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { +void ScaleRowDown38_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -543,9 +616,12 @@ void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst, int dst_width) { +void ScaleRowDown38_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int 
dst_width) { int x; + (void)src_stride; assert(dst_width % 3 == 0); for (x = 0; x < dst_width; x += 3) { dst[0] = src_ptr[0]; @@ -557,100 +633,118 @@ void ScaleRowDown38_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } // 8x3 -> 3x1 -void ScaleRowDown38_3_Box_C(const uint8* src_ptr, +void ScaleRowDown38_3_Box_C(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_3_Box_16_C(const uint16* src_ptr, +void ScaleRowDown38_3_Box_16_C(const uint16_t* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { + uint16_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + - src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * - (65536 / 9) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + - src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * - (65536 / 9) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7] + - src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * - (65536 / 6) >> 16; + dst_ptr[0] = + (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2] + src_ptr[stride * 2 + 0] + + src_ptr[stride * 2 + 1] + src_ptr[stride * 2 + 2]) * + (65536 / 9) >> + 16; + dst_ptr[1] = + (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5] + src_ptr[stride * 2 + 3] + + src_ptr[stride * 2 + 4] + src_ptr[stride * 2 + 5]) * + (65536 / 9) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7] + + src_ptr[stride * 2 + 6] + src_ptr[stride * 2 + 7]) * + (65536 / 6) >> + 16; src_ptr += 8; dst_ptr += 3; } } // 8x2 -> 3x1 -void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int 
dst_width) { +void ScaleRowDown38_2_Box_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int dst_width) { +void ScaleRowDown38_2_Box_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int dst_width) { intptr_t stride = src_stride; int i; assert((dst_width % 3 == 0) && (dst_width > 0)); for (i = 0; i < dst_width; i += 3) { - dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + - src_ptr[stride + 0] + src_ptr[stride + 1] + - src_ptr[stride + 2]) * (65536 / 6) >> 16; - dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + - src_ptr[stride + 3] + src_ptr[stride + 4] + - src_ptr[stride + 5]) * (65536 / 6) >> 16; - dst_ptr[2] = (src_ptr[6] + src_ptr[7] + - src_ptr[stride + 6] + src_ptr[stride + 7]) * - (65536 / 4) >> 16; + dst_ptr[0] = (src_ptr[0] + src_ptr[1] + src_ptr[2] + src_ptr[stride + 0] + + src_ptr[stride + 1] + src_ptr[stride + 2]) * + (65536 / 6) >> + 16; + dst_ptr[1] = (src_ptr[3] + src_ptr[4] + src_ptr[5] + src_ptr[stride + 3] + + src_ptr[stride + 4] + src_ptr[stride + 5]) * + (65536 / 6) >> + 16; + dst_ptr[2] = + (src_ptr[6] + src_ptr[7] + src_ptr[stride + 6] + src_ptr[stride + 7]) * + (65536 / 4) >> + 16; src_ptr += 8; dst_ptr += 3; } } -void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -664,7 +758,9 @@ void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { } } -void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { +void ScaleAddRow_16_C(const uint16_t* src_ptr, + uint32_t* dst_ptr, + int src_width) { int x; assert(src_width > 0); for (x = 0; x < src_width - 1; x += 2) { @@ -678,13 +774,14 @@ void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { } } -void ScaleARGBRowDown2_C(const uint8* src_argb, +void ScaleARGBRowDown2_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); - + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int x; + (void)src_stride; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[1]; dst[1] = src[3]; @@ -696,10 +793,12 @@ void ScaleARGBRowDown2_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Linear_C(const 
uint8* src_argb, +void ScaleARGBRowDown2Linear_C(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { + uint8_t* dst_argb, + int dst_width) { int x; + (void)src_stride; for (x = 0; x < dst_width; ++x) { dst_argb[0] = (src_argb[0] + src_argb[4] + 1) >> 1; dst_argb[1] = (src_argb[1] + src_argb[5] + 1) >> 1; @@ -710,29 +809,37 @@ void ScaleARGBRowDown2Linear_C(const uint8* src_argb, } } -void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDown2Box_C(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += 8; dst_argb += 4; } } -void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEven_C(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); - + uint8_t* dst_argb, + int dst_width) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + (void)src_stride; int x; for (x = 0; x < dst_width - 1; x += 2) { dst[0] = src[0]; @@ -745,30 +852,38 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t src_stride, } } -void ScaleARGBRowDownEvenBox_C(const uint8* src_argb, +void ScaleARGBRowDownEvenBox_C(const uint8_t* src_argb, ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { + uint8_t* dst_argb, + int dst_width) { int x; for (x = 0; x < dst_width; ++x) { - dst_argb[0] = (src_argb[0] + src_argb[4] + - src_argb[src_stride] + src_argb[src_stride + 4] + 2) >> 2; - dst_argb[1] = (src_argb[1] + src_argb[5] + - src_argb[src_stride + 1] + src_argb[src_stride + 5] + 2) >> 2; - dst_argb[2] = (src_argb[2] + src_argb[6] + - src_argb[src_stride + 2] + src_argb[src_stride + 6] + 2) >> 2; - dst_argb[3] = (src_argb[3] + src_argb[7] + - src_argb[src_stride + 3] + src_argb[src_stride + 7] + 2) >> 2; + dst_argb[0] = (src_argb[0] + src_argb[4] + src_argb[src_stride] + + src_argb[src_stride + 4] + 2) >> + 2; + dst_argb[1] = (src_argb[1] + src_argb[5] + src_argb[src_stride + 1] + + src_argb[src_stride + 5] + 2) >> + 2; + dst_argb[2] = (src_argb[2] + src_argb[6] + src_argb[src_stride + 2] + + src_argb[src_stride + 6] + 2) >> + 2; + dst_argb[3] = (src_argb[3] + src_argb[7] + src_argb[src_stride + 3] + + src_argb[src_stride + 7] + 2) >> + 2; src_argb += src_stepx * 4; dst_argb += 4; } } // Scales a single row of pixels using point sampling. 
-void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -782,11 +897,14 @@ void ScaleARGBCols_C(uint8* dst_argb, const uint8* src_argb, } } -void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { dst[0] = src[x >> 16]; @@ -801,11 +919,16 @@ void ScaleARGBCols64_C(uint8* dst_argb, const uint8* src_argb, } // Scales a single row of pixels up by 2x using point sampling. -void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBColsUp2_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; + (void)x; + (void)dx; for (j = 0; j < dst_width - 1; j += 2) { dst[1] = dst[0] = src[0]; src += 1; @@ -818,23 +941,26 @@ void ScaleARGBColsUp2_C(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. 
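The TODO above concerns the weights used in the macros that follow: the 16.16 fraction is reduced to 7 bits (f = (x >> 9) & 0x7f), and because f < 128, the expression 0x7f ^ f equals 127 - f, so each channel is blended with weights that sum to 127 rather than 128. A standalone restatement of that per-channel blend, written here for illustration and not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* (a * (127 - f) + b * f) >> 7, the arithmetic BLENDER1 performs per channel. */
static uint8_t blend_channel(uint8_t a, uint8_t b, int f) {
  return (uint8_t)(((int)a * (0x7f ^ f) + (int)b * f) >> 7);
}

int main(void) {
  printf("%d %d %d\n",
         blend_channel(0, 255, 0),     /* 0: f == 0 keeps pixel a */
         blend_channel(0, 255, 0x40),  /* 127: the midpoint */
         blend_channel(0, 255, 0x7f)); /* 253, not 255: weights sum to 127 */
  return 0;
}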
// Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 -#define BLENDERC(a, b, f, s) (uint32)( \ - BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) -#define BLENDER(a, b, f) \ - BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | \ - BLENDERC(a, b, f, 8) | BLENDERC(a, b, f, 0) +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDERC(a, b, f, s) \ + (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) +#define BLENDER(a, b, f) \ + BLENDERC(a, b, f, 24) | BLENDERC(a, b, f, 16) | BLENDERC(a, b, f, 8) | \ + BLENDERC(a, b, f, 0) -void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBFilterCols_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -848,23 +974,26 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, if (dst_width & 1) { int xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } -void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x32, int dx) { - int64 x = (int64)(x32); - const uint32* src = (const uint32*)(src_argb); - uint32* dst = (uint32*)(dst_argb); +void ScaleARGBFilterCols64_C(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x32, + int dx) { + int64_t x = (int64_t)(x32); + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); int j; for (j = 0; j < dst_width - 1; j += 2) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); x += dx; xi = x >> 16; @@ -876,10 +1005,10 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, dst += 2; } if (dst_width & 1) { - int64 xi = x >> 16; + int64_t xi = x >> 16; int xf = (x >> 9) & 0x7f; - uint32 a = src[xi]; - uint32 b = src[xi + 1]; + uint32_t a = src[xi]; + uint32_t b = src[xi + 1]; dst[0] = BLENDER(a, b, xf); } } @@ -889,16 +1018,22 @@ void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, // Scale plane vertically with bilinear interpolation. void ScalePlaneVertical(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_argb, uint8* dst_argb, - int x, int y, int dy, - int bpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_argb, + uint8_t* dst_argb, + int x, + int y, + int dy, + int bpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher bpp. int dst_width_bytes = dst_width * bpp; - void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_C; + void (*InterpolateRow)(uint8_t * dst_argb, const uint8_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; const int max_y = (src_height > 1) ? 
((src_height - 1) << 16) - 1 : 0; int j; assert(bpp >= 1 && bpp <= 4); @@ -930,13 +1065,11 @@ void ScalePlaneVertical(int src_height, } } #endif -#if defined(HAS_INTERPOLATEROW_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_DSPR2; +#if defined(HAS_INTERPOLATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + InterpolateRow = InterpolateRow_Any_MSA; + if (IS_ALIGNED(dst_width_bytes, 32)) { + InterpolateRow = InterpolateRow_MSA; } } #endif @@ -948,23 +1081,29 @@ void ScalePlaneVertical(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_bytes, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_bytes, yf); dst_argb += dst_stride; y += dy; } } void ScalePlaneVertical_16(int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint16* src_argb, uint16* dst_argb, - int x, int y, int dy, - int wpp, enum FilterMode filtering) { + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint16_t* src_argb, + uint16_t* dst_argb, + int x, + int y, + int dy, + int wpp, + enum FilterMode filtering) { // TODO(fbarchard): Allow higher wpp. int dst_width_words = dst_width * wpp; - void (*InterpolateRow)(uint16* dst_argb, const uint16* src_argb, - ptrdiff_t src_stride, int dst_width, int source_y_fraction) = - InterpolateRow_16_C; + void (*InterpolateRow)(uint16_t * dst_argb, const uint16_t* src_argb, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_16_C; const int max_y = (src_height > 1) ? ((src_height - 1) << 16) - 1 : 0; int j; assert(wpp >= 1 && wpp <= 2); @@ -1003,16 +1142,6 @@ void ScalePlaneVertical_16(int src_height, InterpolateRow = InterpolateRow_16_NEON; } } -#endif -#if defined(HAS_INTERPOLATEROW_16_DSPR2) - if (TestCpuFlag(kCpuHasDSPR2) && - IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) { - InterpolateRow = InterpolateRow_Any_16_DSPR2; - if (IS_ALIGNED(dst_width_bytes, 4)) { - InterpolateRow = InterpolateRow_16_DSPR2; - } - } #endif for (j = 0; j < dst_height; ++j) { int yi; @@ -1022,16 +1151,18 @@ void ScalePlaneVertical_16(int src_height, } yi = y >> 16; yf = filtering ? ((y >> 8) & 255) : 0; - InterpolateRow(dst_argb, src_argb + yi * src_stride, - src_stride, dst_width_words, yf); + InterpolateRow(dst_argb, src_argb + yi * src_stride, src_stride, + dst_width_words, yf); dst_argb += dst_stride; y += dy; } } // Simplify the filtering based on scale factors. -enum FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, +enum FilterMode ScaleFilterReduce(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering) { if (src_width < 0) { src_width = -src_width; @@ -1073,22 +1204,26 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height, // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div) { - return (int)(((int64)(num) << 16) / div); + return (int)(((int64_t)(num) << 16) / div); } // Divide num by div and return as 16.16 fixed point result. 
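// Worked example (illustrative, not upstream code) of the 16.16 division in
// FixedDiv_C above; the quotient is the scale step consumed by ScaleSlope
// below. Function name and values are hypothetical.
#include <assert.h>
#include <stdint.h>

static int fixed_div_sketch(int num, int div) {
  return (int)(((int64_t)(num) << 16) / div);  // same formula as FixedDiv_C
}

static void fixed_div_examples(void) {
  assert(fixed_div_sketch(3, 2) == 0x18000);      // 1.5 in 16.16
  assert(fixed_div_sketch(640, 320) == 0x20000);  // dx = 2.0: every 2nd pixel
}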
int FixedDiv1_C(int num, int div) { - return (int)((((int64)(num) << 16) - 0x00010001) / - (div - 1)); + return (int)((((int64_t)(num) << 16) - 0x00010001) / (div - 1)); } #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) // Compute slope values for stepping. -void ScaleSlope(int src_width, int src_height, - int dst_width, int dst_height, +void ScaleSlope(int src_width, + int src_height, + int dst_width, + int dst_height, enum FilterMode filtering, - int* x, int* y, int* dx, int* dy) { + int* x, + int* y, + int* dx, + int* dy) { assert(x != NULL); assert(y != NULL); assert(dx != NULL); @@ -1120,7 +1255,7 @@ void ScaleSlope(int src_width, int src_height, *x = 0; } if (dst_height <= src_height) { - *dy = FixedDiv(src_height, dst_height); + *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_height > 1) { *dy = FixedDiv1(src_height, dst_height); @@ -1153,6 +1288,35 @@ void ScaleSlope(int src_width, int src_height, } #undef CENTERSTART +// Read 8x2 upsample with filtering and write 16x1. +// actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_C(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + const uint16_t* src2 = src_ptr + src_stride; + + int x; + for (x = 0; x < dst_width - 1; x += 2) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + dst[1] = (p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4; + ++src_ptr; + ++src2; + dst += 2; + } + if (dst_width & 1) { + uint16_t p0 = src_ptr[0]; + uint16_t p1 = src_ptr[1]; + uint16_t p2 = src2[0]; + uint16_t p3 = src2[1]; + dst[0] = (p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/libs/libvpx/third_party/libyuv/source/scale_gcc.cc b/libs/libvpx/third_party/libyuv/source/scale_gcc.cc index e2f88544b7f..312236d2df8 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_gcc.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_gcc.cc @@ -21,1296 +21,1348 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
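// Worked example (illustrative) of the 9:3:3:1 kernel in ScaleRowUp2_16_C
// above: the two outputs sample the 2x2 neighborhood at 1/4 and 3/4 phase,
// the weights sum to 16, and the +8 rounds before the >> 4. Values and the
// function name are hypothetical.
#include <assert.h>
#include <stdint.h>

static void up2_16_kernel_example(void) {
  uint16_t p0 = 100, p1 = 200;  // top row pair
  uint16_t p2 = 100, p3 = 200;  // bottom row pair
  uint16_t d0 = (uint16_t)((p0 * 9 + p1 * 3 + p2 * 3 + p3 + 8) >> 4);
  uint16_t d1 = (uint16_t)((p0 * 3 + p1 * 9 + p2 + p3 * 3 + 8) >> 4);
  assert(d0 == 125 && d1 == 175);  // 1/4 and 3/4 of the 100..200 gradient
}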
-static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // GCC versions of row functions are verbatim conversions from Visual C. 
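// A minimal scalar model (illustrative) of the pmulhuw reciprocal trick
// behind kScaleAc33 and kScaleAb2 above: pmulhuw keeps the high 16 bits of
// a u16*u16 product, so multiplying a box sum by 65536/N and taking the
// high half approximates division by N without a divide instruction.
#include <assert.h>
#include <stdint.h>

static uint16_t box_div_sketch(uint16_t sum, int n) {
  return (uint16_t)(((uint32_t)sum * (uint32_t)(65536 / n)) >> 16);
}

static void box_div_example(void) {
  // 65536/9 truncates to 7281, so the result can land one low for some sums.
  assert(box_div_sketch(1800, 9) == 199);  // exact quotient would be 200
  assert(box_div_sketch(300, 3) == 99);    // 65536/3 = 21845, also truncated
}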
// Generated using gcc disassembly on Visual C object file: // objdump -D yuvscaler.obj >yuvscaler.txt -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10, 0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); +void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "packuswb %%xmm4,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "psrlw $0x1,%%xmm0 \n" - "psrlw $0x1,%%xmm1 \n" - "pavgw %%xmm5,%%xmm0 \n" - "pavgw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " 
MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "packuswb %%xmm4,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "psrlw $0x1,%%xmm0 \n" + "psrlw $0x1,%%xmm1 \n" + "pavgw %%xmm5,%%xmm0 \n" + "pavgw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #ifdef HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20, 0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); +void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb 
%%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - "vpsrlw $0x1,%%ymm0,%%ymm0 \n" - "vpsrlw $0x1,%%ymm1,%%ymm1 \n" - "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" - "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vpsrlw $0x1,%%ymm0,%%ymm0 \n" + "vpsrlw $0x1,%%ymm1,%%ymm1 \n" + "vpavgw %%ymm5,%%ymm0,%%ymm0 \n" + "vpavgw %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEROWDOWN2_AVX2 -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - "pslld $0x10,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" 
- "packuswb %%xmm1,%%xmm0 \n" - "psrlw $0x8,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { intptr_t stridex3; - asm volatile ( - "pcmpeqb %%xmm4,%%xmm4 \n" - "psrlw $0xf,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "packuswb %%xmm4,%%xmm4 \n" - "psllw $0x3,%%xmm5 \n" - "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 - "pmaddubsw %%xmm4,%%xmm0 \n" - "pmaddubsw %%xmm4,%%xmm1 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,4,2,xmm2) // movdqu (%0,%4,2),%%xmm2 - MEMOPREG(movdqu,0x10,0,4,2,xmm3) // movdqu 0x10(%0,%4,2),%%xmm3 - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pmaddubsw %%xmm4,%%xmm2 \n" - "pmaddubsw %%xmm4,%%xmm3 \n" - "paddw %%xmm2,%%xmm0 \n" - "paddw %%xmm3,%%xmm1 \n" - "phaddw %%xmm1,%%xmm0 \n" - "paddw %%xmm5,%%xmm0 \n" - "psrlw $0x4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "=&r"(stridex3) // %3 - : "r"((intptr_t)(src_stride)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); + asm volatile( + "pcmpeqb %%xmm4,%%xmm4 \n" + "psrlw $0xf,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "packuswb %%xmm4,%%xmm4 \n" + "psllw $0x3,%%xmm5 \n" + "lea 0x00(%4,%4,2),%3 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%4,1),%%xmm2 \n" + "movdqu 0x10(%0,%4,1),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%4,2),%%xmm2 \n" + "movdqu 0x10(%0,%4,2),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 
0x20(%0),%0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "paddw %%xmm2,%%xmm0 \n" + "paddw %%xmm3,%%xmm1 \n" + "phaddw %%xmm1,%%xmm0 \n" + "paddw %%xmm5,%%xmm0 \n" + "psrlw $0x4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "=&r"(stridex3) // %3 + : "r"((intptr_t)(src_stride)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } - #ifdef HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpsrld $0x18,%%ymm5,%%ymm5 \n" - "vpslld $0x10,%%ymm5,%%ymm5 \n" - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpand %%ymm5,%%ymm0,%%ymm0 \n" - "vpand %%ymm5,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm5" - ); +void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpsrld $0x18,%%ymm5,%%ymm5 \n" + "vpslld $0x10,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpand %%ymm5,%%ymm0,%%ymm0 \n" + "vpand %%ymm5,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm5"); } -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" - "vpsrlw $0xf,%%ymm4,%%ymm4 \n" - "vpsllw $0x3,%%ymm4,%%ymm5 \n" - "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" - "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" - "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,3,2,ymm2) // vmovdqu (%0,%3,2),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,3,2,ymm3) // vmovdqu 0x20(%0,%3,2),%%ymm3 - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" - MEMOPREG(vmovdqu,0x00,0,4,1,ymm2) // vmovdqu (%0,%4,1),%%ymm2 - MEMOPREG(vmovdqu,0x20,0,4,1,ymm3) // vmovdqu 0x20(%0,%4,1),%%ymm3 - "lea " MEMLEA(0x40,0) ",%0 \n" - "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" - "vpaddw 
%%ymm3,%%ymm1,%%ymm1 \n" - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" - "vpsrlw $0x4,%%ymm0,%%ymm0 \n" - "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vmovdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(src_stride * 3)) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $0xf,%%ymm4,%%ymm4 \n" + "vpsllw $0x3,%%ymm4,%%ymm5 \n" + "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x00(%0,%3,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,1),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%3,2),%%ymm2 \n" + "vmovdqu 0x20(%0,%3,2),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vmovdqu 0x00(%0,%4,1),%%ymm2 \n" + "vmovdqu 0x20(%0,%4,1),%%ymm3 \n" + "lea 0x40(%0),%0 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" + "vpsrlw $0x4,%%ymm0,%%ymm0 \n" + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(src_stride * 3)) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #endif // HAS_SCALEROWDOWN4_AVX2 -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm3 \n" - "movdqa %1,%%xmm4 \n" - "movdqa %2,%%xmm5 \n" - : - : "m"(kShuf0), // %0 - "m"(kShuf1), // %1 - "m"(kShuf2) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "palignr $0x8,%%xmm0,%%xmm1 \n" - "pshufb %%xmm3,%%xmm0 \n" - "pshufb %%xmm4,%%xmm1 \n" - "pshufb %%xmm5,%%xmm2 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movq %%xmm1," MEMACCESS2(0x8,1) " \n" - "movq %%xmm2," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" - ); +void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %0,%%xmm3 \n" + "movdqa %1,%%xmm4 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kShuf0), // %0 + "m"(kShuf1), // %1 + "m"(kShuf2) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 
0x10(%0),%%xmm2 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "palignr $0x8,%%xmm0,%%xmm1 \n" + "pshufb %%xmm3,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm5,%%xmm2 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,0x8(%1) \n" + "movq %%xmm2,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + 
"packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" // kShuf01 - "movdqa %1,%%xmm3 \n" // kShuf11 - "movdqa %2,%%xmm4 \n" // kShuf21 - : - : "m"(kShuf01), // %0 - "m"(kShuf11), // %1 - "m"(kShuf21) // %2 - ); - asm volatile ( - "movdqa %0,%%xmm5 \n" // kMadd01 - "movdqa %1,%%xmm0 \n" // kMadd11 - "movdqa %2,%%xmm1 \n" // kRound34 - : - : "m"(kMadd01), // %0 - "m"(kMadd11), // %1 - "m"(kRound34) // %2 - ); - - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "pmaddubsw %%xmm5,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS(1) " \n" - "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "pmaddubsw %%xmm0,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x8,1) " \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n" - MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm6,%%xmm7 \n" - "pavgb %%xmm7,%%xmm6 \n" - "pshufb %%xmm4,%%xmm6 \n" - "pmaddubsw %4,%%xmm6 \n" - "paddsw %%xmm1,%%xmm6 \n" - "psrlw $0x2,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movq %%xmm6," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x18,1) ",%1 \n" - "sub $0x18,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "m"(kMadd21) // %4 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" // kShuf01 + "movdqa %1,%%xmm3 \n" // kShuf11 + "movdqa %2,%%xmm4 \n" // kShuf21 + : + : "m"(kShuf01), // %0 + "m"(kShuf11), // %1 + "m"(kShuf21) // %2 + ); + asm volatile( + "movdqa %0,%%xmm5 \n" // kMadd01 + "movdqa %1,%%xmm0 \n" // kMadd11 + "movdqa %2,%%xmm1 \n" // kRound34 + : + : "m"(kMadd01), // %0 + "m"(kMadd11), // %1 + "m"(kRound34) // %2 + ); + + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm6 \n" + "movdqu 0x00(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm5,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,(%1) \n" + "movdqu 0x8(%0),%%xmm6 \n" + "movdqu 0x8(%0,%3,1),%%xmm7 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "pmaddubsw %%xmm0,%%xmm6 \n" + "paddsw %%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x8(%1) \n" + "movdqu 0x10(%0),%%xmm6 \n" + "movdqu 0x10(%0,%3,1),%%xmm7 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "pshufb %%xmm4,%%xmm6 \n" + "pmaddubsw %4,%%xmm6 \n" + "paddsw 
%%xmm1,%%xmm6 \n" + "psrlw $0x2,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movq %%xmm6,0x10(%1) \n" + "lea 0x18(%1),%1 \n" + "sub $0x18,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "m"(kMadd21) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "movhlps %%xmm0,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x8,1) " \n" - "lea " MEMLEA(0xc,1) ",%1 \n" - "sub $0xc,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "m"(kShuf38a), // %3 - "m"(kShuf38b) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5" - ); +void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "paddusb %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movhlps %%xmm0,%%xmm1 \n" + "movd %%xmm1,0x8(%1) \n" + "lea 0xc(%1),%1 \n" + "sub $0xc,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kShuf38a), // %3 + "m"(kShuf38b) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"); } -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "movdqa %3,%%xmm5 \n" - : - : "m"(kShufAb0), // %0 - "m"(kShufAb1), // %1 - "m"(kShufAb2), // %2 - "m"(kScaleAb2) // %3 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1 - "lea " MEMLEA(0x10,0) ",%0 \n" - "pavgb %%xmm1,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "pshufb %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "pshufb %%xmm3,%%xmm6 \n" - "paddusw %%xmm6,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "paddusw %%xmm0,%%xmm1 \n" - "pmulhuw %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm1 \n" - "movd %%xmm1," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "movdqa %3,%%xmm5 \n" + : + : "m"(kShufAb0), // %0 + "m"(kShufAb1), // %1 + "m"(kShufAb2), // %2 + "m"(kScaleAb2) // %3 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "pshufb %%xmm3,%%xmm6 \n" + "paddusw 
%%xmm6,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "paddusw %%xmm0,%%xmm1 \n" + "pmulhuw %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,(%1) \n" + "psrlq $0x10,%%xmm1 \n" + "movd %%xmm1,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, +void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movdqa %0,%%xmm2 \n" - "movdqa %1,%%xmm3 \n" - "movdqa %2,%%xmm4 \n" - "pxor %%xmm5,%%xmm5 \n" - : - : "m"(kShufAc), // %0 - "m"(kShufAc3), // %1 - "m"(kScaleAc33) // %2 - ); - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6 - "movhlps %%xmm0,%%xmm1 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm0 \n" - "punpcklbw %%xmm5,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6 - "lea " MEMLEA(0x10,0) ",%0 \n" - "movhlps %%xmm6,%%xmm7 \n" - "punpcklbw %%xmm5,%%xmm6 \n" - "punpcklbw %%xmm5,%%xmm7 \n" - "paddusw %%xmm6,%%xmm0 \n" - "paddusw %%xmm7,%%xmm1 \n" - "movdqa %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "psrldq $0x2,%%xmm0 \n" - "paddusw %%xmm0,%%xmm6 \n" - "pshufb %%xmm2,%%xmm6 \n" - "movdqa %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "psrldq $0x2,%%xmm1 \n" - "paddusw %%xmm1,%%xmm7 \n" - "pshufb %%xmm3,%%xmm7 \n" - "paddusw %%xmm7,%%xmm6 \n" - "pmulhuw %%xmm4,%%xmm6 \n" - "packuswb %%xmm6,%%xmm6 \n" - "movd %%xmm6," MEMACCESS(1) " \n" - "psrlq $0x10,%%xmm6 \n" - "movd %%xmm6," MEMACCESS2(0x2,1) " \n" - "lea " MEMLEA(0x6,1) ",%1 \n" - "sub $0x6,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movdqa %0,%%xmm2 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + : + : "m"(kShufAc), // %0 + "m"(kShufAc3), // %1 + "m"(kScaleAc33) // %2 + ); + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%3,1),%%xmm6 \n" + "movhlps %%xmm0,%%xmm1 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqu 0x00(%0,%3,2),%%xmm6 \n" + "lea 0x10(%0),%0 \n" + "movhlps %%xmm6,%%xmm7 \n" + "punpcklbw %%xmm5,%%xmm6 \n" + "punpcklbw %%xmm5,%%xmm7 \n" + "paddusw %%xmm6,%%xmm0 \n" + "paddusw %%xmm7,%%xmm1 \n" + "movdqa %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "psrldq $0x2,%%xmm0 \n" + "paddusw %%xmm0,%%xmm6 \n" + "pshufb %%xmm2,%%xmm6 \n" + "movdqa %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "psrldq $0x2,%%xmm1 \n" + "paddusw %%xmm1,%%xmm7 \n" + "pshufb %%xmm3,%%xmm7 \n" + "paddusw %%xmm7,%%xmm6 \n" + "pmulhuw %%xmm4,%%xmm6 \n" + "packuswb %%xmm6,%%xmm6 \n" + "movd %%xmm6,(%1) \n" + "psrlq $0x10,%%xmm6 \n" + "movd %%xmm6,0x2(%1) \n" + "lea 0x6(%1),%1 \n" + "sub $0x6,%2 
\n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 16xN bytes and produces 16 shorts at a time. -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "pxor %%xmm5,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,1) ",%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "pxor %%xmm5,%%xmm5 \n" + + // 16 pixel loop. + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } - #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { - asm volatile ( - "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - - LABELALIGN - "1: \n" - "vmovdqu " MEMACCESS(0) ",%%ymm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw " MEMACCESS(1) ",%%ymm2,%%ymm0 \n" - "vpaddusw " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0," MEMACCESS(1) " \n" - "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" - "lea " MEMLEA(0x40,1) ",%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_width) // %2 - : - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" - ); +void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { + asm volatile( + + "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_width) // %2 + : + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"); } #endif // HAS_SCALEADDROW_AVX2 // Constant for making pixels signed to avoid pmaddubsw // saturation. 
-static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { intptr_t x0, x1, temp_pixel; - asm volatile ( - "movd %6,%%xmm2 \n" - "movd %7,%%xmm3 \n" - "movl $0x04040000,%k2 \n" - "movd %k2,%%xmm5 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" // 0x007f007f - "pcmpeqb %%xmm7,%%xmm7 \n" - "psrlw $15,%%xmm7 \n" // 0x00010001 - - "pextrw $0x1,%%xmm2,%k3 \n" - "subl $0x2,%5 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm1 \n" - MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 - "movd %k2,%%xmm4 \n" - "pshufb %%xmm5,%%xmm1 \n" - "punpcklwd %%xmm4,%%xmm0 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm1 \n" // 128 -f = (f ^ 127 ) + 1 - "paddusb %%xmm7,%%xmm1 \n" - "pmaddubsw %%xmm0,%%xmm1 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "paddw %9,%%xmm1 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm1 \n" - "packuswb %%xmm1,%%xmm1 \n" - "movd %%xmm1,%k2 \n" - "mov %w2," MEMACCESS(0) " \n" - "lea " MEMLEA(0x2,0) ",%0 \n" - "subl $0x2,%5 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "addl $0x1,%5 \n" - "jl 99f \n" - MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 - "movd %k2,%%xmm0 \n" - "psrlw $0x9,%%xmm2 \n" - "pshufb %%xmm5,%%xmm2 \n" - "psubb %8,%%xmm0 \n" // make pixels signed. - "pxor %%xmm6,%%xmm2 \n" - "paddusb %%xmm7,%%xmm2 \n" - "pmaddubsw %%xmm0,%%xmm2 \n" - "paddw %9,%%xmm2 \n" // make pixels unsigned. - "psrlw $0x7,%%xmm2 \n" - "packuswb %%xmm2,%%xmm2 \n" - "movd %%xmm2,%k2 \n" - "mov %b2," MEMACCESS(0) " \n" - "99: \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "=&a"(temp_pixel), // %2 - "=&r"(x0), // %3 - "=&r"(x1), // %4 + asm volatile( + "movd %6,%%xmm2 \n" + "movd %7,%%xmm3 \n" + "movl $0x04040000,%k2 \n" + "movd %k2,%%xmm5 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" // 0x007f007f + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $15,%%xmm7 \n" // 0x00010001 + + "pextrw $0x1,%%xmm2,%k3 \n" + "subl $0x2,%5 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movzwl 0x00(%1,%4,1),%k2 \n" + "movd %k2,%%xmm4 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpcklwd %%xmm4,%%xmm0 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. 
+ "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127 ) + + // 1 + "paddusb %%xmm7,%%xmm1 \n" + "pmaddubsw %%xmm0,%%xmm1 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "paddw %9,%%xmm1 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movd %%xmm1,%k2 \n" + "mov %w2,(%0) \n" + "lea 0x2(%0),%0 \n" + "subl $0x2,%5 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "addl $0x1,%5 \n" + "jl 99f \n" + "movzwl 0x00(%1,%3,1),%k2 \n" + "movd %k2,%%xmm0 \n" + "psrlw $0x9,%%xmm2 \n" + "pshufb %%xmm5,%%xmm2 \n" + "psubb %8,%%xmm0 \n" // make pixels signed. + "pxor %%xmm6,%%xmm2 \n" + "paddusb %%xmm7,%%xmm2 \n" + "pmaddubsw %%xmm0,%%xmm2 \n" + "paddw %9,%%xmm2 \n" // make pixels unsigned. + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" + "movd %%xmm2,%k2 \n" + "mov %b2,(%0) \n" + "99: \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "=&a"(temp_pixel), // %2 + "=&r"(x0), // %3 + "=&r"(x1), // %4 #if defined(__x86_64__) - "+rm"(dst_width) // %5 + "+rm"(dst_width) // %5 #else - "+m"(dst_width) // %5 + "+m"(dst_width) // %5 #endif - : "rm"(x), // %6 - "rm"(dx), // %7 + : "rm"(x), // %6 + "rm"(dx), // %7 #if defined(__x86_64__) - "x"(kFsub80), // %8 - "x"(kFadd40) // %9 + "x"(kFsub80), // %8 + "x"(kFadd40) // %9 #else - "m"(kFsub80), // %8 - "m"(kFadd40) // %9 + "m"(kFsub80), // %8 + "m"(kFadd40) // %9 #endif - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); +void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "shufps $0xdd,%%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "shufps 
$0xdd,%%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", "xmm0", "xmm1" - ); + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, +void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 - MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 - "lea " MEMLEA(0x20,0) ",%0 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((intptr_t)(src_stride)) // %3 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x00(%0,%3,1),%%xmm2 \n" + "movdqu 0x10(%0,%3,1),%%xmm3 \n" + "lea 0x20(%0),%0 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. 
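// Scalar model (illustrative) of the kFsub80/kFadd40 trick used by
// ScaleFilterCols_SSSE3 above. pmaddubsw saturates its signed 16-bit sums,
// so pixels are biased into signed range first; because the two weights sum
// to 128 the bias contributes exactly 128*128, and kFadd40 (0x4040 =
// 128*128 + 64) removes it while adding the rounding term for the >> 7.
#include <stdint.h>

static uint8_t filter_cols_blend_sketch(uint8_t a, uint8_t b, int f) {
  // f is the 7-bit fraction in [0,127], as extracted from x in the asm.
  int wa = 128 - f;                            // (f ^ 127) + 1 in the asm
  int wb = f;
  int acc = wa * (a - 128) + wb * (b - 128);   // pixels biased, as after psubb
  acc += 0x4040;                               // paddw kFadd40
  return (uint8_t)(acc >> 7);                  // psrlw $0x7 + packuswb
}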
-void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - LABELALIGN - "1: \n" - "movd " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 - MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "punpckldq %%xmm3,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width), // %3 - "=&r"(src_stepx_x12) // %4 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + (void)src_stride; + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + + LABELALIGN + "1: \n" + "movd (%0),%%xmm0 \n" + "movd 0x00(%0,%1,1),%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%0,%1,2),%%xmm2 \n" + "movd 0x00(%0,%4,1),%%xmm3 \n" + "lea 0x00(%0,%1,4),%0 \n" + "punpckldq %%xmm3,%%xmm2 \n" + "punpcklqdq %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width), // %3 + "=&r"(src_stepx_x12) // %4 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. 
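The 2x2 box blend converted next averages each output pixel from a 2x2 block of source pixels using a cascade of pavgb rounded averages. A scalar sketch of the same arithmetic, for reference only and not part of the patch:

#include <stdint.h>

/* Rounded byte average, the scalar equivalent of x86 pavgb. */
static uint8_t avg_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + b + 1) >> 1);
}

/* Scalar model of ScaleARGBRowDownEvenBox: per channel (B, G, R, A),
 * avg(avg(top-left, bottom-left), avg(top-right, bottom-right)),
 * which approximates the true box sum (tl + bl + tr + br + 2) >> 2. */
static void argb_box2x2_sketch(const uint8_t* row0, const uint8_t* row1,
                               int src_stepx, uint8_t* dst, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8_t* p0 = row0 + x * src_stepx * 4; /* top row, left pixel */
    const uint8_t* p1 = row1 + x * src_stepx * 4; /* bottom row, left pixel */
    for (c = 0; c < 4; ++c) {
      dst[x * 4 + c] = avg_u8(avg_u8(p0[c], p1[c]),
                              avg_u8(p0[c + 4], p1[c + 4]));
    }
  }
}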
-void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { +void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12; intptr_t row1 = (intptr_t)(src_stride); - asm volatile ( - "lea " MEMLEA3(0x00,1,4) ",%1 \n" - "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" - "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" - - LABELALIGN - "1: \n" - "movq " MEMACCESS(0) ",%%xmm0 \n" - MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 - MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 - MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 - "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" - "movq " MEMACCESS(5) ",%%xmm2 \n" - MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 - MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 - MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 - "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "shufps $0x88,%%xmm1,%%xmm0 \n" - "shufps $0xdd,%%xmm1,%%xmm2 \n" - "pavgb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stepx_x4), // %1 - "+r"(dst_argb), // %2 - "+rm"(dst_width), // %3 - "=&r"(src_stepx_x12), // %4 - "+r"(row1) // %5 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3" - ); + asm volatile( + "lea 0x00(,%1,4),%1 \n" + "lea 0x00(%1,%1,2),%4 \n" + "lea 0x00(%0,%5,1),%5 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" + "movhps 0x00(%0,%1,1),%%xmm0 \n" + "movq 0x00(%0,%1,2),%%xmm1 \n" + "movhps 0x00(%0,%4,1),%%xmm1 \n" + "lea 0x00(%0,%1,4),%0 \n" + "movq (%5),%%xmm2 \n" + "movhps 0x00(%5,%1,1),%%xmm2 \n" + "movq 0x00(%5,%1,2),%%xmm3 \n" + "movhps 0x00(%5,%4,1),%%xmm3 \n" + "lea 0x00(%5,%1,4),%5 \n" + "pavgb %%xmm2,%%xmm0 \n" + "pavgb %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm2 \n" + "pavgb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stepx_x4), // %1 + "+r"(dst_argb), // %2 + "+rm"(dst_width), // %3 + "=&r"(src_stepx_x12), // %4 + "+r"(row1) // %5 + ::"memory", + "cc", "xmm0", "xmm1", "xmm2", "xmm3"); } -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x11,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x5,%%xmm3,%%xmm0 \n" - "paddd %%xmm0,%%xmm2 \n" - "paddd %%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "cmp $0x0,%4 \n" - "jl 99f \n" - "sub $0x4,%4 \n" - "jl 49f \n" - - LABELALIGN - "40: \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "pextrw $0x7,%%xmm2,%k1 \n" - "paddd %%xmm3,%%xmm2 \n" - "punpckldq %%xmm1,%%xmm0 \n" - MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 - MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 - "pextrw $0x1,%%xmm2,%k0 \n" - "pextrw $0x3,%%xmm2,%k1 \n" - "punpckldq 
%%xmm4,%%xmm1 \n" - "punpcklqdq %%xmm1,%%xmm0 \n" - "movdqu %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x10,2) ",%2 \n" - "sub $0x4,%4 \n" - "jge 40b \n" - - "49: \n" - "test $0x2,%4 \n" - "je 29f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 - "pextrw $0x5,%%xmm2,%k0 \n" - "punpckldq %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x8,2) ",%2 \n" - "29: \n" - "test $0x1,%4 \n" - "je 99f \n" - MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 - "movd %%xmm0," MEMACCESS(2) " \n" - "99: \n" - : "=&a"(x0), // %0 - "=&d"(x1), // %1 - "+r"(dst_argb), // %2 - "+r"(src_argb), // %3 - "+r"(dst_width) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" - ); + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pshufd $0x0,%%xmm2,%%xmm2 \n" + "pshufd $0x11,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x5,%%xmm3,%%xmm0 \n" + "paddd %%xmm0,%%xmm2 \n" + "paddd %%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "cmp $0x0,%4 \n" + "jl 99f \n" + "sub $0x4,%4 \n" + "jl 49f \n" + + LABELALIGN + "40: \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "pextrw $0x7,%%xmm2,%k1 \n" + "paddd %%xmm3,%%xmm2 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movd 0x00(%3,%0,4),%%xmm1 \n" + "movd 0x00(%3,%1,4),%%xmm4 \n" + "pextrw $0x1,%%xmm2,%k0 \n" + "pextrw $0x3,%%xmm2,%k1 \n" + "punpckldq %%xmm4,%%xmm1 \n" + "punpcklqdq %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%4 \n" + "jge 40b \n" + + "49: \n" + "test $0x2,%4 \n" + "je 29f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd 0x00(%3,%1,4),%%xmm1 \n" + "pextrw $0x5,%%xmm2,%k0 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "movq %%xmm0,(%2) \n" + "lea 0x8(%2),%2 \n" + "29: \n" + "test $0x1,%4 \n" + "je 99f \n" + "movd 0x00(%3,%0,4),%%xmm0 \n" + "movd %%xmm0,(%2) \n" + "99: \n" + : "=&a"(x0), // %0 + "=&d"(x1), // %1 + "+r"(dst_argb), // %2 + "+r"(src_argb), // %3 + "+r"(dst_width) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 
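ScaleARGBColsUp2, converted next, is plain pixel doubling; a scalar sketch for reference, not part of the patch (dst_width is assumed even, as in the vectorized version):

#include <stdint.h>

/* Each source ARGB pixel is written twice, doubling the row width. */
static void argb_cols_up2_sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                 int dst_width) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int x;
  for (x = 0; x < dst_width; x += 2) {
    dst[x] = src[x >> 1];
    dst[x + 1] = src[x >> 1]; /* duplicate */
  }
}

punpckldq/punpckhdq perform the same duplication four pixels at a time.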
-void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - asm volatile ( - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(1) ",%%xmm0 \n" - "lea " MEMLEA(0x10,1) ",%1 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpckldq %%xmm0,%%xmm0 \n" - "punpckhdq %%xmm1,%%xmm1 \n" - "movdqu %%xmm0," MEMACCESS(0) " \n" - "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width) // %2 - :: "memory", "cc", NACL_R14 - "xmm0", "xmm1" - ); +void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + (void)x; + (void)dx; + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%1),%%xmm0 \n" + "lea 0x10(%1),%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%0) \n" + "movdqu %%xmm1,0x10(%0) \n" + "lea 0x20(%0),%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + ::"memory", + "cc", "xmm0", "xmm1"); } // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; // Bilinear row filtering combines 4x2 -> 4x1. 
SSSE3 version -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { intptr_t x0, x1; - asm volatile ( - "movdqa %0,%%xmm4 \n" - "movdqa %1,%%xmm5 \n" - : - : "m"(kShuffleColARGB), // %0 - "m"(kShuffleFractions) // %1 - ); - - asm volatile ( - "movd %5,%%xmm2 \n" - "movd %6,%%xmm3 \n" - "pcmpeqb %%xmm6,%%xmm6 \n" - "psrlw $0x9,%%xmm6 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "sub $0x2,%2 \n" - "jl 29f \n" - "movdqa %%xmm2,%%xmm0 \n" - "paddd %%xmm3,%%xmm0 \n" - "punpckldq %%xmm0,%%xmm2 \n" - "punpckldq %%xmm3,%%xmm3 \n" - "paddd %%xmm3,%%xmm3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - - LABELALIGN - "2: \n" - "movdqa %%xmm2,%%xmm1 \n" - "paddd %%xmm3,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "psrlw $0x9,%%xmm1 \n" - MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 - "pshufb %%xmm5,%%xmm1 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm1 \n" - "pmaddubsw %%xmm1,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "pextrw $0x1,%%xmm2,%k3 \n" - "pextrw $0x3,%%xmm2,%k4 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(0) " \n" - "lea " MEMLEA(0x8,0) ",%0 \n" - "sub $0x2,%2 \n" - "jge 2b \n" - - LABELALIGN - "29: \n" - "add $0x1,%2 \n" - "jl 99f \n" - "psrlw $0x9,%%xmm2 \n" - MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 - "pshufb %%xmm5,%%xmm2 \n" - "pshufb %%xmm4,%%xmm0 \n" - "pxor %%xmm6,%%xmm2 \n" - "pmaddubsw %%xmm2,%%xmm0 \n" - "psrlw $0x7,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(0) " \n" - - LABELALIGN - "99: \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+rm"(dst_width), // %2 - "=&r"(x0), // %3 - "=&r"(x1) // %4 - : "rm"(x), // %5 - "rm"(dx) // %6 - : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" - ); + asm volatile( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm5 \n" + : + : "m"(kShuffleColARGB), // %0 + "m"(kShuffleFractions) // %1 + ); + + asm volatile( + "movd %5,%%xmm2 \n" + "movd %6,%%xmm3 \n" + "pcmpeqb %%xmm6,%%xmm6 \n" + "psrlw $0x9,%%xmm6 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "sub $0x2,%2 \n" + "jl 29f \n" + "movdqa %%xmm2,%%xmm0 \n" + "paddd %%xmm3,%%xmm0 \n" + "punpckldq %%xmm0,%%xmm2 \n" + "punpckldq %%xmm3,%%xmm3 \n" + "paddd %%xmm3,%%xmm3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + + LABELALIGN + "2: \n" + "movdqa %%xmm2,%%xmm1 \n" + "paddd %%xmm3,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "psrlw $0x9,%%xmm1 \n" + "movhps 0x00(%1,%4,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm1 \n" + "pmaddubsw %%xmm1,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "pextrw $0x1,%%xmm2,%k3 \n" + "pextrw $0x3,%%xmm2,%k4 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%0) \n" + "lea 0x8(%0),%0 \n" + "sub $0x2,%2 \n" + "jge 2b \n" + + LABELALIGN + "29: \n" + "add $0x1,%2 \n" + "jl 99f \n" + "psrlw $0x9,%%xmm2 \n" + "movq 0x00(%1,%3,4),%%xmm0 \n" + "pshufb %%xmm5,%%xmm2 \n" + "pshufb %%xmm4,%%xmm0 \n" + "pxor %%xmm6,%%xmm2 \n" + "pmaddubsw %%xmm2,%%xmm0 \n" + "psrlw $0x7,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movd %%xmm0,(%0) \n" + + LABELALIGN "99: \n" // clang-format error. + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+rm"(dst_width), // %2 + "=&r"(x0), // %3 + "=&r"(x1) // %4 + : "rm"(x), // %5 + "rm"(dx) // %6 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } // Divide num by div and return as 16.16 fixed point result. 
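The cdq/shld/shl/idiv sequence in FixedDiv_X86 below evaluates (num << 16) / div with a 64-bit dividend in edx:eax, so the shift cannot overflow. A portable model, for reference only and not part of the patch:

#include <stdint.h>

/* 16.16 fixed-point division: widen to 64 bits, shift up 16, divide. */
static int fixed_div_sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

/* Example: fixed_div_sketch(1, 2) == 0x8000, i.e. 0.5 in 16.16. */

FixedDiv1_X86 follows the same pattern but, as its sub/sbb pair shows, computes ((num << 16) - 0x10001) / (div - 1).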
int FixedDiv_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } // Divide num - 1 by div - 1 and return as 16.16 fixed point result. int FixedDiv1_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "sub $0x10001,%%eax \n" - "sbb $0x0,%%edx \n" - "sub $0x1,%1 \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); + asm volatile( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx"); return num; } diff --git a/libs/libvpx/third_party/libyuv/source/scale_mips.cc b/libs/libvpx/third_party/libyuv/source/scale_mips.cc deleted file mode 100644 index ae953073fa8..00000000000 --- a/libs/libvpx/third_party/libyuv/source/scale_mips.cc +++ /dev/null @@ -1,644 +0,0 @@ -/* - * Copyright 2012 The LibYuv Project Authors. All rights reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "libyuv/basic_types.h" -#include "libyuv/row.h" - -#ifdef __cplusplus -namespace libyuv { -extern "C" { -#endif - -// This module is for GCC MIPS DSPR2 -#if !defined(LIBYUV_DISABLE_MIPS) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \ - (_MIPS_SIM == _MIPS_SIM_ABI32) - -void ScaleRowDown2_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 4 \n" // iterations -> by 16 - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - // TODO(fbarchard): Use odd pixels instead of even. 
- "precr.qb.ph $t8, $t1, $t0 \n" // |6|4|2|0| - "precr.qb.ph $t0, $t3, $t2 \n" // |14|12|10|8| - "precr.qb.ph $t1, $t5, $t4 \n" // |22|20|18|16| - "precr.qb.ph $t2, $t7, $t6 \n" // |30|28|26|24| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t8, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t1, 8(%[dst]) \n" - "sw $t2, 12(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 16 \n" - - "2: \n" - "andi $t9, %[dst_width], 0xf \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t0, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 2 \n" - "addiu $t9, $t9, -1 \n" - "sb $t0, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown2Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - const uint8* t = src_ptr + src_stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" // iterations -> step 8 - "bltz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 0(%[t]) \n" // |19|18|17|16| - "lw $t5, 4(%[t]) \n" // |23|22|21|20| - "lw $t6, 8(%[t]) \n" // |27|26|25|24| - "lw $t7, 12(%[t]) \n" // |31|30|29|28| - "addiu $t9, $t9, -1 \n" - "srl $t8, $t0, 16 \n" // |X|X|3|2| - "ins $t0, $t4, 16, 16 \n" // |17|16|1|0| - "ins $t4, $t8, 0, 16 \n" // |19|18|3|2| - "raddu.w.qb $t0, $t0 \n" // |17+16+1+0| - "raddu.w.qb $t4, $t4 \n" // |19+18+3+2| - "shra_r.w $t0, $t0, 2 \n" // |t0+2|>>2 - "shra_r.w $t4, $t4, 2 \n" // |t4+2|>>2 - "srl $t8, $t1, 16 \n" // |X|X|7|6| - "ins $t1, $t5, 16, 16 \n" // |21|20|5|4| - "ins $t5, $t8, 0, 16 \n" // |22|23|7|6| - "raddu.w.qb $t1, $t1 \n" // |21+20+5+4| - "raddu.w.qb $t5, $t5 \n" // |23+22+7+6| - "shra_r.w $t1, $t1, 2 \n" // |t1+2|>>2 - "shra_r.w $t5, $t5, 2 \n" // |t5+2|>>2 - "srl $t8, $t2, 16 \n" // |X|X|11|10| - "ins $t2, $t6, 16, 16 \n" // |25|24|9|8| - "ins $t6, $t8, 0, 16 \n" // |27|26|11|10| - "raddu.w.qb $t2, $t2 \n" // |25+24+9+8| - "raddu.w.qb $t6, $t6 \n" // |27+26+11+10| - "shra_r.w $t2, $t2, 2 \n" // |t2+2|>>2 - "shra_r.w $t6, $t6, 2 \n" // |t5+2|>>2 - "srl $t8, $t3, 16 \n" // |X|X|15|14| - "ins $t3, $t7, 16, 16 \n" // |29|28|13|12| - "ins $t7, $t8, 0, 16 \n" // |31|30|15|14| - "raddu.w.qb $t3, $t3 \n" // |29+28+13+12| - "raddu.w.qb $t7, $t7 \n" // |31+30+15+14| - "shra_r.w $t3, $t3, 2 \n" // |t3+2|>>2 - "shra_r.w $t7, $t7, 2 \n" // |t7+2|>>2 - "addiu %[src_ptr], %[src_ptr], 16 \n" - "addiu %[t], %[t], 16 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "sb $t1, 2(%[dst]) \n" - "sb $t5, 3(%[dst]) \n" - "sb $t2, 4(%[dst]) \n" - "sb $t6, 5(%[dst]) \n" - "sb $t3, 6(%[dst]) \n" - "sb $t7, 7(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 0x7 \n" // x = residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lwr $t1, 0(%[src_ptr]) \n" - "lwl $t1, 3(%[src_ptr]) \n" - "lwr $t2, 0(%[t]) \n" - "lwl $t2, 3(%[t]) \n" - "srl $t8, $t1, 16 \n" - "ins $t1, $t2, 16, 16 \n" - "ins $t2, $t8, 0, 16 \n" - "raddu.w.qb $t1, $t1 \n" - "raddu.w.qb $t2, $t2 \n" - "shra_r.w $t1, $t1, 2 \n" - "shra_r.w $t2, $t2, 2 \n" - "sb $t1, 0(%[dst]) \n" - "sb $t2, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -2 
\n" - "addiu %[t], %[t], 4 \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 2 \n" - - "3: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), [t] "+r" (t) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown4_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 3 \n" - "beqz $t9, 2f \n" - " nop \n" - - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| - "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| - "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| - "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| - "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| - "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu $t9, $t9, -1 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t5, 4(%[dst]) \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 8 \n" - - "2: \n" - "andi $t9, %[dst_width], 7 \n" // residue - "beqz $t9, 3f \n" - " nop \n" - - "21: \n" - "lbu $t1, 0(%[src_ptr]) \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "addiu $t9, $t9, -1 \n" - "sb $t1, 0(%[dst]) \n" - "bgtz $t9, 21b \n" - " addiu %[dst], %[dst], 1 \n" - - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst) - : [dst_width] "r" (dst_width) - : "t1", "t2", "t3", "t4", "t5", - "t6", "t7", "t8", "t9" - ); -} - -void ScaleRowDown4Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - const uint8* s2 = s1 + stride; - const uint8* s3 = s2 + stride; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "srl $t9, %[dst_width], 1 \n" - "andi $t8, %[dst_width], 1 \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // |15|14|13|12| - "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 4(%[s1]) \n" // |23|22|21|20| - "lw $t6, 4(%[s2]) \n" // |27|26|25|24| - "lw $t7, 4(%[s3]) \n" // |31|30|29|28| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| - "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| - "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| - "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "add $t4, $t4, $t5 \n" - "add $t6, $t6, $t7 \n" - "add $t4, $t4, $t6 \n" - "shra_r.w $t0, $t0, 4 \n" - "shra_r.w $t4, $t4, 4 \n" - "sb $t0, 0(%[dst]) \n" - "sb $t4, 1(%[dst]) \n" - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[s3], %[s3], 8 \n" - "addiu $t9, $t9, -1 \n" - "bgtz $t9, 1b \n" - " addiu %[dst], %[dst], 2 \n" - "beqz $t8, 2f \n" - " nop \n" - - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 0(%[s1]) \n" // |7|6|5|4| - "lw $t2, 0(%[s2]) \n" // |11|10|9|8| - "lw $t3, 0(%[s3]) \n" // 
|15|14|13|12| - "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| - "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| - "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| - "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| - "add $t0, $t0, $t1 \n" - "add $t1, $t2, $t3 \n" - "add $t0, $t0, $t1 \n" - "shra_r.w $t0, $t0, 4 \n" - "sb $t0, 0(%[dst]) \n" - - "2: \n" - ".set pop \n" - - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [s1] "+r" (s1), - [s2] "+r" (s2), - [s3] "+r" (s3) - : [dst_width] "r" (dst_width) - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); -} - -void ScaleRowDown34_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "1: \n" - "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| - "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| - "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| - "addiu %[dst_width], %[dst_width], -24 \n" - "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| - "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| - "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| - "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| - "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| - "prepend $t1, $t2, 8 \n" // |4|3|1|0| - "prepend $t3, $t4, 24 \n" // |15|13|12|11| - "prepend $t5, $t6, 8 \n" // |20|19|17|16| - "prepend $t7, $t8, 24 \n" // |31|29|28|27| - "sw $t1, 0(%[dst]) \n" - "sw $t0, 4(%[dst]) \n" - "sw $t3, 8(%[dst]) \n" - "sw $t5, 12(%[dst]) \n" - "sw $t9, 16(%[dst]) \n" - "sw $t7, 20(%[dst]) \n" - "bnez %[dst_width], 1b \n" - " addiu %[dst], %[dst], 24 \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", "t4", "t5", - "t6","t7", "t8", "t9" - ); -} - -void ScaleRowDown34_0_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "repl.ph $t3, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| - "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t2, $t2, $t4 \n" - "addu.ph $t6, $t6, $t5 \n" - "sll $t5, $t0, 1 \n" - "add $t0, $t5, $t0 \n" - "shra_r.ph $t2, $t2, 2 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shll.ph $t4, $t2, 1 \n" - "addq.ph $t4, $t4, $t2 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.w $t0, $t0, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "shra_r.ph $t6, $t6, 2 \n" - "srl $t1, $t6, 16 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 
3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); -} - -void ScaleRowDown34_1_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* d, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - "repl.ph $t2, 3 \n" // 0x00030003 - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| - "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| - "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| - "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| - "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| - "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| - "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t1, $t1 \n" - "shra_r.w $t0, $t0, 1 \n" - "shra_r.w $t1, $t1, 1 \n" - "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| - "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| - "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| - "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| - "addu.ph $t4, $t4, $t3 \n" - "addu.ph $t6, $t6, $t5 \n" - "shra_r.ph $t6, $t6, 2 \n" - "shra_r.ph $t4, $t4, 2 \n" - "addu.ph $t6, $t6, $t4 \n" - "addiu %[src_ptr], %[src_ptr], 4 \n" - "shra_r.ph $t6, $t6, 1 \n" - "addu $t0, $t0, $t1 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "shra_r.w $t0, $t0, 1 \n" - "srl $t1, $t6, 16 \n" - "sb $t1, 0(%[d]) \n" - "sb $t0, 1(%[d]) \n" - "sb $t6, 2(%[d]) \n" - "bgtz %[dst_width], 1b \n" - " addiu %[d], %[d], 3 \n" - "3: \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [src_stride] "+r" (src_stride), - [d] "+r" (d), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", - "t4", "t5", "t6" - ); -} - -void ScaleRowDown38_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| - "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| - "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| - "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| - "lw $t4, 16(%[src_ptr]) \n" // |19|18|17|16| - "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| - "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| - "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| - "wsbh $t0, $t0 \n" // |2|3|0|1| - "wsbh $t6, $t6 \n" // |26|27|24|25| - "srl $t0, $t0, 8 \n" // |X|2|3|0| - "srl $t3, $t3, 16 \n" // |X|X|15|14| - "srl $t5, $t5, 16 \n" // |X|X|23|22| - "srl $t7, $t7, 16 \n" // |X|X|31|30| - "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| - "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| - "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| - "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| - "prepend $t2, $t3, 24 \n" // |X|15|14|11| - "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| - "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| - "addiu %[src_ptr], %[src_ptr], 32 \n" - "addiu %[dst_width], %[dst_width], -12 \n" - "addiu $t8,%[dst_width], -12 \n" - "sw $t1, 0(%[dst]) \n" - "sw $t4, 4(%[dst]) \n" - "sw $t6, 8(%[dst]) \n" - "bgez $t8, 1b \n" - " addiu %[dst], %[dst], 12 \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst] "+r" (dst), - [dst_width] "+r" (dst_width) - : - : "t0", "t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); -} - -void ScaleRowDown38_2_Box_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stride = src_stride; - const uint8* t = src_ptr + stride; - const int c = 0x2AAA; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw 
$t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| - "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 - "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 - "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| - "srl $t4, $t4, 2 \n" // t4 / 4 - "srl $t6, $t6, 16 \n" // |0|0|S3|T3| - "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 - "addu $t6, $t5, $t6 \n" - "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 - "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 - "addu $t0, $t0, $t2 \n" - "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[t], %[t], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t4, -1(%[dst_ptr]) \n" - "sb $t6, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [t] "+r" (t), - [dst_width] "+r" (dst_width) - : [c] "r" (c) - : "t0", "t1", "t2", "t3", "t4", "t5", "t6" - ); -} - -void ScaleRowDown38_3_Box_DSPR2(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stride = src_stride; - const uint8* s1 = src_ptr + stride; - stride += stride; - const uint8* s2 = src_ptr + stride; - const int c1 = 0x1C71; - const int c2 = 0x2AAA; - - __asm__ __volatile__ ( - ".set push \n" - ".set noreorder \n" - - "1: \n" - "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| - "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| - "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| - "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| - "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| - "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| - "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| - "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| - "raddu.w.qb $t6, $t6 \n" // S7+S6+T7+T6 - "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| - "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 - "sll $t8, $t5, 16 \n" // |R5|R4|0|0| - "raddu.w.qb $t8, $t8 \n" // R5+R4 - "addu $t7, $t7, $t8 \n" - "srl $t8, $t5, 16 \n" // |0|0|R7|R6| - "raddu.w.qb $t8, $t8 \n" // R7 + R6 - "addu $t6, $t6, $t8 \n" - "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA - "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| - "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| - "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| - "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 - "addu $t7, $t7, $t8 \n" - "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 - "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| - "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| - "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| - "raddu.w.qb $t0, $t0 \n" - "raddu.w.qb $t2, $t2 \n" - "raddu.w.qb $t4, $t4 \n" - "addu $t0, $t0, $t2 \n" - "addu $t0, $t0, $t4 \n" - "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 - "addiu %[src_ptr], %[src_ptr], 8 \n" - "addiu %[s1], %[s1], 8 \n" - "addiu %[s2], %[s2], 8 \n" - "addiu %[dst_width], %[dst_width], -3 \n" - "addiu %[dst_ptr], %[dst_ptr], 3 \n" - "srl $t6, $t6, 16 \n" - "srl $t7, $t7, 16 \n" - "srl $t0, $t0, 16 \n" - "sb $t6, -1(%[dst_ptr]) \n" - "sb $t7, -2(%[dst_ptr]) \n" - "bgtz %[dst_width], 1b \n" - " sb $t0, -3(%[dst_ptr]) \n" - ".set pop \n" - : [src_ptr] "+r" (src_ptr), - [dst_ptr] "+r" (dst_ptr), - [s1] "+r" (s1), - [s2] "+r" (s2), - [dst_width] "+r" (dst_width) - : [c1] "r" (c1), [c2] "r" (c2) - : "t0", 
"t1", "t2", "t3", "t4", - "t5", "t6", "t7", "t8" - ); -} - -#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) - -#ifdef __cplusplus -} // extern "C" -} // namespace libyuv -#endif - diff --git a/libs/libvpx/third_party/libyuv/source/scale_msa.cc b/libs/libvpx/third_party/libyuv/source/scale_msa.cc new file mode 100644 index 00000000000..482a521f0d2 --- /dev/null +++ b/libs/libvpx/third_party/libyuv/source/scale_msa.cc @@ -0,0 +1,949 @@ +/* + * Copyright 2016 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "libyuv/scale_row.h" + +// This module is for GCC MSA +#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) +#include "libyuv/macros_msa.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define LOAD_INDEXED_DATA(srcp, indx0, out0) \ + { \ + out0[0] = srcp[indx0[0]]; \ + out0[1] = srcp[indx0[1]]; \ + out0[2] = srcp[indx0[2]]; \ + out0[3] = srcp[indx0[3]]; \ + } + +void ScaleARGBRowDown2_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + dst0 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Linear_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + v16u8 src0, src1, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_argb, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_argb, 16); + vec0 = (v16u8)__msa_pckev_w((v4i32)src1, (v4i32)src0); + vec1 = (v16u8)__msa_pckod_w((v4i32)src1, (v4i32)src0); + dst0 = (v16u8)__msa_aver_u_b((v16u8)vec0, (v16u8)vec1); + ST_UB(dst0, dst_argb); + src_argb += 32; + dst_argb += 16; + } +} + +void ScaleARGBRowDown2Box_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* s = src_argb; + const uint8_t* t = src_argb + src_stride; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0; + v8u16 reg0, reg1, reg2, reg3; + v16i8 shuffler = {0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15}; + + for (x = 0; x < dst_width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src1, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src2, (v16i8)src2); + vec3 = (v16u8)__msa_vshf_b(shuffler, (v16i8)src3, (v16i8)src3); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg0 += reg2; + reg1 += reg3; + reg0 = (v8u16)__msa_srari_h((v8i16)reg0, 2); + reg1 = (v8u16)__msa_srari_h((v8i16)reg1, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + ST_UB(dst0, dst_argb); + s += 32; + t += 32; + dst_argb += 16; + } +} + +void 
ScaleARGBRowDownEven_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int32_t src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + int32_t stepx = src_stepx * 4; + int32_t data0, data1, data2, data3; + (void)src_stride; + + for (x = 0; x < dst_width; x += 4) { + data0 = LW(src_argb); + data1 = LW(src_argb + stepx); + data2 = LW(src_argb + stepx * 2); + data3 = LW(src_argb + stepx * 3); + SW(data0, dst_argb); + SW(data1, dst_argb + 4); + SW(data2, dst_argb + 8); + SW(data3, dst_argb + 12); + src_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleARGBRowDownEvenBox_MSA(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + int x; + const uint8_t* nxt_argb = src_argb + src_stride; + int32_t stepx = src_stepx * 4; + int64_t data0, data1, data2, data3; + v16u8 src0 = {0}, src1 = {0}, src2 = {0}, src3 = {0}; + v16u8 vec0, vec1, vec2, vec3; + v8u16 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 dst0; + + for (x = 0; x < dst_width; x += 4) { + data0 = LD(src_argb); + data1 = LD(src_argb + stepx); + data2 = LD(src_argb + stepx * 2); + data3 = LD(src_argb + stepx * 3); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 0, data0); + src0 = (v16u8)__msa_insert_d((v2i64)src0, 1, data1); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 0, data2); + src1 = (v16u8)__msa_insert_d((v2i64)src1, 1, data3); + data0 = LD(nxt_argb); + data1 = LD(nxt_argb + stepx); + data2 = LD(nxt_argb + stepx * 2); + data3 = LD(nxt_argb + stepx * 3); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 0, data0); + src2 = (v16u8)__msa_insert_d((v2i64)src2, 1, data1); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 0, data2); + src3 = (v16u8)__msa_insert_d((v2i64)src3, 1, data3); + vec0 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec2 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec3 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + reg0 = __msa_hadd_u_h(vec0, vec0); + reg1 = __msa_hadd_u_h(vec1, vec1); + reg2 = __msa_hadd_u_h(vec2, vec2); + reg3 = __msa_hadd_u_h(vec3, vec3); + reg4 = (v8u16)__msa_pckev_d((v2i64)reg2, (v2i64)reg0); + reg5 = (v8u16)__msa_pckev_d((v2i64)reg3, (v2i64)reg1); + reg6 = (v8u16)__msa_pckod_d((v2i64)reg2, (v2i64)reg0); + reg7 = (v8u16)__msa_pckod_d((v2i64)reg3, (v2i64)reg1); + reg4 += reg6; + reg5 += reg7; + reg4 = (v8u16)__msa_srari_h((v8i16)reg4, 2); + reg5 = (v8u16)__msa_srari_h((v8i16)reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + ST_UB(dst0, dst_argb); + src_argb += stepx * 4; + nxt_argb += stepx * 4; + dst_argb += 16; + } +} + +void ScaleRowDown2_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + dst0 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + dst1 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Linear_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, dst0, dst1; + (void)src_stride; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = 
(v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + vec1 = (v16u8)__msa_pckod_b((v16i8)src1, (v16i8)src0); + vec3 = (v16u8)__msa_pckod_b((v16i8)src3, (v16i8)src2); + dst0 = __msa_aver_u_b(vec1, vec0); + dst1 = __msa_aver_u_b(vec3, vec2); + ST_UB2(dst0, dst1, dst, 16); + src_ptr += 64; + dst += 32; + } +} + +void ScaleRowDown2Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1; + v8u16 vec0, vec1, vec2, vec3; + + for (x = 0; x < dst_width; x += 32) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + vec0 = (v8u16)__msa_srari_h((v8i16)vec0, 2); + vec1 = (v8u16)__msa_srari_h((v8i16)vec1, 2); + vec2 = (v8u16)__msa_srari_h((v8i16)vec2, 2); + vec3 = (v8u16)__msa_srari_h((v8i16)vec3, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + dst1 = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + ST_UB2(dst0, dst1, dst, 16); + s += 64; + t += 64; + dst += 32; + } +} + +void ScaleRowDown4_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + v16u8 src0, src1, src2, src3, vec0, vec1, dst0; + (void)src_stride; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_pckev_b((v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_pckev_b((v16i8)src3, (v16i8)src2); + dst0 = (v16u8)__msa_pckod_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + src_ptr += 64; + dst += 16; + } +} + +void ScaleRowDown4Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + const uint8_t* t2 = s + src_stride * 3; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0; + v8u16 vec0, vec1, vec2, vec3; + v4u32 reg0, reg1, reg2, reg3; + + for (x = 0; x < dst_width; x += 16) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t0, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t0, 48); + vec0 = __msa_hadd_u_h(src0, src0); + vec1 = __msa_hadd_u_h(src1, src1); + vec2 = __msa_hadd_u_h(src2, src2); + vec3 = __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += 
__msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + src0 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t1, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)t1, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t2, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t2, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t2, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t2, 48); + vec0 += __msa_hadd_u_h(src0, src0); + vec1 += __msa_hadd_u_h(src1, src1); + vec2 += __msa_hadd_u_h(src2, src2); + vec3 += __msa_hadd_u_h(src3, src3); + vec0 += __msa_hadd_u_h(src4, src4); + vec1 += __msa_hadd_u_h(src5, src5); + vec2 += __msa_hadd_u_h(src6, src6); + vec3 += __msa_hadd_u_h(src7, src7); + reg0 = __msa_hadd_u_w(vec0, vec0); + reg1 = __msa_hadd_u_w(vec1, vec1); + reg2 = __msa_hadd_u_w(vec2, vec2); + reg3 = __msa_hadd_u_w(vec3, vec3); + reg0 = (v4u32)__msa_srari_w((v4i32)reg0, 4); + reg1 = (v4u32)__msa_srari_w((v4i32)reg1, 4); + reg2 = (v4u32)__msa_srari_w((v4i32)reg2, 4); + reg3 = (v4u32)__msa_srari_w((v4i32)reg3, 4); + vec0 = (v8u16)__msa_pckev_h((v8i16)reg1, (v8i16)reg0); + vec1 = (v8u16)__msa_pckev_h((v8i16)reg3, (v8i16)reg2); + dst0 = (v16u8)__msa_pckev_b((v16i8)vec1, (v16i8)vec0); + ST_UB(dst0, dst); + s += 64; + t0 += 64; + t1 += 64; + t2 += 64; + dst += 16; + } +} + +void ScaleRowDown38_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x, width; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, vec0; + v16i8 mask = {0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0}; + (void)src_stride; + + assert(dst_width % 3 == 0); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + vec0 = (v16u8)__msa_vshf_b(mask, (v16i8)src1, (v16i8)src0); + dst0 = __msa_copy_u_d((v2i64)vec0, 0); + dst1 = __msa_copy_u_w((v4i32)vec0, 2); + SD(dst0, dst); + SW(dst1, dst + 8); + src_ptr += 32; + dst += 12; + } +} + +void ScaleRowDown38_2_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8i16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + v4u32 const_0x4000 = (v4u32)__msa_fill_w(0x4000); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec4 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, zero, (v8i16)vec3); + 
vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = __msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x2AAA; + tmp1 *= const_0x2AAA; + tmp4 *= const_0x4000; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t += 32; + dst_ptr += 12; + } +} + +void ScaleRowDown38_3_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + int x, width; + const uint8_t* s = src_ptr; + const uint8_t* t0 = s + src_stride; + const uint8_t* t1 = s + src_stride * 2; + uint64_t dst0; + uint32_t dst1; + v16u8 src0, src1, src2, src3, src4, src5, out; + v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; + v4u32 tmp0, tmp1, tmp2, tmp3, tmp4; + v8u16 zero = {0}; + v8i16 mask = {0, 1, 2, 8, 3, 4, 5, 9}; + v16i8 dst_mask = {0, 2, 16, 4, 6, 18, 8, 10, 20, 12, 14, 22, 0, 0, 0, 0}; + v4u32 const_0x1C71 = (v4u32)__msa_fill_w(0x1C71); + v4u32 const_0x2AAA = (v4u32)__msa_fill_w(0x2AAA); + + assert((dst_width % 3 == 0) && (dst_width > 0)); + width = dst_width / 3; + + for (x = 0; x < width; x += 4) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)t0, 0); + src3 = (v16u8)__msa_ld_b((v16i8*)t0, 16); + src4 = (v16u8)__msa_ld_b((v16i8*)t1, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t1, 16); + vec0 = (v8u16)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + vec1 = (v8u16)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + vec2 = (v8u16)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + vec3 = (v8u16)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + vec4 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src4); + vec5 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src4); + vec6 = (v8u16)__msa_ilvr_b((v16i8)zero, (v16i8)src5); + vec7 = (v8u16)__msa_ilvl_b((v16i8)zero, (v16i8)src5); + vec0 = __msa_hadd_u_h((v16u8)vec0, (v16u8)vec0); + vec1 = __msa_hadd_u_h((v16u8)vec1, (v16u8)vec1); + vec2 = __msa_hadd_u_h((v16u8)vec2, (v16u8)vec2); + vec3 = __msa_hadd_u_h((v16u8)vec3, (v16u8)vec3); + vec0 += __msa_hadd_u_h((v16u8)vec4, (v16u8)vec4); + vec1 += __msa_hadd_u_h((v16u8)vec5, (v16u8)vec5); + vec2 += __msa_hadd_u_h((v16u8)vec6, (v16u8)vec6); + vec3 += __msa_hadd_u_h((v16u8)vec7, (v16u8)vec7); + vec4 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec0); + vec5 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec1); + vec6 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec2); + vec7 = (v8u16)__msa_vshf_h(mask, (v8i16)zero, (v8i16)vec3); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + vec1 = (v8u16)__msa_pckod_w((v4i32)vec3, (v4i32)vec2); + vec0 = (v8u16)__msa_pckod_w((v4i32)vec1, (v4i32)vec0); + tmp0 = __msa_hadd_u_w(vec4, vec4); + tmp1 = __msa_hadd_u_w(vec5, vec5); + tmp2 = __msa_hadd_u_w(vec6, vec6); + tmp3 = 
__msa_hadd_u_w(vec7, vec7); + tmp4 = __msa_hadd_u_w(vec0, vec0); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + tmp0 = __msa_hadd_u_w(vec0, vec0); + tmp1 = __msa_hadd_u_w(vec1, vec1); + tmp0 *= const_0x1C71; + tmp1 *= const_0x1C71; + tmp4 *= const_0x2AAA; + tmp0 = (v4u32)__msa_srai_w((v4i32)tmp0, 16); + tmp1 = (v4u32)__msa_srai_w((v4i32)tmp1, 16); + tmp4 = (v4u32)__msa_srai_w((v4i32)tmp4, 16); + vec0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + vec1 = (v8u16)__msa_pckev_h((v8i16)tmp4, (v8i16)tmp4); + out = (v16u8)__msa_vshf_b(dst_mask, (v16i8)vec1, (v16i8)vec0); + dst0 = __msa_copy_u_d((v2i64)out, 0); + dst1 = __msa_copy_u_w((v4i32)out, 2); + SD(dst0, dst_ptr); + SW(dst1, dst_ptr + 8); + s += 32; + t0 += 32; + t1 += 32; + dst_ptr += 12; + } +} + +void ScaleAddRow_MSA(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { + int x; + v16u8 src0; + v8u16 dst0, dst1; + v16i8 zero = {0}; + + assert(src_width > 0); + + for (x = 0; x < src_width; x += 16) { + src0 = LD_UB(src_ptr); + dst0 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 0); + dst1 = (v8u16)__msa_ld_h((v8i16*)dst_ptr, 16); + dst0 += (v8u16)__msa_ilvr_b(zero, (v16i8)src0); + dst1 += (v8u16)__msa_ilvl_b(zero, (v16i8)src0); + ST_UH2(dst0, dst1, dst_ptr, 8); + src_ptr += 16; + dst_ptr += 16; + } +} + +void ScaleFilterCols_MSA(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { + int j; + v4i32 vec_x = __msa_fill_w(x); + v4i32 vec_dx = __msa_fill_w(dx); + v4i32 vec_const = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9; + v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + v8u16 reg0, reg1; + v16u8 dst0; + v4i32 const_0xFFFF = __msa_fill_w(0xFFFF); + v4i32 const_0x40 = __msa_fill_w(0x40); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 16) { + vec2 = vec_x >> 16; + vec6 = vec_x & const_0xFFFF; + vec_x += vec1; + vec3 = vec_x >> 16; + vec7 = vec_x & const_0xFFFF; + vec_x += vec1; + vec4 = vec_x >> 16; + vec8 = vec_x & const_0xFFFF; + vec_x += vec1; + vec5 = vec_x >> 16; + vec9 = vec_x & const_0xFFFF; + vec_x += vec1; + vec6 >>= 9; + vec7 >>= 9; + vec8 >>= 9; + vec9 >>= 9; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp0); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp1); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp2); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp3); + vec2 += 1; + vec3 += 1; + vec4 += 1; + vec5 += 1; + LOAD_INDEXED_DATA(src_ptr, vec2, tmp4); + LOAD_INDEXED_DATA(src_ptr, vec3, tmp5); + LOAD_INDEXED_DATA(src_ptr, vec4, tmp6); + LOAD_INDEXED_DATA(src_ptr, vec5, tmp7); + tmp4 -= tmp0; + tmp5 -= tmp1; + tmp6 -= tmp2; + tmp7 -= tmp3; + tmp4 *= vec6; + tmp5 *= vec7; + tmp6 *= vec8; + tmp7 *= vec9; + tmp4 += const_0x40; + tmp5 += const_0x40; + tmp6 += const_0x40; + tmp7 += const_0x40; + tmp4 >>= 7; + tmp5 >>= 7; + tmp6 >>= 7; + tmp7 >>= 7; + tmp0 += tmp4; + tmp1 += tmp5; + tmp2 += tmp6; + tmp3 += tmp7; + reg0 = (v8u16)__msa_pckev_h((v8i16)tmp1, (v8i16)tmp0); + reg1 = (v8u16)__msa_pckev_h((v8i16)tmp3, (v8i16)tmp2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + __msa_st_b(dst0, dst_ptr, 0); + dst_ptr += 16; + } +} + +void ScaleARGBCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + uint32_t* dst = (uint32_t*)(dst_argb); + int j; + v4i32 x_vec = __msa_fill_w(x); + v4i32 dx_vec = __msa_fill_w(dx); + v4i32 const_vec = {0, 1, 2, 3}; + v4i32 vec0, vec1, vec2; + 
v4i32 dst0; + + vec0 = dx_vec * const_vec; + vec1 = dx_vec * 4; + x_vec += vec0; + + for (j = 0; j < dst_width; j += 4) { + vec2 = x_vec >> 16; + x_vec += vec1; + LOAD_INDEXED_DATA(src, vec2, dst0); + __msa_st_w(dst0, dst, 0); + dst += 4; + } +} + +void ScaleARGBFilterCols_MSA(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint32_t* src = (const uint32_t*)(src_argb); + int j; + v4u32 src0, src1, src2, src3; + v4u32 vec0, vec1, vec2, vec3; + v16u8 reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7; + v16u8 mult0, mult1, mult2, mult3; + v8u16 tmp0, tmp1, tmp2, tmp3; + v16u8 dst0, dst1; + v4u32 vec_x = (v4u32)__msa_fill_w(x); + v4u32 vec_dx = (v4u32)__msa_fill_w(dx); + v4u32 vec_const = {0, 1, 2, 3}; + v16u8 const_0x7f = (v16u8)__msa_fill_b(0x7f); + + vec0 = vec_dx * vec_const; + vec1 = vec_dx * 4; + vec_x += vec0; + + for (j = 0; j < dst_width - 1; j += 8) { + vec2 = vec_x >> 16; + reg0 = (v16u8)(vec_x >> 9); + vec_x += vec1; + vec3 = vec_x >> 16; + reg1 = (v16u8)(vec_x >> 9); + vec_x += vec1; + reg0 = reg0 & const_0x7f; + reg1 = reg1 & const_0x7f; + reg0 = (v16u8)__msa_shf_b((v16i8)reg0, 0); + reg1 = (v16u8)__msa_shf_b((v16i8)reg1, 0); + reg2 = reg0 ^ const_0x7f; + reg3 = reg1 ^ const_0x7f; + mult0 = (v16u8)__msa_ilvr_b((v16i8)reg0, (v16i8)reg2); + mult1 = (v16u8)__msa_ilvl_b((v16i8)reg0, (v16i8)reg2); + mult2 = (v16u8)__msa_ilvr_b((v16i8)reg1, (v16i8)reg3); + mult3 = (v16u8)__msa_ilvl_b((v16i8)reg1, (v16i8)reg3); + LOAD_INDEXED_DATA(src, vec2, src0); + LOAD_INDEXED_DATA(src, vec3, src1); + vec2 += 1; + vec3 += 1; + LOAD_INDEXED_DATA(src, vec2, src2); + LOAD_INDEXED_DATA(src, vec3, src3); + reg4 = (v16u8)__msa_ilvr_b((v16i8)src2, (v16i8)src0); + reg5 = (v16u8)__msa_ilvl_b((v16i8)src2, (v16i8)src0); + reg6 = (v16u8)__msa_ilvr_b((v16i8)src3, (v16i8)src1); + reg7 = (v16u8)__msa_ilvl_b((v16i8)src3, (v16i8)src1); + tmp0 = __msa_dotp_u_h(reg4, mult0); + tmp1 = __msa_dotp_u_h(reg5, mult1); + tmp2 = __msa_dotp_u_h(reg6, mult2); + tmp3 = __msa_dotp_u_h(reg7, mult3); + tmp0 >>= 7; + tmp1 >>= 7; + tmp2 >>= 7; + tmp3 >>= 7; + dst0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + dst1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2); + __msa_st_b(dst0, dst_argb, 0); + __msa_st_b(dst1, dst_argb, 16); + dst_argb += 32; + } +} + +void ScaleRowDown34_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + int x; + (void)src_stride; + v16u8 src0, src1, src2, src3; + v16u8 vec0, vec1, vec2; + v16i8 mask0 = {0, 1, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20}; + v16i8 mask1 = {5, 7, 8, 9, 11, 12, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25}; + v16i8 mask2 = {11, 12, 13, 15, 16, 17, 19, 20, + 21, 23, 24, 25, 27, 28, 29, 31}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)src_ptr, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src1, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src2, (v16i8)src1); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src2); + __msa_st_b((v16i8)vec0, dst, 0); + __msa_st_b((v16i8)vec1, dst, 16); + __msa_st_b((v16i8)vec2, dst, 32); + src_ptr += 64; + dst += 48; + } +} + +void ScaleRowDown34_0_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 
src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 = reg0 * 3 + reg6; + reg1 = reg1 * 3 + reg7; + reg2 = reg2 * 3 + reg8; + reg3 = reg3 * 3 + reg9; + reg4 = reg4 * 3 + reg10; + reg5 = reg5 * 3 + reg11; + reg0 = __msa_srari_h(reg0, 2); + reg1 = __msa_srari_h(reg1, 2); + reg2 = __msa_srari_h(reg2, 2); + reg3 = __msa_srari_h(reg3, 2); + reg4 = __msa_srari_h(reg4, 2); + reg5 = __msa_srari_h(reg5, 2); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + 
__msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +void ScaleRowDown34_1_Box_MSA(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* d, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + int x; + v16u8 src0, src1, src2, src3, src4, src5, src6, src7, dst0, dst1, dst2; + v16u8 vec0, vec1, vec2, vec3, vec4, vec5; + v16u8 vec6, vec7, vec8, vec9, vec10, vec11; + v8i16 reg0, reg1, reg2, reg3, reg4, reg5; + v8i16 reg6, reg7, reg8, reg9, reg10, reg11; + v16u8 const0 = {3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1}; + v16u8 const1 = {1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1}; + v16u8 const2 = {1, 1, 1, 3, 3, 1, 1, 1, 1, 3, 3, 1, 1, 1, 1, 3}; + v16i8 mask0 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; + v16i8 mask1 = {10, 11, 12, 13, 13, 14, 14, 15, + 16, 17, 17, 18, 18, 19, 20, 21}; + v16i8 mask2 = {5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15}; + v8i16 shft0 = {2, 1, 2, 2, 1, 2, 2, 1}; + v8i16 shft1 = {2, 2, 1, 2, 2, 1, 2, 2}; + v8i16 shft2 = {1, 2, 2, 1, 2, 2, 1, 2}; + + assert((dst_width % 3 == 0) && (dst_width > 0)); + + for (x = 0; x < dst_width; x += 48) { + src0 = (v16u8)__msa_ld_b((v16i8*)s, 0); + src1 = (v16u8)__msa_ld_b((v16i8*)s, 16); + src2 = (v16u8)__msa_ld_b((v16i8*)s, 32); + src3 = (v16u8)__msa_ld_b((v16i8*)s, 48); + src4 = (v16u8)__msa_ld_b((v16i8*)t, 0); + src5 = (v16u8)__msa_ld_b((v16i8*)t, 16); + src6 = (v16u8)__msa_ld_b((v16i8*)t, 32); + src7 = (v16u8)__msa_ld_b((v16i8*)t, 48); + vec0 = (v16u8)__msa_vshf_b(mask0, (v16i8)src0, (v16i8)src0); + vec1 = (v16u8)__msa_vshf_b(mask1, (v16i8)src1, (v16i8)src0); + vec2 = (v16u8)__msa_vshf_b(mask2, (v16i8)src1, (v16i8)src1); + vec3 = (v16u8)__msa_vshf_b(mask0, (v16i8)src2, (v16i8)src2); + vec4 = (v16u8)__msa_vshf_b(mask1, (v16i8)src3, (v16i8)src2); + vec5 = (v16u8)__msa_vshf_b(mask2, (v16i8)src3, (v16i8)src3); + vec6 = (v16u8)__msa_vshf_b(mask0, (v16i8)src4, (v16i8)src4); + vec7 = (v16u8)__msa_vshf_b(mask1, (v16i8)src5, (v16i8)src4); + vec8 = (v16u8)__msa_vshf_b(mask2, (v16i8)src5, (v16i8)src5); + vec9 = (v16u8)__msa_vshf_b(mask0, (v16i8)src6, (v16i8)src6); + vec10 = (v16u8)__msa_vshf_b(mask1, (v16i8)src7, (v16i8)src6); + vec11 = (v16u8)__msa_vshf_b(mask2, (v16i8)src7, (v16i8)src7); + reg0 = (v8i16)__msa_dotp_u_h(vec0, const0); + reg1 = (v8i16)__msa_dotp_u_h(vec1, const1); + reg2 = (v8i16)__msa_dotp_u_h(vec2, const2); + reg3 = (v8i16)__msa_dotp_u_h(vec3, const0); + reg4 = (v8i16)__msa_dotp_u_h(vec4, const1); + reg5 = (v8i16)__msa_dotp_u_h(vec5, const2); + reg6 = (v8i16)__msa_dotp_u_h(vec6, const0); + reg7 = (v8i16)__msa_dotp_u_h(vec7, const1); + reg8 = (v8i16)__msa_dotp_u_h(vec8, const2); + reg9 = (v8i16)__msa_dotp_u_h(vec9, const0); + reg10 = (v8i16)__msa_dotp_u_h(vec10, const1); + reg11 = (v8i16)__msa_dotp_u_h(vec11, const2); + reg0 = __msa_srar_h(reg0, shft0); + reg1 = __msa_srar_h(reg1, shft1); + reg2 = __msa_srar_h(reg2, shft2); + reg3 = __msa_srar_h(reg3, shft0); + reg4 = __msa_srar_h(reg4, shft1); + reg5 = __msa_srar_h(reg5, shft2); + reg6 = __msa_srar_h(reg6, shft0); + reg7 = __msa_srar_h(reg7, shft1); + reg8 = __msa_srar_h(reg8, shft2); + reg9 = __msa_srar_h(reg9, shft0); + reg10 = __msa_srar_h(reg10, shft1); + reg11 = __msa_srar_h(reg11, shft2); + reg0 += reg6; + reg1 += reg7; + reg2 += reg8; + reg3 += reg9; + reg4 += reg10; + reg5 += reg11; + reg0 = __msa_srari_h(reg0, 1); + reg1 = __msa_srari_h(reg1, 1); + reg2 = __msa_srari_h(reg2, 1); + reg3 = __msa_srari_h(reg3, 1); + reg4 = __msa_srari_h(reg4, 
1); + reg5 = __msa_srari_h(reg5, 1); + dst0 = (v16u8)__msa_pckev_b((v16i8)reg1, (v16i8)reg0); + dst1 = (v16u8)__msa_pckev_b((v16i8)reg3, (v16i8)reg2); + dst2 = (v16u8)__msa_pckev_b((v16i8)reg5, (v16i8)reg4); + __msa_st_b((v16i8)dst0, d, 0); + __msa_st_b((v16i8)dst1, d, 16); + __msa_st_b((v16i8)dst2, d, 32); + s += 64; + t += 64; + d += 48; + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) diff --git a/libs/libvpx/third_party/libyuv/source/scale_neon.cc b/libs/libvpx/third_party/libyuv/source/scale_neon.cc index 44b0c8080d1..459a2995dfe 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_neon.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_neon.cc @@ -23,564 +23,541 @@ extern "C" { // Provided by Fritz Koenig // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.8 {q0, q1}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into q0, odd into q1 + "vld2.8 {q0, q1}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); } // Read 32x1 average down and write 16x1. -void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc - "subs %2, %2, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // add adjacent - "vpaddl.u8 q1, q1 \n" - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1" // Clobber List - ); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc - MEMACCESS(1) - "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop - "vpaddl.u8 q0, q0 \n" // row 1 add adjacent - "vpaddl.u8 q1, q1 \n" - "vpadal.u8 q0, q2 \n" // row 2 add adjacent + row1 - "vpadal.u8 q1, q3 \n" - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - MEMACCESS(2) - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0 \n" + "1: \n" + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "subs %3, %3, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // row 1 add adjacent + "vpaddl.u8 q1, q1 \n" + "vpadal.u8 q0, q2 \n" // row 2 add adjacent + + // row1 + "vpadal.u8 q1, q3 \n" + "vrshrn.u16 d0, q0, #2 \n" // downshift, round and + // pack + "vrshrn.u16 d1, q1, #2 \n" + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "q0", "q1", "memory", "cc" - ); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.8 {d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( - "1: \n" - MEMACCESS(0) - "vld1.8 {q0}, [%0]! \n" // load up 16x4 - MEMACCESS(3) - "vld1.8 {q1}, [%3]! \n" - MEMACCESS(4) - "vld1.8 {q2}, [%4]! \n" - MEMACCESS(5) - "vld1.8 {q3}, [%5]! \n" - "subs %2, %2, #4 \n" - "vpaddl.u8 q0, q0 \n" - "vpadal.u8 q0, q1 \n" - "vpadal.u8 q0, q2 \n" - "vpadal.u8 q0, q3 \n" - "vpaddl.u16 q0, q0 \n" - "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding - "vmovn.u16 d0, q0 \n" - MEMACCESS(1) - "vst1.32 {d0[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_ptr1), // %3 - "+r"(src_ptr2), // %4 - "+r"(src_ptr3) // %5 - : - : "q0", "q1", "q2", "q3", "memory", "cc" - ); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [%3]! \n" + "vld1.8 {q2}, [%4]! \n" + "vld1.8 {q3}, [%5]! \n" + "subs %2, %2, #4 \n" + "vpaddl.u8 q0, q0 \n" + "vpadal.u8 q0, q1 \n" + "vpadal.u8 q0, q2 \n" + "vpadal.u8 q0, q3 \n" + "vpaddl.u16 q0, q0 \n" + "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding + "vmovn.u16 d0, q0 \n" + "vst1.32 {d0[0]}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_ptr1), // %3 + "+r"(src_ptr2), // %4 + "+r"(src_ptr3) // %5 + : + : "q0", "q1", "q2", "q3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. -void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, %2, #24 \n" - "vmov d2, d3 \n" // order d0, d1, d2 - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "d0", "d1", "d2", "d3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #24 \n" + "vmov d2, d3 \n" // order d0, d1, d2 + "vst3.8 {d0, d1, d2}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "d0", "d1", "d2", "d3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "vmovl.u8 q8, d4 \n" - "vmovl.u8 q9, d5 \n" - "vmovl.u8 q10, d6 \n" - "vmovl.u8 q11, d7 \n" - - // 3 * line_0 + line_1 - "vmlal.u8 q8, d0, d24 \n" - "vmlal.u8 q9, d1, d24 \n" - "vmlal.u8 q10, d2, d24 \n" - "vmlal.u8 q11, d3, d24 \n" - - // (3 * line_0 + line_1) >> 2 - "vqrshrn.u16 d0, q8, #2 \n" - "vqrshrn.u16 d1, q9, #2 \n" - "vqrshrn.u16 d2, q10, #2 \n" - "vqrshrn.u16 d3, q11, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q8, d1 \n" - "vmlal.u8 q8, d0, d24 \n" - "vqrshrn.u16 d0, q8, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q8, d2 \n" - "vmlal.u8 q8, d3, d24 \n" - "vqrshrn.u16 d2, q8, #2 \n" - - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! 
\n" // src line 1 + "subs %2, %2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "vmovl.u8 q8, d4 \n" + "vmovl.u8 q9, d5 \n" + "vmovl.u8 q10, d6 \n" + "vmovl.u8 q11, d7 \n" + + // 3 * line_0 + line_1 + "vmlal.u8 q8, d0, d24 \n" + "vmlal.u8 q9, d1, d24 \n" + "vmlal.u8 q10, d2, d24 \n" + "vmlal.u8 q11, d3, d24 \n" + + // (3 * line_0 + line_1) >> 2 + "vqrshrn.u16 d0, q8, #2 \n" + "vqrshrn.u16 d1, q9, #2 \n" + "vqrshrn.u16 d2, q10, #2 \n" + "vqrshrn.u16 d3, q11, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q8, d1 \n" + "vmlal.u8 q8, d0, d24 \n" + "vqrshrn.u16 d0, q8, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q8, d2 \n" + "vmlal.u8 q8, d3, d24 \n" + "vqrshrn.u16 d2, q8, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! \n" + + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", + "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "vmov.u8 d24, #3 \n" - "add %3, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, %2, #24 \n" - // average src line 0 with src line 1 - "vrhadd.u8 q0, q0, q2 \n" - "vrhadd.u8 q1, q1, q3 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "vmovl.u8 q3, d1 \n" - "vmlal.u8 q3, d0, d24 \n" - "vqrshrn.u16 d0, q3, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "vrhadd.u8 d1, d1, d2 \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "vmovl.u8 q3, d2 \n" - "vmlal.u8 q3, d3, d24 \n" - "vqrshrn.u16 d2, q3, #2 \n" - - MEMACCESS(1) - "vst3.8 {d0, d1, d2}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vmov.u8 d24, #3 \n" + "add %3, %0 \n" + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "subs %2, %2, #24 \n" + // average src line 0 with src line 1 + "vrhadd.u8 q0, q0, q2 \n" + "vrhadd.u8 q1, q1, q3 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "vmovl.u8 q3, d1 \n" + "vmlal.u8 q3, d0, d24 \n" + "vqrshrn.u16 d0, q3, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "vrhadd.u8 d1, d1, d2 \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "vmovl.u8 q3, d2 \n" + "vmlal.u8 q3, d3, d24 \n" + "vqrshrn.u16 d2, q3, #2 \n" + + "vst3.8 {d0, d1, d2}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"); } #define HAS_SCALEROWDOWN38_NEON -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 8, 16, 2, 10, 17, 4, 12, + 18, 6, 14, 19, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(3) - "vld1.8 {q3}, [%3] \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, %2, #12 \n" - "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" - "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - MEMACCESS(1) - "vst1.8 {d4}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d5[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "vld1.8 {q3}, [%3] \n" + "1: \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" + "subs %2, %2, #12 \n" + "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" + "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; - - asm volatile ( - MEMACCESS(5) - "vld1.16 {q13}, [%5] \n" - MEMACCESS(6) - "vld1.8 {q14}, [%6] \n" - MEMACCESS(7) - "vld1.8 {q15}, [%7] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - MEMACCESS(4) - "vld4.8 {d16, d17, d18, d19}, [%4]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - "vtrn.u8 d16, d17 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - "vtrn.u8 d18, d19 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - "vpaddl.u8 q8, q8 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - "vpaddl.u8 d19, d19 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 q0, q8 \n" - "vadd.u16 d4, d3, d7 \n" - "vadd.u16 d4, d19 \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "vqrdmulh.s16 q2, q2, q13 \n" - "vmovn.u16 d4, q2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - "vmovl.u8 q9, d18 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - "vadd.u16 q1, q9 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q15 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride), // %3 - "+r"(src_ptr1) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; + + asm volatile( + "vld1.16 {q13}, [%5] \n" + "vld1.8 {q14}, [%6] \n" + "vld1.8 {q15}, [%7] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [%4]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + "vtrn.u8 d16, d17 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + "vtrn.u8 d18, d19 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + "vpaddl.u8 q8, q8 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + "vpaddl.u8 d19, d19 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 q0, q8 \n" + "vadd.u16 d4, d3, d7 \n" + "vadd.u16 d4, d19 \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "vqrdmulh.s16 q2, q2, q13 \n" + "vmovn.u16 d4, q2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + "vmovl.u8 q9, d18 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + "vadd.u16 q1, q9 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q15 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride), // %3 + "+r"(src_ptr1) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", + "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(4) - "vld1.16 {q13}, [%4] \n" - MEMACCESS(5) - "vld1.8 {q14}, [%5] \n" - "add %3, %0 \n" - "1: \n" - - // d0 = 00 40 01 41 02 42 03 43 - // d1 = 10 50 11 51 12 52 13 53 - // d2 = 20 60 21 61 22 62 23 63 - // d3 = 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" - MEMACCESS(3) - "vld4.8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, %2, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 - // d0 = 00 10 01 11 02 12 03 13 - // d1 = 40 50 41 51 42 52 43 53 - "vtrn.u8 d0, d1 \n" - "vtrn.u8 d4, d5 \n" - - // d2 = 20 30 21 31 22 32 23 33 - // d3 = 60 70 61 71 62 72 63 73 - "vtrn.u8 d2, d3 \n" - "vtrn.u8 d6, d7 \n" - - // d0 = 00+10 01+11 02+12 03+13 - // d2 = 40+50 41+51 42+52 43+53 - "vpaddl.u8 q0, q0 \n" - "vpaddl.u8 q2, q2 \n" - - // d3 = 60+70 61+71 62+72 63+73 - "vpaddl.u8 d3, d3 \n" - "vpaddl.u8 d7, d7 \n" - - // combine source lines - "vadd.u16 q0, q2 \n" - "vadd.u16 d4, d3, d7 \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "vqrshrn.u16 d4, q2, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "vmovl.u8 q1, d2 \n" - "vmovl.u8 q3, d6 \n" - - // combine source lines - "vadd.u16 q1, q3 \n" - - // d4 = xx 20 xx 30 xx 22 xx 32 - // d5 = xx 21 xx 31 xx 23 xx 33 - "vtrn.u32 d2, d3 \n" - - // d4 = xx 20 xx 21 xx 22 xx 23 - // d5 = xx 30 xx 31 xx 32 xx 33 - "vtrn.u16 d2, d3 \n" - - // 0+1+2, 3+4+5 - "vadd.u16 q0, q1 \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "vqrdmulh.s16 q0, q0, q13 \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "vmov.u8 d2, d4 \n" - - "vtbl.u8 d3, {d0, d1, d2}, d28 \n" - "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - - MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" - MEMACCESS(1) - "vst1.32 {d4[0]}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "add %3, %0 \n" + "1: \n" + + // d0 = 00 40 01 41 02 42 03 43 + // d1 = 10 50 11 51 12 52 13 53 + // d2 = 20 60 21 61 22 62 23 63 + // d3 = 30 70 31 71 32 72 33 73 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "subs %2, %2, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // d0 = 00 10 01 11 02 12 03 13 + // d1 = 40 50 41 51 42 52 43 53 + "vtrn.u8 d0, d1 \n" + "vtrn.u8 d4, d5 \n" + + // d2 = 20 30 21 31 22 32 23 33 + // d3 = 60 70 61 71 62 72 63 73 + "vtrn.u8 d2, d3 \n" + "vtrn.u8 d6, d7 \n" + + // d0 = 00+10 01+11 02+12 03+13 + // d2 = 40+50 41+51 42+52 43+53 + "vpaddl.u8 q0, q0 \n" + "vpaddl.u8 q2, q2 \n" + + // d3 = 60+70 61+71 62+72 63+73 + "vpaddl.u8 d3, d3 \n" + "vpaddl.u8 d7, d7 \n" + + // combine source lines + "vadd.u16 q0, q2 \n" + "vadd.u16 d4, d3, d7 \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "vqrshrn.u16 d4, q2, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q3, d6 \n" + + // combine source lines + "vadd.u16 q1, q3 \n" + + // d4 = xx 20 xx 30 xx 22 xx 32 + // d5 = xx 21 xx 31 xx 23 xx 33 + "vtrn.u32 d2, d3 \n" + + // d4 = xx 20 xx 21 xx 22 xx 23 + // d5 = xx 30 xx 31 xx 32 xx 33 + "vtrn.u16 d2, d3 \n" + + // 0+1+2, 3+4+5 + "vadd.u16 q0, q1 \n" + + // Need to divide, but can't downshift as the the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "vqrdmulh.s16 q0, q0, q13 \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + "vmov.u8 d2, d4 \n" + + "vtbl.u8 d3, {d0, d1, d2}, d28 \n" + "vtbl.u8 d4, {d0, d1, d2}, d29 \n" + + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov r12, %5 \n" - "veor q2, q2, q2 \n" - "veor q3, q3, q3 \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "vld1.8 {q0}, [%0], %3 \n" - "vaddw.u8 q3, q3, d1 \n" - "vaddw.u8 q2, q2, d0 \n" - "subs r12, r12, #1 \n" - "bgt 2b \n" - MEMACCESS(2) - "vst1.16 {q2, q3}, [%2]! \n" // store pixels - "add %1, %1, #16 \n" - "subs %4, %4, #16 \n" // 16 processed per loop - "bgt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + "vst1.16 {q2, q3}, [%2]! 
\n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" - -// The NEON version mimics this formula: -// #define BLENDER(a, b, f) (uint8)((int)(a) + -// ((int)(f) * ((int)(b) - (int)(a)) >> 16)) - -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "vld2.8 {d6[" #n "], d7[" #n "]}, [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; + const uint8_t* src_tmp = src_ptr; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -617,7 +594,6 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "vadd.s16 q8, q8, q9 \n" "vmovn.s16 d6, q8 \n" - MEMACCESS(0) "vst1.8 {d6}, [%0]! \n" // store pixels "vadd.s32 q1, q1, q0 \n" "vadd.s32 q2, q2, q0 \n" @@ -639,325 +615,299 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - asm volatile ( - "cmp %4, #0 \n" - "beq 100f \n" - "add %2, %1 \n" - "cmp %4, #64 \n" - "beq 75f \n" - "cmp %4, #128 \n" - "beq 50f \n" - "cmp %4, #192 \n" - "beq 25f \n" - - "vdup.8 d5, %4 \n" - "rsb %4, #256 \n" - "vdup.8 d4, %4 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vmull.u8 q13, d0, d4 \n" - "vmull.u8 q14, d1, d4 \n" - "vmlal.u8 q13, d2, d5 \n" - "vmlal.u8 q14, d3, d5 \n" - "vrshrn.u16 d0, q13, #8 \n" - "vrshrn.u16 d1, q14, #8 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q1}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "vld1.8 {q1}, [%1]! \n" - MEMACCESS(2) - "vld1.8 {q0}, [%2]! \n" - "subs %3, %3, #16 \n" - "vrhadd.u8 q0, q1 \n" - "vrhadd.u8 q0, q1 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! \n" - "bgt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. - "100: \n" - MEMACCESS(1) - "vld1.8 {q0}, [%1]! \n" - "subs %3, %3, #16 \n" - MEMACCESS(0) - "vst1.8 {q0}, [%0]! 
\n" - "bgt 100b \n" - - "99: \n" - MEMACCESS(0) - "vst1.8 {d1[7]}, [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction) // %4 - : - : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc" - ); +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + asm volatile( + "cmp %4, #0 \n" + "beq 100f \n" + "add %2, %1 \n" + "cmp %4, #64 \n" + "beq 75f \n" + "cmp %4, #128 \n" + "beq 50f \n" + "cmp %4, #192 \n" + "beq 25f \n" + + "vdup.8 d5, %4 \n" + "rsb %4, #256 \n" + "vdup.8 d4, %4 \n" + // General purpose row blend. + "1: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vmull.u8 q13, d0, d4 \n" + "vmull.u8 q14, d1, d4 \n" + "vmlal.u8 q13, d2, d5 \n" + "vmlal.u8 q14, d3, d5 \n" + "vrshrn.u16 d0, q13, #8 \n" + "vrshrn.u16 d1, q14, #8 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" + "subs %3, %3, #16 \n" + "vrhadd.u8 q0, q1 \n" + "vrhadd.u8 q0, q1 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + "100: \n" + "vld1.8 {q0}, [%1]! \n" + "subs %3, %3, #16 \n" + "vst1.8 {q0}, [%0]! \n" + "bgt 100b \n" + + "99: \n" + "vst1.8 {d1[7]}, [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction) // %4 + : + : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS(0) - "vld2.32 {q0, q1}, [%0]! \n" - MEMACCESS(0) - "vld2.32 {q2, q3}, [%0]! \n" - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {q1}, [%1]! \n" // store odd pixels - MEMACCESS(1) - "vst1.8 {q3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vmov q2, q1 \n" // load next 8 ARGB + "vst2.32 {q2, q3}, [%1]! \n" // store odd pixels + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. 
- "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #1 \n" - "vrshrn.u16 d2, q2, #1 \n" - "vrshrn.u16 d3, q3, #1 \n" - MEMACCESS(1) - "vst4.8 {d0, d1, d2, d3}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); +// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! +// 4a: 3e04 subs r6, #4 +// 4c: f964 118d vld4.32 {d17,d19,d21,d23}, [r4]! +// 50: ef64 21f4 vorr q9, q10, q10 +// 54: f942 038d vst2.32 {d16-d19}, [r2]! +// 58: d1f5 bne.n 46 + +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %2, %2, #8 \n" // 8 processed per loop + "vrhadd.u8 q0, q0, q1 \n" // rounding half add + "vrhadd.u8 q1, q2, q3 \n" // rounding half add + "vst2.32 {q0, q1}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - MEMACCESS(0) - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. - "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. - "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. - "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. - "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. - MEMACCESS(1) - "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels. - MEMACCESS(1) - "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels. - "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. - "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. - "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. - "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. - "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack - "vrshrn.u16 d1, q1, #2 \n" - "vrshrn.u16 d2, q2, #2 \n" - "vrshrn.u16 d3, q3, #2 \n" - MEMACCESS(2) - "vst4.8 {d0, d1, d2, d3}, [%2]! \n" - "bgt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" - ); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + "subs %3, %3, #8 \n" // 8 processed per loop. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB + "vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB + "vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts. 
+ "vpadal.u8 q2, q10 \n" // R 16 bytes -> 8 shorts. + "vpadal.u8 q3, q11 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #2 \n" // round and pack to bytes + "vrshrn.u16 d1, q1, #2 \n" + "vrshrn.u16 d2, q2, #2 \n" + "vrshrn.u16 d3, q3, #2 \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %3, lsl #2 \n" - "1: \n" - MEMACCESS(0) - "vld1.32 {d0[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d0[1]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[0]}, [%0], r12 \n" - MEMACCESS(0) - "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "vst1.8 {q0}, [%1]! \n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"(src_stepx) // %3 - : "memory", "cc", "r12", "q0" - ); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "mov r12, %3, lsl #2 \n" + "1: \n" + "vld1.32 {d0[0]}, [%0], r12 \n" + "vld1.32 {d0[1]}, [%0], r12 \n" + "vld1.32 {d1[0]}, [%0], r12 \n" + "vld1.32 {d1[1]}, [%0], r12 \n" + "subs %2, %2, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"(src_stepx) // %3 + : "memory", "cc", "r12", "q0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "mov r12, %4, lsl #2 \n" - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "vld1.8 {d1}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d2}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d3}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d4}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d5}, [%1], r12 \n" - MEMACCESS(0) - "vld1.8 {d6}, [%0], r12 \n" - MEMACCESS(1) - "vld1.8 {d7}, [%1], r12 \n" - "vaddl.u8 q0, d0, d1 \n" - "vaddl.u8 q1, d2, d3 \n" - "vaddl.u8 q2, d4, d5 \n" - "vaddl.u8 q3, d6, d7 \n" - "vswp.8 d1, d2 \n" // ab_cd -> ac_bd - "vswp.8 d5, d6 \n" // ef_gh -> eg_fh - "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) - "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) - "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. - "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, %3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "vst1.8 {q0}, [%2]! 
\n" - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"(src_stepx) // %4 - : "memory", "cc", "r12", "q0", "q1", "q2", "q3" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "mov r12, %4, lsl #2 \n" + "add %1, %1, %0 \n" + "1: \n" + "vld1.8 {d0}, [%0], r12 \n" // 4 2x2 blocks -> 2x1 + "vld1.8 {d1}, [%1], r12 \n" + "vld1.8 {d2}, [%0], r12 \n" + "vld1.8 {d3}, [%1], r12 \n" + "vld1.8 {d4}, [%0], r12 \n" + "vld1.8 {d5}, [%1], r12 \n" + "vld1.8 {d6}, [%0], r12 \n" + "vld1.8 {d7}, [%1], r12 \n" + "vaddl.u8 q0, d0, d1 \n" + "vaddl.u8 q1, d2, d3 \n" + "vaddl.u8 q2, d4, d5 \n" + "vaddl.u8 q3, d6, d7 \n" + "vswp.8 d1, d2 \n" // ab_cd -> ac_bd + "vswp.8 d5, d6 \n" // ef_gh -> eg_fh + "vadd.u16 q0, q0, q1 \n" // (a+b)_(c+d) + "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) + "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. + "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. + "subs %3, %3, #4 \n" // 4 pixels per loop. + "vst1.8 {q0}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"(src_stepx) // %4 + : "memory", "cc", "r12", "q0", "q1", "q2", "q3"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(dn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld1.32 {"#dn"["#n"]}, [%6] \n" - -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld1.32 {" #dn "[" #n "]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int tmp; - const uint8* src_tmp = src_argb; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(d0, 0) - LOAD1_DATA32_LANE(d0, 1) - LOAD1_DATA32_LANE(d1, 0) - LOAD1_DATA32_LANE(d1, 1) - LOAD1_DATA32_LANE(d2, 0) - LOAD1_DATA32_LANE(d2, 1) - LOAD1_DATA32_LANE(d3, 0) - LOAD1_DATA32_LANE(d3, 1) - - MEMACCESS(0) - "vst1.32 {q0, q1}, [%0]! \n" // store pixels - "subs %2, %2, #8 \n" // 8 processed per loop - "bgt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 - "+r"(x), // %3 - "+r"(dx), // %4 - "=&r"(tmp), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "q0", "q1" - ); + const uint8_t* src_tmp = src_argb; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + // clang-format on + "vst1.32 {q0, q1}, [%0]! 
\n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "=&r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(dn1, dn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "vld2.32 {" #dn1 "[" #n "], " #dn2 "[" #n "]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; + const uint8_t* src_tmp = src_argb; asm volatile ( "vdup.32 q0, %3 \n" // x "vdup.32 q1, %4 \n" // dx @@ -993,7 +943,6 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, "vshrn.i16 d0, q11, #7 \n" "vshrn.i16 d1, q12, #7 \n" - MEMACCESS(0) "vst1.32 {d0, d1}, [%0]! \n" // store pixels "vadd.s32 q8, q8, q9 \n" "subs %2, %2, #4 \n" // 4 processed per loop diff --git a/libs/libvpx/third_party/libyuv/source/scale_neon64.cc b/libs/libvpx/third_party/libyuv/source/scale_neon64.cc index ff277f26ff6..494a9cfbfbe 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_neon64.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/scale.h" #include "libyuv/row.h" +#include "libyuv/scale.h" #include "libyuv/scale_row.h" #ifdef __cplusplus @@ -21,580 +21,556 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) // Read 32x1 throw away even pixels, and write 16x1. -void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into v0, odd into v1 - MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); +void ScaleRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "st1 {v1.16b}, [%1], #16 \n" // store odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x1 average down and write 16x1. 
-void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc - "subs %w2, %w2, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // add adjacent - "uaddlp v1.8h, v1.16b \n" - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #1 \n" - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1" // Clobber List - ); +void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load even pixels into v0, odd into v1 + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); } // Read 32x2 average down and write 16x1. -void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc - MEMACCESS(1) - "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %w3, %w3, #16 \n" // 16 processed per loop - "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent - "uaddlp v1.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 - "uadalp v1.8h, v3.16b \n" - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn2 v0.16b, v1.8h, #2 \n" - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(src_stride), // %1 - "+r"(dst), // %2 - "+r"(dst_width) // %3 - : - : "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent + "uaddlp v1.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" // += row 2 add adjacent + "uadalp v1.8h, v3.16b \n" + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn2 v0.16b, v1.8h, #2 \n" + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); +void ScaleRowDown4_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #8 \n" // 8 processed per loop + "st1 {v2.8b}, 
[%1], #8 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride; - const uint8* src_ptr2 = src_ptr + src_stride * 2; - const uint8* src_ptr3 = src_ptr + src_stride * 3; -asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 - MEMACCESS(3) - "ld1 {v1.16b}, [%2], #16 \n" - MEMACCESS(4) - "ld1 {v2.16b}, [%3], #16 \n" - MEMACCESS(5) - "ld1 {v3.16b}, [%4], #16 \n" - "subs %w5, %w5, #4 \n" - "uaddlp v0.8h, v0.16b \n" - "uadalp v0.8h, v1.16b \n" - "uadalp v0.8h, v2.16b \n" - "uadalp v0.8h, v3.16b \n" - "addp v0.8h, v0.8h, v0.8h \n" - "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding - MEMACCESS(1) - "st1 {v0.s}[0], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(src_ptr1), // %2 - "+r"(src_ptr2), // %3 - "+r"(src_ptr3), // %4 - "+r"(dst_width) // %5 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); +void ScaleRowDown4Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + const uint8_t* src_ptr2 = src_ptr + src_stride * 2; + const uint8_t* src_ptr3 = src_ptr + src_stride * 3; + asm volatile( + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load up 16x4 + "ld1 {v1.16b}, [%2], #16 \n" + "ld1 {v2.16b}, [%3], #16 \n" + "ld1 {v3.16b}, [%4], #16 \n" + "subs %w5, %w5, #4 \n" + "uaddlp v0.8h, v0.16b \n" + "uadalp v0.8h, v1.16b \n" + "uadalp v0.8h, v2.16b \n" + "uadalp v0.8h, v3.16b \n" + "addp v0.8h, v0.8h, v0.8h \n" + "rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding + "st1 {v0.s}[0], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(src_ptr1), // %2 + "+r"(src_ptr2), // %3 + "+r"(src_ptr3), // %4 + "+r"(dst_width) // %5 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } // Down scale from 4 to 3 pixels. Use the neon multilane read/write // to load up the every 4th pixel into a 4 different registers. // Point samples 32 pixels to 24 pixels. 
-void ScaleRowDown34_NEON(const uint8* src_ptr, +void ScaleRowDown34_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %w2, %w2, #24 \n" - "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "subs %w2, %w2, #24 \n" + "orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2 + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1", "v2", "v3", "memory", "cc"); } -void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - - // filter src line 0 with src line 1 - // expand chars to shorts to allow for room - // when adding lines together - "ushll v16.8h, v4.8b, #0 \n" - "ushll v17.8h, v5.8b, #0 \n" - "ushll v18.8h, v6.8b, #0 \n" - "ushll v19.8h, v7.8b, #0 \n" - - // 3 * line_0 + line_1 - "umlal v16.8h, v0.8b, v20.8b \n" - "umlal v17.8h, v1.8b, v20.8b \n" - "umlal v18.8h, v2.8b, v20.8b \n" - "umlal v19.8h, v3.8b, v20.8b \n" - - // (3 * line_0 + line_1) >> 2 - "uqrshrn v0.8b, v16.8h, #2 \n" - "uqrshrn v1.8b, v17.8h, #2 \n" - "uqrshrn v2.8b, v18.8h, #2 \n" - "uqrshrn v3.8b, v19.8h, #2 \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v16.8h, v1.8b, #0 \n" - "umlal v16.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v16.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v16.8h, v2.8b, #0 \n" - "umlal v16.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v16.8h, #2 \n" - - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", - "v20", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + + // filter src line 0 with src line 1 + // expand chars to shorts to allow for room + // when adding lines together + "ushll v16.8h, v4.8b, #0 \n" + "ushll v17.8h, v5.8b, #0 \n" + "ushll v18.8h, v6.8b, #0 \n" + "ushll v19.8h, v7.8b, #0 \n" + + // 3 * line_0 + line_1 + "umlal v16.8h, v0.8b, v20.8b \n" + "umlal v17.8h, v1.8b, v20.8b \n" + "umlal v18.8h, v2.8b, v20.8b \n" + "umlal v19.8h, v3.8b, v20.8b \n" + + // (3 * line_0 + line_1) >> 2 + "uqrshrn v0.8b, v16.8h, #2 \n" + "uqrshrn v1.8b, v17.8h, #2 \n" + "uqrshrn v2.8b, v18.8h, #2 \n" + "uqrshrn v3.8b, v19.8h, #2 \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v16.8h, v1.8b, #0 \n" + "umlal v16.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v16.8h, #2 \n" + 
+ // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v16.8h, v2.8b, #0 \n" + "umlal v16.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v16.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "memory", "cc"); } -void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "movi v20.8b, #3 \n" - "add %3, %3, %0 \n" - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %w2, %w2, #24 \n" - // average src line 0 with src line 1 - "urhadd v0.8b, v0.8b, v4.8b \n" - "urhadd v1.8b, v1.8b, v5.8b \n" - "urhadd v2.8b, v2.8b, v6.8b \n" - "urhadd v3.8b, v3.8b, v7.8b \n" - - // a0 = (src[0] * 3 + s[1] * 1) >> 2 - "ushll v4.8h, v1.8b, #0 \n" - "umlal v4.8h, v0.8b, v20.8b \n" - "uqrshrn v0.8b, v4.8h, #2 \n" - - // a1 = (src[1] * 1 + s[2] * 1) >> 1 - "urhadd v1.8b, v1.8b, v2.8b \n" - - // a2 = (src[2] * 1 + s[3] * 3) >> 2 - "ushll v4.8h, v2.8b, #0 \n" - "umlal v4.8h, v3.8b, v20.8b \n" - "uqrshrn v2.8b, v4.8h, #2 \n" - - MEMACCESS(1) - "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width), // %2 - "+r"(src_stride) // %3 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "movi v20.8b, #3 \n" + "add %3, %3, %0 \n" + "1: \n" + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 + "subs %w2, %w2, #24 \n" + // average src line 0 with src line 1 + "urhadd v0.8b, v0.8b, v4.8b \n" + "urhadd v1.8b, v1.8b, v5.8b \n" + "urhadd v2.8b, v2.8b, v6.8b \n" + "urhadd v3.8b, v3.8b, v7.8b \n" + + // a0 = (src[0] * 3 + s[1] * 1) >> 2 + "ushll v4.8h, v1.8b, #0 \n" + "umlal v4.8h, v0.8b, v20.8b \n" + "uqrshrn v0.8b, v4.8h, #2 \n" + + // a1 = (src[1] * 1 + s[2] * 1) >> 1 + "urhadd v1.8b, v1.8b, v2.8b \n" + + // a2 = (src[2] * 1 + s[3] * 3) >> 2 + "ushll v4.8h, v2.8b, #0 \n" + "umlal v4.8h, v3.8b, v20.8b \n" + "uqrshrn v2.8b, v4.8h, #2 \n" + + "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_stride) // %3 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "memory", "cc"); } -static uvec8 kShuf38 = - { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 }; -static uvec8 kShuf38_2 = - { 0, 16, 32, 2, 18, 33, 4, 20, 34, 6, 22, 35, 0, 0, 0, 0 }; -static vec16 kMult38_Div6 = - { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12, - 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 }; -static vec16 kMult38_Div9 = - { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18, - 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 }; +static const uvec8 kShuf38 = {0, 3, 6, 8, 11, 14, 16, 19, + 22, 24, 27, 30, 0, 0, 0, 0}; +static const uvec8 kShuf38_2 = {0, 16, 32, 2, 18, 33, 4, 20, + 34, 6, 22, 35, 0, 0, 0, 0}; +static const vec16 kMult38_Div6 = {65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12, 65536 / 12, + 65536 / 12, 65536 / 12}; +static const vec16 kMult38_Div9 = {65536 / 18, 65536 / 18, 65536 / 18, + 65536 / 18, 
65536 / 18, 65536 / 18, + 65536 / 18, 65536 / 18}; // 32 -> 12 -void ScaleRowDown38_NEON(const uint8* src_ptr, +void ScaleRowDown38_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - MEMACCESS(3) - "ld1 {v3.16b}, [%3] \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %w2, %w2, #12 \n" - "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" - MEMACCESS(1) - "st1 {v2.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v2.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(dst_width) // %2 - : "r"(&kShuf38) // %3 - : "v0", "v1", "v2", "v3", "memory", "cc" - ); + uint8_t* dst_ptr, + int dst_width) { + (void)src_stride; + asm volatile( + "ld1 {v3.16b}, [%3] \n" + "1: \n" + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #12 \n" + "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" + "st1 {v2.8b}, [%1], #8 \n" + "st1 {v2.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"(&kShuf38) // %3 + : "v0", "v1", "v2", "v3", "memory", "cc"); } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - const uint8* src_ptr1 = src_ptr + src_stride * 2; + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride * 2; ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(5) - "ld1 {v29.8h}, [%5] \n" - MEMACCESS(6) - "ld1 {v30.16b}, [%6] \n" - MEMACCESS(7) - "ld1 {v31.8h}, [%7] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - MEMACCESS(4) - "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %w4, %w4, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v20.8b, v0.8b, v1.8b \n" - "trn2 v21.8b, v0.8b, v1.8b \n" - "trn1 v22.8b, v4.8b, v5.8b \n" - "trn2 v23.8b, v4.8b, v5.8b \n" - "trn1 v24.8b, v16.8b, v17.8b \n" - "trn2 v25.8b, v16.8b, v17.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - "trn1 v16.8b, v18.8b, v19.8b \n" - "trn2 v17.8b, v18.8b, v19.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v20.4h, v20.8b \n" - "uaddlp v21.4h, v21.8b \n" - "uaddlp v22.4h, v22.8b \n" - "uaddlp v23.4h, v23.8b \n" - "uaddlp v24.4h, v24.8b \n" - "uaddlp v25.4h, v25.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - "uaddlp v17.4h, v17.8b \n" - - // combine source lines - "add v20.4h, v20.4h, v22.4h \n" - "add v21.4h, v21.4h, v23.4h \n" - "add v20.4h, v20.4h, v24.4h \n" - "add v21.4h, v21.4h, v25.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - "add v2.4h, v2.4h, v17.4h \n" - - // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] - // + s[6 + st * 1] + s[7 + st * 1] - // + s[6 + st * 2] + s[7 + st * 2]) / 6 - "sqrdmulh v2.8h, v2.8h, v29.8h \n" - "xtn v2.8b, v2.8h \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. 
This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned. - // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - "ushll v16.8h, v16.8b, #0 \n" - "uaddl v0.8h, v0.8b, v4.8b \n" - - // combine source lines - "add v0.8h, v0.8h, v16.8h \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - - // 0+1+2, 3+4+5 - "add v20.8h, v20.8h, v0.8h \n" - "add v21.8h, v21.8h, v4.8h \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v20.8h, v31.8h \n" - "sqrdmulh v1.8h, v21.8h, v31.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" - - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(src_ptr1), // %3 - "+r"(dst_width) // %4 - : "r"(&kMult38_Div6), // %5 - "r"(&kShuf38_2), // %6 - "r"(&kMult38_Div9) // %7 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", - "v30", "v31", "memory", "cc" - ); + asm volatile( + "ld1 {v29.8h}, [%5] \n" + "ld1 {v30.16b}, [%6] \n" + "ld1 {v31.8h}, [%7] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" + "subs %w4, %w4, #12 \n" + + // Shuffle the input data around to get align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v20.8b, v0.8b, v1.8b \n" + "trn2 v21.8b, v0.8b, v1.8b \n" + "trn1 v22.8b, v4.8b, v5.8b \n" + "trn2 v23.8b, v4.8b, v5.8b \n" + "trn1 v24.8b, v16.8b, v17.8b \n" + "trn2 v25.8b, v16.8b, v17.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + "trn1 v16.8b, v18.8b, v19.8b \n" + "trn2 v17.8b, v18.8b, v19.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v20.4h, v20.8b \n" + "uaddlp v21.4h, v21.8b \n" + "uaddlp v22.4h, v22.8b \n" + "uaddlp v23.4h, v23.8b \n" + "uaddlp v24.4h, v24.8b \n" + "uaddlp v25.4h, v25.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + "uaddlp v17.4h, v17.8b \n" + + // combine source lines + "add v20.4h, v20.4h, v22.4h \n" + "add v21.4h, v21.4h, v23.4h \n" + "add v20.4h, v20.4h, v24.4h \n" + "add v21.4h, v21.4h, v25.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + "add v2.4h, v2.4h, v17.4h \n" + + // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0] + // + s[6 + st * 1] + s[7 + st * 1] + // + s[6 + st * 2] + s[7 + st * 2]) / 6 + "sqrdmulh v2.8h, v2.8h, v29.8h \n" + "xtn v2.8b, v2.8h \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. 
+ // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + "ushll v16.8h, v16.8b, #0 \n" + "uaddl v0.8h, v0.8b, v4.8b \n" + + // combine source lines + "add v0.8h, v0.8h, v16.8h \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v20.8h, v20.8h, v0.8h \n" + "add v21.8h, v21.8h, v4.8h \n" + + // Need to divide, but can't downshift as the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits. + "sqrdmulh v0.8h, v20.8h, v31.8h \n" + "sqrdmulh v1.8h, v21.8h, v31.8h \n" + + // Align for table lookup, vtbl requires registers to be adjacent + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v30.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(src_ptr1), // %3 + "+r"(dst_width) // %4 + : "r"(&kMult38_Div6), // %5 + "r"(&kShuf38_2), // %6 + "r"(&kMult38_Div9) // %7 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v29", "v30", "v31", + "memory", "cc"); } // 32x2 -> 12x1 -void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { + uint8_t* dst_ptr, + int dst_width) { // TODO(fbarchard): use src_stride directly for clang 3.5+. ptrdiff_t tmp_src_stride = src_stride; - asm volatile ( - MEMACCESS(4) - "ld1 {v30.8h}, [%4] \n" - MEMACCESS(5) - "ld1 {v31.16b}, [%5] \n" - "add %2, %2, %0 \n" - "1: \n" - - // 00 40 01 41 02 42 03 43 - // 10 50 11 51 12 52 13 53 - // 20 60 21 61 22 62 23 63 - // 30 70 31 71 32 72 33 73 - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" - MEMACCESS(3) - "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %w3, %w3, #12 \n" - - // Shuffle the input data around to get align the data - // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 - // 00 10 01 11 02 12 03 13 - // 40 50 41 51 42 52 43 53 - "trn1 v16.8b, v0.8b, v1.8b \n" - "trn2 v17.8b, v0.8b, v1.8b \n" - "trn1 v18.8b, v4.8b, v5.8b \n" - "trn2 v19.8b, v4.8b, v5.8b \n" - - // 20 30 21 31 22 32 23 33 - // 60 70 61 71 62 72 63 73 - "trn1 v0.8b, v2.8b, v3.8b \n" - "trn2 v1.8b, v2.8b, v3.8b \n" - "trn1 v4.8b, v6.8b, v7.8b \n" - "trn2 v5.8b, v6.8b, v7.8b \n" - - // 00+10 01+11 02+12 03+13 - // 40+50 41+51 42+52 43+53 - "uaddlp v16.4h, v16.8b \n" - "uaddlp v17.4h, v17.8b \n" - "uaddlp v18.4h, v18.8b \n" - "uaddlp v19.4h, v19.8b \n" - - // 60+70 61+71 62+72 63+73 - "uaddlp v1.4h, v1.8b \n" - "uaddlp v5.4h, v5.8b \n" - - // combine source lines - "add v16.4h, v16.4h, v18.4h \n" - "add v17.4h, v17.4h, v19.4h \n" - "add v2.4h, v1.4h, v5.4h \n" - - // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 - "uqrshrn v2.8b, v2.8h, #2 \n" - - // Shuffle 2,3 reg around so that 2 can be added to the - // 0,1 reg and 3 can be added to the 4,5 reg. This - // requires expanding from u8 to u16 as the 0,1 and 4,5 - // registers are already expanded. Then do transposes - // to get aligned.
- // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 - - // combine source lines - "uaddl v0.8h, v0.8b, v4.8b \n" - - // xx 20 xx 21 xx 22 xx 23 - // xx 30 xx 31 xx 32 xx 33 - "trn1 v1.8h, v0.8h, v0.8h \n" - "trn2 v4.8h, v0.8h, v0.8h \n" - "xtn v0.4h, v1.4s \n" - "xtn v4.4h, v4.4s \n" - - // 0+1+2, 3+4+5 - "add v16.8h, v16.8h, v0.8h \n" - "add v17.8h, v17.8h, v4.8h \n" - - // Need to divide, but can't downshift as the the value - // isn't a power of 2. So multiply by 65536 / n - // and take the upper 16 bits. - "sqrdmulh v0.8h, v16.8h, v30.8h \n" - "sqrdmulh v1.8h, v17.8h, v30.8h \n" - - // Align for table lookup, vtbl requires registers to - // be adjacent - - "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" - - MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" - MEMACCESS(1) - "st1 {v3.s}[2], [%1], #4 \n" - "b.gt 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+r"(tmp_src_stride), // %2 - "+r"(dst_width) // %3 - : "r"(&kMult38_Div6), // %4 - "r"(&kShuf38_2) // %5 - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", - "v18", "v19", "v30", "v31", "memory", "cc" - ); + asm volatile( + "ld1 {v30.8h}, [%4] \n" + "ld1 {v31.16b}, [%5] \n" + "add %2, %2, %0 \n" + "1: \n" + + // 00 40 01 41 02 42 03 43 + // 10 50 11 51 12 52 13 53 + // 20 60 21 61 22 62 23 63 + // 30 70 31 71 32 72 33 73 + "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" + "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" + "subs %w3, %w3, #12 \n" + + // Shuffle the input data around to align the data + // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 + // 00 10 01 11 02 12 03 13 + // 40 50 41 51 42 52 43 53 + "trn1 v16.8b, v0.8b, v1.8b \n" + "trn2 v17.8b, v0.8b, v1.8b \n" + "trn1 v18.8b, v4.8b, v5.8b \n" + "trn2 v19.8b, v4.8b, v5.8b \n" + + // 20 30 21 31 22 32 23 33 + // 60 70 61 71 62 72 63 73 + "trn1 v0.8b, v2.8b, v3.8b \n" + "trn2 v1.8b, v2.8b, v3.8b \n" + "trn1 v4.8b, v6.8b, v7.8b \n" + "trn2 v5.8b, v6.8b, v7.8b \n" + + // 00+10 01+11 02+12 03+13 + // 40+50 41+51 42+52 43+53 + "uaddlp v16.4h, v16.8b \n" + "uaddlp v17.4h, v17.8b \n" + "uaddlp v18.4h, v18.8b \n" + "uaddlp v19.4h, v19.8b \n" + + // 60+70 61+71 62+72 63+73 + "uaddlp v1.4h, v1.8b \n" + "uaddlp v5.4h, v5.8b \n" + + // combine source lines + "add v16.4h, v16.4h, v18.4h \n" + "add v17.4h, v17.4h, v19.4h \n" + "add v2.4h, v1.4h, v5.4h \n" + + // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4 + "uqrshrn v2.8b, v2.8h, #2 \n" + + // Shuffle 2,3 reg around so that 2 can be added to the + // 0,1 reg and 3 can be added to the 4,5 reg. This + // requires expanding from u8 to u16 as the 0,1 and 4,5 + // registers are already expanded. Then do transposes + // to get aligned. + // xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33 + + // combine source lines + "uaddl v0.8h, v0.8b, v4.8b \n" + + // xx 20 xx 21 xx 22 xx 23 + // xx 30 xx 31 xx 32 xx 33 + "trn1 v1.8h, v0.8h, v0.8h \n" + "trn2 v4.8h, v0.8h, v0.8h \n" + "xtn v0.4h, v1.4s \n" + "xtn v4.4h, v4.4s \n" + + // 0+1+2, 3+4+5 + "add v16.8h, v16.8h, v0.8h \n" + "add v17.8h, v17.8h, v4.8h \n" + + // Need to divide, but can't downshift as the value + // isn't a power of 2. So multiply by 65536 / n + // and take the upper 16 bits.
+ "sqrdmulh v0.8h, v16.8h, v30.8h \n" + "sqrdmulh v1.8h, v17.8h, v30.8h \n" + + // Align for table lookup, vtbl requires registers to + // be adjacent + + "tbl v3.16b, {v0.16b, v1.16b, v2.16b}, v31.16b \n" + + "st1 {v3.8b}, [%1], #8 \n" + "st1 {v3.s}[2], [%1], #4 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(tmp_src_stride), // %2 + "+r"(dst_width) // %3 + : "r"(&kMult38_Div6), // %4 + "r"(&kShuf38_2) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19", "v30", "v31", "memory", "cc"); } -void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { - const uint8* src_tmp; - asm volatile ( - "1: \n" - "mov %0, %1 \n" - "mov w12, %w5 \n" - "eor v2.16b, v2.16b, v2.16b \n" - "eor v3.16b, v3.16b, v3.16b \n" - "2: \n" - // load 16 pixels into q0 - MEMACCESS(0) - "ld1 {v0.16b}, [%0], %3 \n" - "uaddw2 v3.8h, v3.8h, v0.16b \n" - "uaddw v2.8h, v2.8h, v0.8b \n" - "subs w12, w12, #1 \n" - "b.gt 2b \n" - MEMACCESS(2) - "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels - "add %1, %1, #16 \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop - "b.gt 1b \n" - : "=&r"(src_tmp), // %0 - "+r"(src_ptr), // %1 - "+r"(dst_ptr), // %2 - "+r"(src_stride), // %3 - "+r"(src_width), // %4 - "+r"(src_height) // %5 - : - : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleAddRows_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst_ptr, + int src_width, + int src_height) { + const uint8_t* src_tmp; + asm volatile( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "=&r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA8_LANE(n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {v4.b, v5.b}["#n"], [%6] \n" - -void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {v4.b, v5.b}[" #n "], [%6] \n" + +// The NEON version mimics this formula (from row_common.cc): +// #define BLENDER(a, b, f) (uint8_t)((int)(a) + +// ((((int)((f)) * ((int)(b) - (int)(a))) + 0x8000) >> 16)) + +void ScaleFilterCols_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_ptr; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
- int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + const uint8_t* src_tmp = src_ptr; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -626,12 +602,11 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "ushll2 v6.4s, v6.8h, #0 \n" "mul v16.4s, v16.4s, v7.4s \n" "mul v17.4s, v17.4s, v6.4s \n" - "rshrn v6.4h, v16.4s, #16 \n" - "rshrn2 v6.8h, v17.4s, #16 \n" + "rshrn v6.4h, v16.4s, #16 \n" + "rshrn2 v6.8h, v17.4s, #16 \n" "add v4.8h, v4.8h, v6.8h \n" "xtn v4.8b, v4.8h \n" - MEMACCESS(0) "st1 {v4.8b}, [%0], #8 \n" // store pixels "add v1.4s, v1.4s, v0.4s \n" "add v2.4s, v2.4s, v0.4s \n" @@ -639,7 +614,7 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, "b.gt 1b \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -653,331 +628,300 @@ void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, #undef LOAD2_DATA8_LANE // 16x2 -> 16x1 -void ScaleFilterRows_NEON(uint8* dst_ptr, - const uint8* src_ptr, ptrdiff_t src_stride, - int dst_width, int source_y_fraction) { - int y_fraction = 256 - source_y_fraction; - asm volatile ( - "cmp %w4, #0 \n" - "b.eq 100f \n" - "add %2, %2, %1 \n" - "cmp %w4, #64 \n" - "b.eq 75f \n" - "cmp %w4, #128 \n" - "b.eq 50f \n" - "cmp %w4, #192 \n" - "b.eq 25f \n" - - "dup v5.8b, %w4 \n" - "dup v4.8b, %w5 \n" - // General purpose row blend. - "1: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "umull v6.8h, v0.8b, v4.8b \n" - "umull2 v7.8h, v0.16b, v4.16b \n" - "umlal v6.8h, v1.8b, v5.8b \n" - "umlal2 v7.8h, v1.16b, v5.16b \n" - "rshrn v0.8b, v6.8h, #8 \n" - "rshrn2 v0.16b, v7.8h, #8 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 1b \n" - "b 99f \n" - - // Blend 25 / 75. - "25: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 25b \n" - "b 99f \n" - - // Blend 50 / 50. - "50: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v1.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 50b \n" - "b 99f \n" - - // Blend 75 / 25. - "75: \n" - MEMACCESS(1) - "ld1 {v1.16b}, [%1], #16 \n" - MEMACCESS(2) - "ld1 {v0.16b}, [%2], #16 \n" - "subs %w3, %w3, #16 \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - "urhadd v0.16b, v0.16b, v1.16b \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 75b \n" - "b 99f \n" - - // Blend 100 / 0 - Copy row unchanged. 
- "100: \n" - MEMACCESS(1) - "ld1 {v0.16b}, [%1], #16 \n" - "subs %w3, %w3, #16 \n" - MEMACCESS(0) - "st1 {v0.16b}, [%0], #16 \n" - "b.gt 100b \n" - - "99: \n" - MEMACCESS(0) - "st1 {v0.b}[15], [%0] \n" - : "+r"(dst_ptr), // %0 - "+r"(src_ptr), // %1 - "+r"(src_stride), // %2 - "+r"(dst_width), // %3 - "+r"(source_y_fraction),// %4 - "+r"(y_fraction) // %5 - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc" - ); +void ScaleFilterRows_NEON(uint8_t* dst_ptr, + const uint8_t* src_ptr, + ptrdiff_t src_stride, + int dst_width, + int source_y_fraction) { + int y_fraction = 256 - source_y_fraction; + asm volatile( + "cmp %w4, #0 \n" + "b.eq 100f \n" + "add %2, %2, %1 \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" + "cmp %w4, #128 \n" + "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" + + "dup v5.8b, %w4 \n" + "dup v4.8b, %w5 \n" + // General purpose row blend. + "1: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "umull v6.8h, v0.8b, v4.8b \n" + "umull2 v7.8h, v0.16b, v4.16b \n" + "umlal v6.8h, v1.8b, v5.8b \n" + "umlal2 v7.8h, v1.16b, v5.16b \n" + "rshrn v0.8b, v6.8h, #8 \n" + "rshrn2 v0.16b, v7.8h, #8 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 1b \n" + "b 99f \n" + + // Blend 25 / 75. + "25: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + + // Blend 50 / 50. + "50: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 50b \n" + "b 99f \n" + + // Blend 75 / 25. + "75: \n" + "ld1 {v1.16b}, [%1], #16 \n" + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + + // Blend 100 / 0 - Copy row unchanged. 
+ "100: \n" + "ld1 {v0.16b}, [%1], #16 \n" + "subs %w3, %w3, #16 \n" + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 100b \n" + + "99: \n" + "st1 {v0.b}[15], [%0] \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(src_stride), // %2 + "+r"(dst_width), // %3 + "+r"(source_y_fraction), // %4 + "+r"(y_fraction) // %5 + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "memory", "cc"); } -void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - "1: \n" - // load even pixels into q0, odd into q1 - MEMACCESS (0) - "ld2 {v0.4s, v1.4s}, [%0], #32 \n" - MEMACCESS (0) - "ld2 {v2.4s, v3.4s}, [%0], #32 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop - MEMACCESS (1) - "st1 {v1.16b}, [%1], #16 \n" // store odd pixels - MEMACCESS (1) - "st1 {v3.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (dst), // %1 - "+r" (dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "mov v2.16b, v3.16b \n" + "st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS (0) - // load 8 ARGB pixels. - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. - "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #1 \n" - "rshrn v2.8b, v2.8h, #1 \n" - "rshrn v3.8b, v3.8h, #1 \n" - MEMACCESS (1) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : - : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); +void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + // load 16 ARGB pixels with even pixels into q0/q2, odd into q1/q3 + "ld4 {v0.4s,v1.4s,v2.4s,v3.4s}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + + "urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add + "urhadd v1.16b, v2.16b, v3.16b \n" + "st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); } -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - asm volatile ( - // change the stride to row 2 pointer - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS (0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. - "subs %w3, %w3, #8 \n" // 8 processed per loop. - "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. - "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
- "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - MEMACCESS (1) - "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 more ARGB pixels. - "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. - "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. - "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. - "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. - "rshrn v0.8b, v0.8h, #2 \n" // downshift, round and pack - "rshrn v1.8b, v1.8h, #2 \n" - "rshrn v2.8b, v2.8h, #2 \n" - "rshrn v3.8b, v3.8h, #2 \n" - MEMACCESS (2) - "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" - "b.gt 1b \n" - : "+r" (src_ptr), // %0 - "+r" (src_stride), // %1 - "+r" (dst), // %2 - "+r" (dst_width) // %3 - : - : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19" - ); +void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %1, %0 \n" + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "ld4 {v16.16b,v17.16b,v18.16b,v19.16b}, [%1], #64 \n" // load 8 + "uadalp v0.8h, v16.16b \n" // B 16 bytes -> 8 shorts. + "uadalp v1.8h, v17.16b \n" // G 16 bytes -> 8 shorts. + "uadalp v2.8h, v18.16b \n" // R 16 bytes -> 8 shorts. + "uadalp v3.8h, v19.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #2 \n" // round and pack + "rshrn v1.8b, v1.8h, #2 \n" + "rshrn v2.8b, v2.8h, #2 \n" + "rshrn v3.8b, v3.8h, #2 \n" + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, uint8* dst_argb, int dst_width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld1 {v0.s}[0], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[1], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[2], [%0], %3 \n" - MEMACCESS(0) - "ld1 {v0.s}[3], [%0], %3 \n" - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - MEMACCESS(1) - "st1 {v0.16b}, [%1], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(dst_width) // %2 - : "r"((int64)(src_stepx * 4)) // %3 - : "memory", "cc", "v0" - ); +void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { + (void)src_stride; + asm volatile( + "1: \n" + "ld1 {v0.s}[0], [%0], %3 \n" + "ld1 {v0.s}[1], [%0], %3 \n" + "ld1 {v0.s}[2], [%0], %3 \n" + "ld1 {v0.s}[3], [%0], %3 \n" + "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : "r"((int64_t)(src_stepx * 4)) // %3 + : "memory", "cc", "v0"); } // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. // TODO(Yang Zhang): Might be worth another optimization pass in future. // It could be upgraded to 8 pixels at a time to start with. 
-void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb, + ptrdiff_t src_stride, int src_stepx, - uint8* dst_argb, int dst_width) { - asm volatile ( - "add %1, %1, %0 \n" - "1: \n" - MEMACCESS(0) - "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1 - MEMACCESS(1) - "ld1 {v1.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v2.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v3.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v4.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v5.8b}, [%1], %4 \n" - MEMACCESS(0) - "ld1 {v6.8b}, [%0], %4 \n" - MEMACCESS(1) - "ld1 {v7.8b}, [%1], %4 \n" - "uaddl v0.8h, v0.8b, v1.8b \n" - "uaddl v2.8h, v2.8b, v3.8b \n" - "uaddl v4.8h, v4.8b, v5.8b \n" - "uaddl v6.8h, v6.8b, v7.8b \n" - "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd - "mov v0.d[1], v2.d[0] \n" - "mov v2.d[0], v16.d[1] \n" - "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh - "mov v4.d[1], v6.d[0] \n" - "mov v6.d[0], v16.d[1] \n" - "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) - "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) - "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. - "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. - "subs %w3, %w3, #4 \n" // 4 pixels per loop. - MEMACCESS(2) - "st1 {v0.16b}, [%2], #16 \n" - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_stride), // %1 - "+r"(dst_argb), // %2 - "+r"(dst_width) // %3 - : "r"((int64)(src_stepx * 4)) // %4 - : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" - ); + uint8_t* dst_argb, + int dst_width) { + asm volatile( + "add %1, %1, %0 \n" + "1: \n" + "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 -> 2x1 + "ld1 {v1.8b}, [%1], %4 \n" + "ld1 {v2.8b}, [%0], %4 \n" + "ld1 {v3.8b}, [%1], %4 \n" + "ld1 {v4.8b}, [%0], %4 \n" + "ld1 {v5.8b}, [%1], %4 \n" + "ld1 {v6.8b}, [%0], %4 \n" + "ld1 {v7.8b}, [%1], %4 \n" + "uaddl v0.8h, v0.8b, v1.8b \n" + "uaddl v2.8h, v2.8b, v3.8b \n" + "uaddl v4.8h, v4.8b, v5.8b \n" + "uaddl v6.8h, v6.8b, v7.8b \n" + "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd + "mov v0.d[1], v2.d[0] \n" + "mov v2.d[0], v16.d[1] \n" + "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh + "mov v4.d[1], v6.d[0] \n" + "mov v6.d[0], v16.d[1] \n" + "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d) + "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h) + "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels. + "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels. + "subs %w3, %w3, #4 \n" // 4 pixels per loop. + "st1 {v0.16b}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(src_stride), // %1 + "+r"(dst_argb), // %2 + "+r"(dst_width) // %3 + : "r"((int64_t)(src_stepx * 4)) // %4 + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); } // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD1_DATA32_LANE(vn, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld1 {"#vn".s}["#n"], [%6] \n" - -void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. 
- int64 x64 = (int64) x; - int64 dx64 = (int64) dx; - int64 tmp64; - asm volatile ( - "1: \n" - LOAD1_DATA32_LANE(v0, 0) - LOAD1_DATA32_LANE(v0, 1) - LOAD1_DATA32_LANE(v0, 2) - LOAD1_DATA32_LANE(v0, 3) - LOAD1_DATA32_LANE(v1, 0) - LOAD1_DATA32_LANE(v1, 1) - LOAD1_DATA32_LANE(v1, 2) - LOAD1_DATA32_LANE(v1, 3) - - MEMACCESS(0) - "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop - "b.gt 1b \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 - "+r"(x64), // %3 - "+r"(dx64), // %4 - "=&r"(tmp64), // %5 - "+r"(src_tmp) // %6 - : - : "memory", "cc", "v0", "v1" - ); +#define LOAD1_DATA32_LANE(vn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld1 {" #vn ".s}[" #n "], [%6] \n" + +void ScaleARGBCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT + int64_t tmp64; + asm volatile( + "1: \n" + // clang-format off + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + // clang-format on + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "=&r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1"); } #undef LOAD1_DATA32_LANE // TODO(Yang Zhang): Investigate less load instructions for // the x/dx stepping -#define LOAD2_DATA32_LANE(vn1, vn2, n) \ - "lsr %5, %3, #16 \n" \ - "add %6, %1, %5, lsl #2 \n" \ - "add %3, %3, %4 \n" \ - MEMACCESS(6) \ - "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" - -void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + "ld2 {" #vn1 ".s, " #vn2 ".s}[" #n "], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { int dx_offset[4] = {0, 1, 2, 3}; int* tmp = dx_offset; - const uint8* src_tmp = src_argb; - int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. - int64 x64 = (int64) x; - int64 dx64 = (int64) dx; + const uint8_t* src_tmp = src_argb; + int64_t x64 = (int64_t)x; // NOLINT + int64_t dx64 = (int64_t)dx; // NOLINT asm volatile ( "dup v0.4s, %w3 \n" // x "dup v1.4s, %w4 \n" // dx @@ -1014,14 +958,13 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, "shrn v0.8b, v16.8h, #7 \n" "shrn2 v0.16b, v17.8h, #7 \n" - MEMACCESS(0) "st1 {v0.4s}, [%0], #16 \n" // store pixels "add v5.4s, v5.4s, v6.4s \n" "subs %w2, %w2, #4 \n" // 4 processed per loop "b.gt 1b \n" : "+r"(dst_argb), // %0 "+r"(src_argb), // %1 - "+r"(dst_width64), // %2 + "+r"(dst_width), // %2 "+r"(x64), // %3 "+r"(dx64), // %4 "+r"(tmp), // %5 @@ -1034,6 +977,85 @@ void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, #undef LOAD2_DATA32_LANE +// Read 16x2 average down and write 8x1. 
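// Scalar sketch of the new 16-bit 2x2 box filter added below: each output
// sample is the rounded mean of four neighbouring samples, accumulated in 32
// bits and narrowed back to 16, mirroring the uaddlp/uadalp/rshrn sequence.
// Illustrative only, types from <stdint.h> as above.
static void ScaleRowDown2Box_16_sketch(const uint16_t* row0, const uint16_t* row1,
                                       uint16_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint32_t sum = row0[2 * x] + row0[2 * x + 1] + row1[2 * x] + row1[2 * x + 1];
    dst[x] = (uint16_t)((sum + 2) >> 2);  // round to nearest
  }
}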
+void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + // change the stride to row 2 pointer + "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2 + "1: \n" + "ld1 {v0.8h, v1.8h}, [%0], #32 \n" // load row 1 and post inc + "ld1 {v2.8h, v3.8h}, [%1], #32 \n" // load row 2 and post inc + "subs %w3, %w3, #8 \n" // 8 processed per loop + "uaddlp v0.4s, v0.8h \n" // row 1 add adjacent + "uaddlp v1.4s, v1.8h \n" + "uadalp v0.4s, v2.8h \n" // +row 2 add adjacent + "uadalp v1.4s, v3.8h \n" + "rshrn v0.4h, v0.4s, #2 \n" // round and pack + "rshrn2 v0.8h, v1.4s, #2 \n" + "st1 {v0.8h}, [%2], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : + : "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// Read 8x2 upsample with filtering and write 16x1. +// Actually reads an extra pixel, so 9x2. +void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, + ptrdiff_t src_stride, + uint16_t* dst, + int dst_width) { + asm volatile( + "add %1, %0, %1, lsl #1 \n" // ptr + stride * 2 + "movi v0.8h, #9 \n" // constants + "movi v1.4s, #3 \n" + + "1: \n" + "ld1 {v3.8h}, [%0], %4 \n" // TL read first 8 + "ld1 {v4.8h}, [%0], %5 \n" // TR read 8 offset by 1 + "ld1 {v5.8h}, [%1], %4 \n" // BL read 8 from next row + "ld1 {v6.8h}, [%1], %5 \n" // BR offset by 1 + "subs %w3, %w3, #16 \n" // 16 dst pixels per loop + "umull v16.4s, v3.4h, v0.4h \n" + "umull2 v7.4s, v3.8h, v0.8h \n" + "umull v18.4s, v4.4h, v0.4h \n" + "umull2 v17.4s, v4.8h, v0.8h \n" + "uaddw v16.4s, v16.4s, v6.4h \n" + "uaddl2 v19.4s, v6.8h, v3.8h \n" + "uaddl v3.4s, v6.4h, v3.4h \n" + "uaddw2 v6.4s, v7.4s, v6.8h \n" + "uaddl2 v7.4s, v5.8h, v4.8h \n" + "uaddl v4.4s, v5.4h, v4.4h \n" + "uaddw v18.4s, v18.4s, v5.4h \n" + "mla v16.4s, v4.4s, v1.4s \n" + "mla v18.4s, v3.4s, v1.4s \n" + "mla v6.4s, v7.4s, v1.4s \n" + "uaddw2 v4.4s, v17.4s, v5.8h \n" + "uqrshrn v16.4h, v16.4s, #4 \n" + "mla v4.4s, v19.4s, v1.4s \n" + "uqrshrn2 v16.8h, v6.4s, #4 \n" + "uqrshrn v17.4h, v18.4s, #4 \n" + "uqrshrn2 v17.8h, v4.4s, #4 \n" + "st2 {v16.8h-v17.8h}, [%2], #32 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_stride), // %1 + "+r"(dst), // %2 + "+r"(dst_width) // %3 + : "r"(2LL), // %4 + "r"(14LL) // %5 + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", + "v19" // Clobber List + ); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/libs/libvpx/third_party/libyuv/source/scale_win.cc b/libs/libvpx/third_party/libyuv/source/scale_win.cc index f17097365cc..c5fc86f3e96 100644 --- a/libs/libvpx/third_party/libyuv/source/scale_win.cc +++ b/libs/libvpx/third_party/libyuv/source/scale_win.cc @@ -17,97 +17,93 @@ extern "C" { #endif // This module is for 32 bit Visual C x86 and clangcl -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Offsets for source bytes 0 to 9 -static uvec8 kShuf0 = - { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. -static uvec8 kShuf1 = - { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
-static uvec8 kShuf2 = - { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15, + 128, 128, 128, 128, 128, 128, 128, 128}; // Offsets for source bytes 0 to 10 -static uvec8 kShuf01 = - { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; +static const uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10}; // Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. -static uvec8 kShuf11 = - { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; +static const uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, + 8, 9, 9, 10, 10, 11, 12, 13}; // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. -static uvec8 kShuf21 = - { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; +static const uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10, + 10, 11, 12, 13, 13, 14, 14, 15}; // Coefficients for source bytes 0 to 10 -static uvec8 kMadd01 = - { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; +static const uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2}; // Coefficients for source bytes 10 to 21 -static uvec8 kMadd11 = - { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; +static const uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1}; // Coefficients for source bytes 21 to 31 -static uvec8 kMadd21 = - { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; +static const uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3}; // Coefficients for source bytes 21 to 31 -static vec16 kRound34 = - { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2}; -static uvec8 kShuf38a = - { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; -static uvec8 kShuf38b = - { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; +static const uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3, + 6, 8, 11, 14, 128, 128, 128, 128}; // Arrange words 0,3,6 into 0,1,2 -static uvec8 kShufAc = - { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Arrange words 0,3,6 into 3,4,5 -static uvec8 kShufAc3 = - { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; +static const uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1, + 6, 7, 12, 13, 128, 128, 128, 128}; // Scaling values for boxes of 3x3 and 2x3 -static uvec16 kScaleAc33 = - { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; +static const uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, + 65536 / 9, 65536 / 6, 0, 0}; // Arrange first value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb0 = - { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128, + 11, 128, 14, 128, 128, 128, 128, 128}; // Arrange second value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb1 = - { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128, + 12, 128, 15, 128, 128, 128, 128, 128}; // Arrange third value for pixels 0,1,2,3,4,5 -static uvec8 kShufAb2 = - { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; +static const uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128, + 13, 128, 128, 128, 128, 128, 128, 128}; // Scaling 
values for boxes of 3x2 and 2x2 -static uvec16 kScaleAb2 = - { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; +static const uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, + 65536 / 3, 65536 / 2, 0, 0}; // Reads 32 pixels, throws half away and writes 16 pixels. -__declspec(naked) -void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -120,27 +116,28 @@ void ScaleRowDown2_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -153,20 +150,21 @@ void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. 
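// The SSSE3/AVX2 kernels around here avoid overflow by widening with
// pmaddubsw against a 0x0101 constant (u8 + u8 -> u16 per pair) and rounding
// with pavgw against zero. A scalar sketch of the two roundings involved,
// illustrative only (types from <stdint.h> as in the sketches further up):
static inline uint8_t HalfAdd_sketch(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + b + 1) >> 1);  // pavgw of the pair sum: (x + 1) / 2
}
static inline uint8_t QuarterAdd_sketch(uint16_t sum4) {
  // (sum + 2) / 4 computed as pavgw(sum >> 1, 0), i.e. ((sum >> 1) + 1) >> 1,
  // which is the psrlw-then-pavgw sequence used by the Box kernel below.
  return (uint8_t)(((sum4 >> 1) + 1) >> 1);
}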
-__declspec(naked) -void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 packuswb xmm4, xmm4 - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 wloop: movdqu xmm0, [eax] @@ -174,15 +172,15 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add + paddw xmm0, xmm2 // vertical add paddw xmm1, xmm3 psrlw xmm0, 1 psrlw xmm1, 1 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -197,23 +195,24 @@ void ScaleRowDown2Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN2_AVX2 // Reads 64 pixels, throws half away and writes 32 pixels. -__declspec(naked) -void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -225,30 +224,31 @@ void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x1 rectangle to 32x1. 
-__declspec(naked) -void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] vmovdqu ymm1, [eax + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -262,20 +262,21 @@ void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // For rounding, average = (sum + 2) / 4 // becomes average((sum >> 1), 0) // Blends 64x2 rectangle to 32x1. -__declspec(naked) -void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width - vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b vpsrlw ymm4, ymm4, 15 vpackuswb ymm4, ymm4, ymm4 - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 wloop: vmovdqu ymm0, [eax] @@ -283,18 +284,18 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] lea eax, [eax + 64] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add + vpaddw ymm0, ymm0, ymm2 // vertical add vpaddw ymm1, ymm1, ymm3 - vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 + vpsrlw ymm0, ymm0, 1 // (x + 2) / 4 = (x / 2 + 1) / 2 vpsrlw ymm1, ymm1, 1 - vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -308,15 +309,16 @@ void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, #endif // HAS_SCALEROWDOWN2_AVX2 // Point samples 32 pixels to 8 pixels. 
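// ScaleRowDown4 below keeps one source pixel out of every four. The mask
// built with pcmpeqb/psrld/pslld is 0x00ff0000, so after the pack/shift
// sequence only byte 2 of each 4-byte group survives. Scalar sketch,
// illustrative only (uint8_t from <stdint.h> as above):
static void ScaleRowDown4_sketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[4 * x + 2];  // pixel 2 of each group of four
  }
}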
-__declspec(naked) -void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 pslld xmm5, 16 @@ -339,50 +341,51 @@ void ScaleRowDown4_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -__declspec(naked) -void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - pcmpeqb xmm4, xmm4 // constant 0x0101 + pcmpeqb xmm4, xmm4 // constant 0x0101 psrlw xmm4, 15 movdqa xmm5, xmm4 packuswb xmm4, xmm4 - psllw xmm5, 3 // constant 0x0008 + psllw xmm5, 3 // constant 0x0008 wloop: - movdqu xmm0, [eax] // average rows + movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pmaddubsw xmm0, xmm4 // horizontal add + pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // vertical add rows 0, 1 + paddw xmm0, xmm2 // vertical add rows 0, 1 paddw xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 2 + paddw xmm0, xmm2 // add row 2 paddw xmm1, xmm3 movdqu xmm2, [eax + edi] movdqu xmm3, [eax + edi + 16] lea eax, [eax + 32] pmaddubsw xmm2, xmm4 pmaddubsw xmm3, xmm4 - paddw xmm0, xmm2 // add row 3 + paddw xmm0, xmm2 // add row 3 paddw xmm1, xmm3 phaddw xmm0, xmm1 - paddw xmm0, xmm5 // + 8 for round - psrlw xmm0, 4 // /16 for average of 4 * 4 + paddw xmm0, xmm5 // + 8 for round + psrlw xmm0, 4 // /16 for average of 4 * 4 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 lea edx, [edx + 8] @@ -397,15 +400,16 @@ void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, #ifdef HAS_SCALEROWDOWN4_AVX2 // Point samples 64 pixels to 16 pixels. 
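Before the AVX2 versions, a scalar sketch of the 4x4 box filter above: each output pixel is the rounded mean of a 4x4 block, (sum + 8) >> 4, where the psllw-built 0x0008 constant is the rounding term. (A right shift by 4 divides by 16, so the "/32" comment in the AVX2 variant below should read "/16", as the SSSE3 version above says.) The function name and plain-C layout are illustrative, not libyuv's reference implementation:

```cpp
#include <cstddef>
#include <cstdint>

// Rounded mean of each 4x4 block: (sum + 8) >> 4. src_stride is in
// bytes; dst_width is output pixels, each consuming 4 source columns.
void ScaleRowDown4Box(const uint8_t* src, ptrdiff_t src_stride,
                      uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    uint32_t sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src[r * src_stride + 4 * x + c];
      }
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // + 8 for round, /16 for 4*4 mean
  }
}
```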
-__declspec(naked) -void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 vpsrld ymm5, ymm5, 24 vpslld ymm5, ymm5, 16 @@ -416,10 +420,10 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, vpand ymm0, ymm0, ymm5 vpand ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -431,52 +435,53 @@ void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 64x4 rectangle to 16x1. -__declspec(naked) -void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_ptr - mov esi, [esp + 8 + 8] // src_stride - mov edx, [esp + 8 + 12] // dst_ptr - mov ecx, [esp + 8 + 16] // dst_width + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width lea edi, [esi + esi * 2] // src_stride * 3 - vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 + vpcmpeqb ymm4, ymm4, ymm4 // constant 0x0101 vpsrlw ymm4, ymm4, 15 - vpsllw ymm5, ymm4, 3 // constant 0x0008 + vpsllw ymm5, ymm4, 3 // constant 0x0008 vpackuswb ymm4, ymm4, ymm4 wloop: - vmovdqu ymm0, [eax] // average rows + vmovdqu ymm0, [eax] // average rows vmovdqu ymm1, [eax + 32] vmovdqu ymm2, [eax + esi] vmovdqu ymm3, [eax + esi + 32] - vpmaddubsw ymm0, ymm0, ymm4 // horizontal add + vpmaddubsw ymm0, ymm0, ymm4 // horizontal add vpmaddubsw ymm1, ymm1, ymm4 vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 + vpaddw ymm0, ymm0, ymm2 // vertical add rows 0, 1 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + esi * 2] vmovdqu ymm3, [eax + esi * 2 + 32] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 2 + vpaddw ymm0, ymm0, ymm2 // add row 2 vpaddw ymm1, ymm1, ymm3 vmovdqu ymm2, [eax + edi] vmovdqu ymm3, [eax + edi + 32] lea eax, [eax + 64] vpmaddubsw ymm2, ymm2, ymm4 vpmaddubsw ymm3, ymm3, ymm4 - vpaddw ymm0, ymm0, ymm2 // add row 3 + vpaddw ymm0, ymm0, ymm2 // add row 3 vpaddw ymm1, ymm1, ymm3 - vphaddw ymm0, ymm0, ymm1 // mutates - vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw - vpaddw ymm0, ymm0, ymm5 // + 8 for round - vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 + vphaddw ymm0, ymm0, ymm1 // mutates + vpermq ymm0, ymm0, 0xd8 // unmutate vphaddw + vpaddw ymm0, ymm0, ymm5 // + 8 for round + vpsrlw ymm0, ymm0, 4 // /32 for average of 4 * 4 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] 
sub ecx, 16 @@ -494,14 +499,15 @@ void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -__declspec(naked) -void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm3, xmmword ptr kShuf0 movdqa xmm4, xmmword ptr kShuf1 movdqa xmm5, xmmword ptr kShuf2 @@ -541,16 +547,16 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -__declspec(naked) -void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -559,7 +565,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm0, xmm1 pshufb xmm0, xmm2 @@ -568,7 +574,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm0, xmm1 pshufb xmm0, xmm3 @@ -577,7 +583,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm0, xmm1 @@ -598,16 +604,16 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. 
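One trick worth spelling out before the _0_Box variant below: it needs its two rows blended 3:1, but SSE offers only pavgb, a round-half-up average, so the asm nests two averages, pavgb(row0, pavgb(row1, row0)), which tracks (3*row0 + row1 + 2) / 4 to within one. A scalar check of the construction (helper name is illustrative):

```cpp
#include <cassert>
#include <cstdint>

static uint8_t Pavgb(uint8_t a, uint8_t b) {  // pavgb: (a + b + 1) >> 1
  return (uint8_t)((a + b + 1) >> 1);
}

int main() {
  // avg(r0, avg(r1, r0)) approximates the 3:1 blend (3*r0 + r1 + 2) >> 2,
  // never off by more than 1 anywhere in the 8-bit range.
  for (int r0 = 0; r0 < 256; ++r0) {
    for (int r1 = 0; r1 < 256; ++r1) {
      int nested = Pavgb((uint8_t)r0, Pavgb((uint8_t)r1, (uint8_t)r0));
      int direct = (3 * r0 + r1 + 2) >> 2;
      assert(nested >= direct - 1 && nested <= direct + 1);
    }
  }
  return 0;
}
```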
-__declspec(naked) -void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShuf01 movdqa xmm3, xmmword ptr kShuf11 movdqa xmm4, xmmword ptr kShuf21 @@ -616,7 +622,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, movdqa xmm7, xmmword ptr kRound34 wloop: - movdqu xmm0, [eax] // pixels 0..7 + movdqu xmm0, [eax] // pixels 0..7 movdqu xmm1, [eax + esi] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -626,7 +632,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx], xmm0 - movdqu xmm0, [eax + 8] // pixels 8..15 + movdqu xmm0, [eax + 8] // pixels 8..15 movdqu xmm1, [eax + esi + 8] pavgb xmm1, xmm0 pavgb xmm0, xmm1 @@ -636,7 +642,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, psrlw xmm0, 2 packuswb xmm0, xmm0 movq qword ptr [edx + 8], xmm0 - movdqu xmm0, [eax + 16] // pixels 16..23 + movdqu xmm0, [eax + 16] // pixels 16..23 movdqu xmm1, [eax + esi + 16] lea eax, [eax + 32] pavgb xmm1, xmm0 @@ -660,26 +666,27 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) -void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width movdqa xmm4, xmmword ptr kShuf38a movdqa xmm5, xmmword ptr kShuf38b xloop: - movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 - movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 + movdqu xmm0, [eax] // 16 pixels -> 0,1,2,3,4,5 + movdqu xmm1, [eax + 16] // 16 pixels -> 6,7,8,9,10,11 lea eax, [eax + 32] pshufb xmm0, xmm4 pshufb xmm1, xmm5 paddusb xmm0, xmm1 - movq qword ptr [edx], xmm0 // write 12 pixels + movq qword ptr [edx], xmm0 // write 12 pixels movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] @@ -691,23 +698,23 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAc movdqa xmm3, xmmword ptr kShufAc3 movdqa xmm4, xmmword ptr kScaleAc33 pxor xmm5, xmm5 xloop: - movdqu xmm0, [eax] // sum up 3 rows 
into xmm0/1 + movdqu xmm0, [eax] // sum up 3 rows into xmm0/1 movdqu xmm6, [eax + esi] movhlps xmm1, xmm0 movhlps xmm7, xmm6 @@ -725,14 +732,14 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, paddusw xmm0, xmm6 paddusw xmm1, xmm7 - movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 + movdqa xmm6, xmm0 // 8 pixels -> 0,1,2 of xmm6 psrldq xmm0, 2 paddusw xmm6, xmm0 psrldq xmm0, 2 paddusw xmm6, xmm0 pshufb xmm6, xmm2 - movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 + movdqa xmm7, xmm1 // 8 pixels -> 3,4,5 of xmm6 psrldq xmm1, 2 paddusw xmm7, xmm1 psrldq xmm1, 2 @@ -740,10 +747,10 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, pshufb xmm7, xmm3 paddusw xmm6, xmm7 - pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 + pmulhuw xmm6, xmm4 // divide by 9,9,6, 9,9,6 packuswb xmm6, xmm6 - movd [edx], xmm6 // write 6 pixels + movd [edx], xmm6 // write 6 pixels psrlq xmm6, 16 movd [edx + 2], xmm6 lea edx, [edx + 6] @@ -756,28 +763,28 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) -void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, - ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_ptr - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_ptr - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width movdqa xmm2, xmmword ptr kShufAb0 movdqa xmm3, xmmword ptr kShufAb1 movdqa xmm4, xmmword ptr kShufAb2 movdqa xmm5, xmmword ptr kScaleAb2 xloop: - movdqu xmm0, [eax] // average 2 rows into xmm0 + movdqu xmm0, [eax] // average 2 rows into xmm0 movdqu xmm1, [eax + esi] lea eax, [eax + 16] pavgb xmm0, xmm1 - movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 + movdqa xmm1, xmm0 // 16 pixels -> 0,1,2,3,4,5 of xmm1 pshufb xmm1, xmm2 movdqa xmm6, xmm0 pshufb xmm6, xmm3 @@ -785,10 +792,10 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, pshufb xmm0, xmm4 paddusw xmm1, xmm0 - pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 + pmulhuw xmm1, xmm5 // divide by 3,3,2, 3,3,2 packuswb xmm1, xmm1 - movd [edx], xmm1 // write 6 pixels + movd [edx], xmm1 // write 6 pixels psrlq xmm1, 16 movd [edx + 2], xmm1 lea edx, [edx + 6] @@ -801,26 +808,27 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } // Reads 16 bytes and accumulates to 16 shorts at a time. 
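The 3/8 box filters above avoid integer division entirely: pmulhuw returns (a * b) >> 16, so multiplying by a 16-bit fixed-point reciprocal yields the quotient. Assuming the kScaleAc33/kScaleAb2 tables hold the rounded-up reciprocals ceil(65536 / 9) and ceil(65536 / 6) (an assumption; the tables are defined elsewhere in this file), the trick is exact for every reachable sum, which a scalar loop can verify:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t kRecip9 = 7282;   // assumed: ceil(65536 / 9)
  const uint32_t kRecip6 = 10923;  // assumed: ceil(65536 / 6)
  // Reachable sums: at most 9 (or 6) pixels of 255 each.
  for (uint32_t s = 0; s <= 9 * 255; ++s) {
    assert(((s * kRecip9) >> 16) == s / 9);
    if (s <= 6 * 255) {
      assert(((s * kRecip6) >> 16) == s / 6);
    }
  }
  return 0;
}
```

The divide-by-3 entries in the two-row variant work the same way with ceil(65536 / 3) = 21846.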
-__declspec(naked) -void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_SSE2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width pxor xmm5, xmm5 - // sum rows + // sum rows xloop: - movdqu xmm3, [eax] // read 16 bytes + movdqu xmm3, [eax] // read 16 bytes lea eax, [eax + 16] - movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm0, [edx] // read 16 words from destination movdqu xmm1, [edx + 16] movdqa xmm2, xmm3 punpcklbw xmm2, xmm5 punpckhbw xmm3, xmm5 - paddusw xmm0, xmm2 // sum 16 words + paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 - movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx], xmm0 // write 16 words to destination movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 16 @@ -831,24 +839,25 @@ void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { #ifdef HAS_SCALEADDROW_AVX2 // Reads 32 bytes and accumulates to 32 shorts at a time. -__declspec(naked) -void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { +__declspec(naked) void ScaleAddRow_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int src_width) { __asm { - mov eax, [esp + 4] // src_ptr - mov edx, [esp + 8] // dst_ptr + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr mov ecx, [esp + 12] // src_width vpxor ymm5, ymm5, ymm5 - // sum rows + // sum rows xloop: - vmovdqu ymm3, [eax] // read 32 bytes + vmovdqu ymm3, [eax] // read 32 bytes lea eax, [eax + 32] vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck vpunpcklbw ymm2, ymm3, ymm5 vpunpckhbw ymm3, ymm3, ymm5 - vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm0, ymm2, [edx] // sum 16 words vpaddusw ymm1, ymm3, [edx + 32] - vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx], ymm0 // write 32 words to destination vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] sub ecx, 32 @@ -862,86 +871,87 @@ void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { // Constant for making pixels signed to avoid pmaddubsw // saturation. -static uvec8 kFsub80 = - { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; +static const uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; // Constant for making pixels unsigned and adding .5 for rounding. -static uvec16 kFadd40 = - { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; +static const uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040, + 0x4040, 0x4040, 0x4040, 0x4040}; // Bilinear column filtering. SSSE3 version. -__declspec(naked) -void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleFilterCols_SSSE3(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { __asm { push ebx push esi push edi - mov edi, [esp + 12 + 4] // dst_ptr - mov esi, [esp + 12 + 8] // src_ptr - mov ecx, [esp + 12 + 12] // dst_width + mov edi, [esp + 12 + 4] // dst_ptr + mov esi, [esp + 12 + 8] // src_ptr + mov ecx, [esp + 12 + 12] // dst_width movd xmm2, [esp + 12 + 16] // x movd xmm3, [esp + 12 + 20] // dx - mov eax, 0x04040000 // shuffle to line up fractions with pixel. + mov eax, 0x04040000 // shuffle to line up fractions with pixel. 
movd xmm5, eax - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pcmpeqb xmm7, xmm7 // generate 0x0001 + pcmpeqb xmm7, xmm7 // generate 0x0001 psrlw xmm7, 15 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movzx ebx, word ptr [esi + edx] // 2 source x1 pixels movd xmm4, ebx - pshufb xmm1, xmm5 // 0011 + pshufb xmm1, xmm5 // 0011 punpcklwd xmm0, xmm4 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm1, xmm6 // 0..7f and 7f..0 - paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. + pxor xmm1, xmm6 // 0..7f and 7f..0 + paddusb xmm1, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm1, xmm0 // 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. paddw xmm1, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm1, xmm1 // 8 bits, 2 pixels. + psrlw xmm1, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm1, xmm1 // 8 bits, 2 pixels. movd ebx, xmm1 mov [edi], bx lea edi, [edi + 2] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder + // 1 pixel remainder movzx ebx, word ptr [esi + eax] // 2 source x0 pixels movd xmm0, ebx - psrlw xmm2, 9 // 7 bit fractions. - pshufb xmm2, xmm5 // 0011 + psrlw xmm2, 9 // 7 bit fractions. + pshufb xmm2, xmm5 // 0011 psubb xmm0, xmmword ptr kFsub80 // make pixels signed. - pxor xmm2, xmm6 // 0..7f and 7f..0 - paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 - pmaddubsw xmm2, xmm0 // 16 bit + pxor xmm2, xmm6 // 0..7f and 7f..0 + paddusb xmm2, xmm7 // +1 so 0..7f and 80..1 + pmaddubsw xmm2, xmm0 // 16 bit paddw xmm2, xmmword ptr kFadd40 // make pixels unsigned and round. - psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. - packuswb xmm2, xmm2 // 8 bits + psrlw xmm2, 7 // 8.7 fixed point to low 8 bits. + packuswb xmm2, xmm2 // 8 bits movd ebx, xmm2 mov [edi], bl @@ -955,13 +965,15 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. 
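For reference, a scalar sketch of what the bilinear column filter above computes, on the usual reading of its fixed-point stream: x and dx are 16.16, psrlw by 9 keeps the top 7 fraction bits, the pxor/paddusb pair turns the fraction f into the complementary weight 128 - f, and kFadd40 (0x4040 = 128*128 + 64) both undoes the kFsub80 signed-pixel bias and supplies the +64 round before the final >> 7. Names are illustrative:

```cpp
#include <cstdint>

// Bilinear blend of each source pixel with its right neighbor, weighted
// by the top 7 fraction bits of a 16.16 fixed-point position.
void ScaleFilterColsBilinear(uint8_t* dst, const uint8_t* src,
                             int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;         // integer part (pextrw of the high word)
    int f = (x >> 9) & 0x7f;  // psrlw xmm1, 9: 7-bit fraction
    dst[j] = (uint8_t)((src[xi] * (128 - f) + src[xi + 1] * f + 64) >> 7);
    x += dx;
  }
}
```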
-__declspec(naked) -void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleColsUp2_SSE2(uint8_t* dst_ptr, + const uint8_t* src_ptr, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_ptr - mov eax, [esp + 8] // src_ptr - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_ptr + mov eax, [esp + 8] // src_ptr + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -980,15 +992,15 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -__declspec(naked) -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1005,23 +1017,23 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { - mov eax, [esp + 4] // src_argb - // src_stride ignored - mov edx, [esp + 12] // dst_argb - mov ecx, [esp + 16] // dst_width + mov eax, [esp + 4] // src_argb + // src_stride ignored + mov edx, [esp + 12] // dst_argb + mov ecx, [esp + 16] // dst_width wloop: movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] movdqa xmm2, xmm0 - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1033,16 +1045,16 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -__declspec(naked) -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + uint8_t* dst_argb, + int dst_width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_stride - mov edx, [esp + 4 + 12] // dst_argb - mov ecx, [esp + 4 + 16] // dst_width + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_argb + mov ecx, [esp + 4 + 16] // dst_width wloop: movdqu xmm0, [eax] @@ -1050,11 +1062,11 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] lea eax, [eax + 32] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1067,18 +1079,19 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. 
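A scalar sketch of the ARGB 8x2 to 4x1 box filter above: rows are averaged first, then shufps 0x88/0xdd splits even and odd 32-bit pixels so a final pavgb averages the columns; each pavgb rounds up, (a + b + 1) >> 1. Helper and function names are illustrative:

```cpp
#include <cstddef>
#include <cstdint>

static uint8_t Pavgb(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

// Per channel (B, G, R, A): average top/bottom, then even/odd pixels,
// matching the pavgb / shufps 0x88 / shufps 0xdd / pavgb sequence above.
void ScaleARGBRowDown2BoxScalar(const uint8_t* src, ptrdiff_t src_stride,
                                uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    for (int ch = 0; ch < 4; ++ch) {
      uint8_t even = Pavgb(src[8 * x + ch], src[src_stride + 8 * x + ch]);
      uint8_t odd =
          Pavgb(src[8 * x + 4 + ch], src[src_stride + 8 * x + 4 + ch]);
      dst[4 * x + ch] = Pavgb(even, odd);  // average columns
    }
  }
}
```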
-__declspec(naked) -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { __asm { push ebx push edi - mov eax, [esp + 8 + 4] // src_argb - // src_stride ignored - mov ebx, [esp + 8 + 12] // src_stepx - mov edx, [esp + 8 + 16] // dst_argb - mov ecx, [esp + 8 + 20] // dst_width + mov eax, [esp + 8 + 4] // src_argb + // src_stride ignored + mov ebx, [esp + 8 + 12] // src_stepx + mov edx, [esp + 8 + 16] // dst_argb + mov ecx, [esp + 8 + 20] // dst_width lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1103,21 +1116,21 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. -__declspec(naked) -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8_t* dst_argb, + int dst_width) { __asm { push ebx push esi push edi - mov eax, [esp + 12 + 4] // src_argb - mov esi, [esp + 12 + 8] // src_stride - mov ebx, [esp + 12 + 12] // src_stepx - mov edx, [esp + 12 + 16] // dst_argb - mov ecx, [esp + 12 + 20] // dst_width - lea esi, [eax + esi] // row1 pointer + mov eax, [esp + 12 + 4] // src_argb + mov esi, [esp + 12 + 8] // src_stride + mov ebx, [esp + 12 + 12] // src_stepx + mov edx, [esp + 12 + 16] // dst_argb + mov ecx, [esp + 12 + 20] // dst_width + lea esi, [eax + esi] // row1 pointer lea ebx, [ebx * 4] lea edi, [ebx + ebx * 2] @@ -1132,11 +1145,11 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, movq xmm3, qword ptr [esi + ebx * 2] movhps xmm3, qword ptr [esi + edi] lea esi, [esi + ebx * 4] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 // average rows pavgb xmm1, xmm3 - movdqa xmm2, xmm0 // average columns (8 to 4 pixels) - shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + movdqa xmm2, xmm0 // average columns (8 to 4 pixels) + shufps xmm0, xmm1, 0x88 // even pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1151,64 +1164,66 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, } // Column scaling unfiltered. SSE2 version. 
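Before the asm, a scalar sketch of what the unfiltered column scaler below computes: x and dx are 16.16 fixed point, and each output pixel is a whole 32-bit ARGB copy of the source pixel at x >> 16; the asm merely gathers four such pixels per loop iteration. The name is illustrative:

```cpp
#include <cstdint>
#include <cstring>

// Nearest-neighbor ARGB column scale: copy one 4-byte pixel per output,
// stepping the 16.16 fixed-point position by dx each time.
void ScaleARGBColsNearest(uint8_t* dst_argb, const uint8_t* src_argb,
                          int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    memcpy(dst_argb + 4 * j, src_argb + 4 * (x >> 16), 4);
    x += dx;
  }
}
```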
-__declspec(naked) -void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { push edi push esi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx - pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 - pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 + pshufd xmm2, xmm2, 0 // x0 x0 x0 x0 + pshufd xmm0, xmm3, 0x11 // dx 0 dx 0 paddd xmm2, xmm0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 2 - pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 - paddd xmm2, xmm0 // x3 x2 x1 x0 - paddd xmm3, xmm3 // 0, 0, 0, dx * 4 - pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 + paddd xmm3, xmm3 // 0, 0, 0, dx * 2 + pshufd xmm0, xmm3, 0x05 // dx * 2, dx * 2, 0, 0 + paddd xmm2, xmm0 // x3 x2 x1 x0 + paddd xmm3, xmm3 // 0, 0, 0, dx * 4 + pshufd xmm3, xmm3, 0 // dx * 4, dx * 4, dx * 4, dx * 4 - pextrw eax, xmm2, 1 // get x0 integer. - pextrw edx, xmm2, 3 // get x1 integer. + pextrw eax, xmm2, 1 // get x0 integer. + pextrw edx, xmm2, 3 // get x1 integer. cmp ecx, 0 jle xloop99 sub ecx, 4 jl xloop49 - // 4 Pixel loop. + // 4 Pixel loop. xloop4: movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - pextrw edx, xmm2, 7 // get x3 integer. - paddd xmm2, xmm3 // x += dx - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + pextrw edx, xmm2, 7 // get x3 integer. + paddd xmm2, xmm3 // x += dx + punpckldq xmm0, xmm1 // x0 x1 movd xmm1, [esi + eax * 4] // 1 source x2 pixels movd xmm4, [esi + edx * 4] // 1 source x3 pixels - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - punpckldq xmm1, xmm4 // x2 x3 - punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + punpckldq xmm1, xmm4 // x2 x3 + punpcklqdq xmm0, xmm1 // x0 x1 x2 x3 movdqu [edi], xmm0 lea edi, [edi + 16] - sub ecx, 4 // 4 pixels + sub ecx, 4 // 4 pixels jge xloop4 xloop49: test ecx, 2 je xloop29 - // 2 Pixels. + // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels - pextrw eax, xmm2, 5 // get x2 integer. - punpckldq xmm0, xmm1 // x0 x1 + pextrw eax, xmm2, 5 // get x2 integer. + punpckldq xmm0, xmm1 // x0 x1 movq qword ptr [edi], xmm0 lea edi, [edi + 8] @@ -1217,7 +1232,7 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, test ecx, 1 je xloop99 - // 1 Pixels. + // 1 Pixels. 
movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd dword ptr [edi], xmm0 xloop99: @@ -1232,60 +1247,62 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, // TODO(fbarchard): Port to Neon // Shuffle table for arranging 2 pixels into pairs for pmaddubsw -static uvec8 kShuffleColARGB = { - 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel - 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel +static const uvec8 kShuffleColARGB = { + 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel + 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel }; // Shuffle table for duplicating 2 fractions into 8 bytes each -static uvec8 kShuffleFractions = { - 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, +static const uvec8 kShuffleFractions = { + 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) -void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_argb - mov esi, [esp + 8 + 8] // src_argb - mov ecx, [esp + 8 + 12] // dst_width + mov edi, [esp + 8 + 4] // dst_argb + mov esi, [esp + 8 + 8] // src_argb + mov ecx, [esp + 8 + 12] // dst_width movd xmm2, [esp + 8 + 16] // x movd xmm3, [esp + 8 + 20] // dx movdqa xmm4, xmmword ptr kShuffleColARGB movdqa xmm5, xmmword ptr kShuffleFractions - pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. + pcmpeqb xmm6, xmm6 // generate 0x007f for inverting fraction. psrlw xmm6, 9 - pextrw eax, xmm2, 1 // get x0 integer. preroll + pextrw eax, xmm2, 1 // get x0 integer. preroll sub ecx, 2 jl xloop29 - movdqa xmm0, xmm2 // x1 = x0 + dx + movdqa xmm0, xmm2 // x1 = x0 + dx paddd xmm0, xmm3 - punpckldq xmm2, xmm0 // x0 x1 - punpckldq xmm3, xmm3 // dx dx - paddd xmm3, xmm3 // dx * 2, dx * 2 - pextrw edx, xmm2, 3 // get x1 integer. preroll + punpckldq xmm2, xmm0 // x0 x1 + punpckldq xmm3, xmm3 // dx dx + paddd xmm3, xmm3 // dx * 2, dx * 2 + pextrw edx, xmm2, 3 // get x1 integer. preroll // 2 Pixel loop. xloop2: - movdqa xmm1, xmm2 // x0, x1 fractions. - paddd xmm2, xmm3 // x += dx + movdqa xmm1, xmm2 // x0, x1 fractions. + paddd xmm2, xmm3 // x += dx movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - psrlw xmm1, 9 // 7 bit fractions. + psrlw xmm1, 9 // 7 bit fractions. movhps xmm0, qword ptr [esi + edx * 4] // 2 source x1 pixels - pshufb xmm1, xmm5 // 0000000011111111 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm1, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. - pextrw eax, xmm2, 1 // get x0 integer. next iteration. - pextrw edx, xmm2, 3 // get x1 integer. next iteration. - psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. - packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. + pshufb xmm1, xmm5 // 0000000011111111 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm1, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm1 // argb_argb 16 bit, 2 pixels. + pextrw eax, xmm2, 1 // get x0 integer. next iteration. + pextrw edx, xmm2, 3 // get x1 integer. next iteration. + psrlw xmm0, 7 // argb 8.7 fixed point to low 8 bits. + packuswb xmm0, xmm0 // argb_argb 8 bits, 2 pixels. 
movq qword ptr [edi], xmm0 lea edi, [edi + 8] - sub ecx, 2 // 2 pixels + sub ecx, 2 // 2 pixels jge xloop2 xloop29: @@ -1293,15 +1310,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, add ecx, 2 - 1 jl xloop99 - // 1 pixel remainder - psrlw xmm2, 9 // 7 bit fractions. + // 1 pixel remainder + psrlw xmm2, 9 // 7 bit fractions. movq xmm0, qword ptr [esi + eax * 4] // 2 source x0 pixels - pshufb xmm2, xmm5 // 00000000 - pshufb xmm0, xmm4 // arrange pixels into pairs - pxor xmm2, xmm6 // 0..7f and 7f..0 - pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. + pshufb xmm2, xmm5 // 00000000 + pshufb xmm0, xmm4 // arrange pixels into pairs + pxor xmm2, xmm6 // 0..7f and 7f..0 + pmaddubsw xmm0, xmm2 // argb 16 bit, 1 pixel. psrlw xmm0, 7 - packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. + packuswb xmm0, xmm0 // argb 8 bits, 1 pixel. movd [edi], xmm0 xloop99: @@ -1313,13 +1330,15 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -__declspec(naked) -void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, - int dst_width, int x, int dx) { +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb, + const uint8_t* src_argb, + int dst_width, + int x, + int dx) { __asm { - mov edx, [esp + 4] // dst_argb - mov eax, [esp + 8] // src_argb - mov ecx, [esp + 12] // dst_width + mov edx, [esp + 4] // dst_argb + mov eax, [esp + 8] // src_argb + mov ecx, [esp + 12] // dst_width wloop: movdqu xmm0, [eax] @@ -1338,12 +1357,11 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv_X86(int num, int div) { +__declspec(naked) int FixedDiv_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 idiv dword ptr [esp + 8] ret @@ -1351,13 +1369,12 @@ int FixedDiv_X86(int num, int div) { } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) -int FixedDiv1_X86(int num, int div) { +__declspec(naked) int FixedDiv1_X86(int num, int div) { __asm { - mov eax, [esp + 4] // num - mov ecx, [esp + 8] // denom - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 shl eax, 16 sub eax, 0x00010001 sbb edx, 0 diff --git a/libs/libvpx/third_party/libyuv/source/video_common.cc b/libs/libvpx/third_party/libyuv/source/video_common.cc index 00fb71e18bf..92384c050cd 100644 --- a/libs/libvpx/third_party/libyuv/source/video_common.cc +++ b/libs/libvpx/third_party/libyuv/source/video_common.cc @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ - #include "libyuv/video_common.h" #ifdef __cplusplus @@ -16,40 +15,39 @@ namespace libyuv { extern "C" { #endif -#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0])) - struct FourCCAliasEntry { - uint32 alias; - uint32 canonical; + uint32_t alias; + uint32_t canonical; }; -static const struct FourCCAliasEntry kFourCCAliases[] = { - {FOURCC_IYUV, FOURCC_I420}, - {FOURCC_YU12, FOURCC_I420}, - {FOURCC_YU16, FOURCC_I422}, - {FOURCC_YU24, FOURCC_I444}, - {FOURCC_YUYV, FOURCC_YUY2}, - {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs - {FOURCC_HDYC, FOURCC_UYVY}, - {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 - {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. - {FOURCC_DMB1, FOURCC_MJPG}, - {FOURCC_BA81, FOURCC_BGGR}, // deprecated. - {FOURCC_RGB3, FOURCC_RAW }, - {FOURCC_BGR3, FOURCC_24BG}, - {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB - {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB - {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 - {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 - {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 +#define NUM_ALIASES 18 +static const struct FourCCAliasEntry kFourCCAliases[NUM_ALIASES] = { + {FOURCC_IYUV, FOURCC_I420}, + {FOURCC_YU12, FOURCC_I420}, + {FOURCC_YU16, FOURCC_I422}, + {FOURCC_YU24, FOURCC_I444}, + {FOURCC_YUYV, FOURCC_YUY2}, + {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs + {FOURCC_HDYC, FOURCC_UYVY}, + {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8 + {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not. + {FOURCC_DMB1, FOURCC_MJPG}, + {FOURCC_BA81, FOURCC_BGGR}, // deprecated. + {FOURCC_RGB3, FOURCC_RAW}, + {FOURCC_BGR3, FOURCC_24BG}, + {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB + {FOURCC_CM24, FOURCC_RAW}, // kCMPixelFormat_24RGB + {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555 + {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565 + {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551 }; // TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB. 
// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA LIBYUV_API -uint32 CanonicalFourCC(uint32 fourcc) { +uint32_t CanonicalFourCC(uint32_t fourcc) { int i; - for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) { + for (i = 0; i < NUM_ALIASES; ++i) { if (kFourCCAliases[i].alias == fourcc) { return kFourCCAliases[i].canonical; } @@ -62,4 +60,3 @@ uint32 CanonicalFourCC(uint32 fourcc) { } // extern "C" } // namespace libyuv #endif - diff --git a/libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py b/libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py new file mode 100644 index 00000000000..4b640e3e48a --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/genY4M/genY4M.py @@ -0,0 +1,76 @@ +import argparse +from os import listdir, path +from PIL import Image +import sys + +parser = argparse.ArgumentParser() +parser.add_argument("--frame_path", default="../data/frame/", type=str) +parser.add_argument("--frame_rate", default="25:1", type=str) +parser.add_argument("--interlacing", default="Ip", type=str) +parser.add_argument("--pix_ratio", default="0:0", type=str) +parser.add_argument("--color_space", default="4:2:0", type=str) +parser.add_argument("--output", default="output.y4m", type=str) + + +def generate(args, frames): + if len(frames) == 0: + return + #sort the frames based on the frame index + frames = sorted(frames, key=lambda x: x[0]) + #convert the frames to YUV form + frames = [f.convert("YCbCr") for _, f in frames] + #write the header + header = "YUV4MPEG2 W%d H%d F%s %s A%s" % (frames[0].width, frames[0].height, + args.frame_rate, args.interlacing, + args.pix_ratio) + cs = args.color_space.split(":") + header += " C%s%s%s\n" % (cs[0], cs[1], cs[2]) + #estimate the sample step based on subsample value + subsamples = [int(c) for c in cs] + r_step = [1, int(subsamples[2] == 0) + 1, int(subsamples[2] == 0) + 1] + c_step = [1, 4 // subsamples[1], 4 // subsamples[1]] + #write in frames + with open(args.output, "wb") as y4m: + y4m.write(header) + for f in frames: + y4m.write("FRAME\n") + px = f.load() + for k in xrange(3): + for i in xrange(0, f.height, r_step[k]): + for j in xrange(0, f.width, c_step[k]): + yuv = px[j, i] + y4m.write(chr(yuv[k])) + + +if __name__ == "__main__": + args = parser.parse_args() + frames = [] + frames_mv = [] + for filename in listdir(args.frame_path): + name, ext = filename.split(".") + if ext == "png": + name_parse = name.split("_") + idx = int(name_parse[-1]) + img = Image.open(path.join(args.frame_path, filename)) + if name_parse[-2] == "mv": + frames_mv.append((idx, img)) + else: + frames.append((idx, img)) + if len(frames) == 0: + print "No frames in directory: " + args.frame_path + sys.exit() + print("----------------------Y4M Info----------------------") + print("width: %d" % frames[0][1].width) + print("height: %d" % frames[0][1].height) + print("#frame: %d" % len(frames)) + print("frame rate: %s" % args.frame_rate) + print("interlacing: %s" % args.interlacing) + print("pixel ratio: %s" % args.pix_ratio) + print("color space: %s" % args.color_space) + print("----------------------------------------------------") + + print("Generating ...") + generate(args, frames) + if len(frames_mv) != 0: + args.output = args.output.replace(".y4m", "_mv.y4m") + generate(args, frames_mv) diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde new file mode 100644 index 00000000000..7249ee972eb --- /dev/null +++ 
b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/BVH.pde @@ -0,0 +1,163 @@ +/* + *AABB bounding box + *Bouding Volume Hierarchy + */ +class BoundingBox { + float min_x, min_y, min_z, max_x, max_y, max_z; + PVector center; + BoundingBox() { + min_x = Float.POSITIVE_INFINITY; + min_y = Float.POSITIVE_INFINITY; + min_z = Float.POSITIVE_INFINITY; + max_x = Float.NEGATIVE_INFINITY; + max_y = Float.NEGATIVE_INFINITY; + max_z = Float.NEGATIVE_INFINITY; + center = new PVector(); + } + // build a bounding box for a triangle + void create(Triangle t) { + min_x = min(t.p1.x, min(t.p2.x, t.p3.x)); + max_x = max(t.p1.x, max(t.p2.x, t.p3.x)); + + min_y = min(t.p1.y, min(t.p2.y, t.p3.y)); + max_y = max(t.p1.y, max(t.p2.y, t.p3.y)); + + min_z = min(t.p1.z, min(t.p2.z, t.p3.z)); + max_z = max(t.p1.z, max(t.p2.z, t.p3.z)); + center.x = (max_x + min_x) / 2; + center.y = (max_y + min_y) / 2; + center.z = (max_z + min_z) / 2; + } + // merge two bounding boxes + void add(BoundingBox bbx) { + min_x = min(min_x, bbx.min_x); + min_y = min(min_y, bbx.min_y); + min_z = min(min_z, bbx.min_z); + + max_x = max(max_x, bbx.max_x); + max_y = max(max_y, bbx.max_y); + max_z = max(max_z, bbx.max_z); + center.x = (max_x + min_x) / 2; + center.y = (max_y + min_y) / 2; + center.z = (max_z + min_z) / 2; + } + // get bounding box center axis value + float getCenterAxisValue(int axis) { + if (axis == 1) { + return center.x; + } else if (axis == 2) { + return center.y; + } + // when axis == 3 + return center.z; + } + // check if a ray is intersected with the bounding box + boolean intersect(Ray r) { + float tmin, tmax; + if (r.dir.x >= 0) { + tmin = (min_x - r.ori.x) * (1.0f / r.dir.x); + tmax = (max_x - r.ori.x) * (1.0f / r.dir.x); + } else { + tmin = (max_x - r.ori.x) * (1.0f / r.dir.x); + tmax = (min_x - r.ori.x) * (1.0f / r.dir.x); + } + + float tymin, tymax; + if (r.dir.y >= 0) { + tymin = (min_y - r.ori.y) * (1.0f / r.dir.y); + tymax = (max_y - r.ori.y) * (1.0f / r.dir.y); + } else { + tymin = (max_y - r.ori.y) * (1.0f / r.dir.y); + tymax = (min_y - r.ori.y) * (1.0f / r.dir.y); + } + + if (tmax < tymin || tymax < tmin) { + return false; + } + + tmin = tmin < tymin ? tymin : tmin; + tmax = tmax > tymax ? 
tymax : tmax; + + float tzmin, tzmax; + if (r.dir.z >= 0) { + tzmin = (min_z - r.ori.z) * (1.0f / r.dir.z); + tzmax = (max_z - r.ori.z) * (1.0f / r.dir.z); + } else { + tzmin = (max_z - r.ori.z) * (1.0f / r.dir.z); + tzmax = (min_z - r.ori.z) * (1.0f / r.dir.z); + } + if (tmax < tzmin || tmin > tzmax) { + return false; + } + return true; + } +} +// Bounding Volume Hierarchy +class BVH { + // Binary Tree + BVH left, right; + BoundingBox overall_bbx; + ArrayList mesh; + BVH(ArrayList mesh) { + this.mesh = mesh; + overall_bbx = new BoundingBox(); + left = null; + right = null; + int mesh_size = this.mesh.size(); + if (mesh_size <= 1) { + return; + } + // random select an axis + int axis = int(random(100)) % 3 + 1; + // build bounding box and save the selected center component + float[] axis_values = new float[mesh_size]; + for (int i = 0; i < mesh_size; i++) { + Triangle t = this.mesh.get(i); + overall_bbx.add(t.bbx); + axis_values[i] = t.bbx.getCenterAxisValue(axis); + } + // find the median value of selected center component as pivot + axis_values = sort(axis_values); + float pivot; + if (mesh_size % 2 == 1) { + pivot = axis_values[mesh_size / 2]; + } else { + pivot = + 0.5f * (axis_values[mesh_size / 2 - 1] + axis_values[mesh_size / 2]); + } + // Build left node and right node by partitioning the mesh based on triangle + // bounding box center component value + ArrayList left_mesh = new ArrayList(); + ArrayList right_mesh = new ArrayList(); + for (int i = 0; i < mesh_size; i++) { + Triangle t = this.mesh.get(i); + if (t.bbx.getCenterAxisValue(axis) < pivot) { + left_mesh.add(t); + } else if (t.bbx.getCenterAxisValue(axis) > pivot) { + right_mesh.add(t); + } else if (left_mesh.size() < right_mesh.size()) { + left_mesh.add(t); + } else { + right_mesh.add(t); + } + } + left = new BVH(left_mesh); + right = new BVH(right_mesh); + } + // check if a ray intersect with current volume + boolean intersect(Ray r, float[] param) { + if (mesh.size() == 0) { + return false; + } + if (mesh.size() == 1) { + Triangle t = mesh.get(0); + return t.intersect(r, param); + } + if (!overall_bbx.intersect(r)) { + return false; + } + boolean left_res = left.intersect(r, param); + boolean right_res = right.intersect(r, param); + return left_res || right_res; + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde new file mode 100644 index 00000000000..b39dae3a195 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Camera.pde @@ -0,0 +1,138 @@ +class Camera { + // camera's field of view + float fov; + // camera's position, look at point and axis + PVector pos, center, axis; + PVector init_pos, init_center, init_axis; + float move_speed; + float rot_speed; + Camera(float fov, PVector pos, PVector center, PVector axis) { + this.fov = fov; + this.pos = pos; + this.center = center; + this.axis = axis; + this.axis.normalize(); + move_speed = 0.001; + rot_speed = 0.01 * PI; + init_pos = pos.copy(); + init_center = center.copy(); + init_axis = axis.copy(); + } + + Camera copy() { + Camera cam = new Camera(fov, pos.copy(), center.copy(), axis.copy()); + return cam; + } + + PVector project(PVector pos) { + PVector proj = MatxVec3(getCameraMat(), PVector.sub(pos, this.pos)); + proj.x = (float)height / 2.0 * proj.x / proj.z / tan(fov / 2.0f); + proj.y = (float)height / 2.0 * proj.y / proj.z / tan(fov / 2.0f); + proj.z = proj.z; + return proj; + } + + float[] getCameraMat() { + float[] mat = 
new float[9]; + PVector dir = PVector.sub(center, pos); + dir.normalize(); + PVector left = dir.cross(axis); + left.normalize(); + // processing camera system does not follow right hand rule + mat[0] = -left.x; + mat[1] = -left.y; + mat[2] = -left.z; + mat[3] = axis.x; + mat[4] = axis.y; + mat[5] = axis.z; + mat[6] = dir.x; + mat[7] = dir.y; + mat[8] = dir.z; + + return mat; + } + + void run() { + PVector dir, left; + if (mousePressed) { + float angleX = (float)mouseX / width * PI - PI / 2; + float angleY = (float)mouseY / height * PI - PI; + PVector diff = PVector.sub(center, pos); + float radius = diff.mag(); + pos.x = radius * sin(angleY) * sin(angleX) + center.x; + pos.y = radius * cos(angleY) + center.y; + pos.z = radius * sin(angleY) * cos(angleX) + center.z; + dir = PVector.sub(center, pos); + dir.normalize(); + PVector up = new PVector(0, 1, 0); + left = up.cross(dir); + left.normalize(); + axis = dir.cross(left); + axis.normalize(); + } + + if (keyPressed) { + switch (key) { + case 'w': + dir = PVector.sub(center, pos); + dir.normalize(); + pos = PVector.add(pos, PVector.mult(dir, move_speed)); + center = PVector.add(center, PVector.mult(dir, move_speed)); + break; + case 's': + dir = PVector.sub(center, pos); + dir.normalize(); + pos = PVector.sub(pos, PVector.mult(dir, move_speed)); + center = PVector.sub(center, PVector.mult(dir, move_speed)); + break; + case 'a': + dir = PVector.sub(center, pos); + dir.normalize(); + left = axis.cross(dir); + left.normalize(); + pos = PVector.add(pos, PVector.mult(left, move_speed)); + center = PVector.add(center, PVector.mult(left, move_speed)); + break; + case 'd': + dir = PVector.sub(center, pos); + dir.normalize(); + left = axis.cross(dir); + left.normalize(); + pos = PVector.sub(pos, PVector.mult(left, move_speed)); + center = PVector.sub(center, PVector.mult(left, move_speed)); + break; + case 'r': + dir = PVector.sub(center, pos); + dir.normalize(); + float[] mat = getRotationMat3x3(rot_speed, dir.x, dir.y, dir.z); + axis = MatxVec3(mat, axis); + axis.normalize(); + break; + case 'b': + pos = init_pos.copy(); + center = init_center.copy(); + axis = init_axis.copy(); + break; + case '+': move_speed *= 2.0f; break; + case '-': move_speed /= 2.0; break; + case CODED: + if (keyCode == UP) { + pos = PVector.add(pos, PVector.mult(axis, move_speed)); + center = PVector.add(center, PVector.mult(axis, move_speed)); + } else if (keyCode == DOWN) { + pos = PVector.sub(pos, PVector.mult(axis, move_speed)); + center = PVector.sub(center, PVector.mult(axis, move_speed)); + } + } + } + } + void open() { + perspective(fov, float(width) / height, 1e-6, 1e5); + camera(pos.x, pos.y, pos.z, center.x, center.y, center.z, axis.x, axis.y, + axis.z); + } + void close() { + ortho(-width, 0, -height, 0); + camera(0, 0, 0, 0, 0, 1, 0, 1, 0); + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde new file mode 100644 index 00000000000..883a8f83103 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/MotionField.pde @@ -0,0 +1,94 @@ +class MotionField { + int block_size; + ArrayList motion_field; + MotionField(int block_size) { + this.block_size = block_size; + motion_field = new ArrayList(); + } + + void update(Camera last_cam, Camera current_cam, PointCloud point_cloud, + BVH bvh) { + // clear motion field + motion_field = new ArrayList(); + int r_num = height / block_size, c_num = width / block_size; + for (int i = 
0; i < r_num * c_num; i++) + motion_field.add(new PVector(0, 0, 0)); + // estimate motion vector of each point in point cloud + for (int i = 0; i < point_cloud.size(); i++) { + PVector p = point_cloud.getPosition(i); + PVector p0 = current_cam.project(p); + PVector p1 = last_cam.project(p); + int row = int((p0.y + height / 2.0f) / block_size); + int col = int((p0.x + width / 2.0f) / block_size); + if (row >= 0 && row < r_num && col >= 0 && col < c_num) { + PVector accu = motion_field.get(row * c_num + col); + accu.x += p1.x - p0.x; + accu.y += p1.y - p0.y; + accu.z += 1; + } + } + // if some blocks do not have point, then use ray tracing to see if they are + // in triangles + for (int i = 0; i < r_num; i++) + for (int j = 0; j < c_num; j++) { + PVector accu = motion_field.get(i * c_num + j); + if (accu.z > 0) { + continue; + } + // use the center of the block to generate view ray + float cx = j * block_size + block_size / 2.0f - width / 2.0f; + float cy = i * block_size + block_size / 2.0f - height / 2.0f; + float cz = 0.5f * height / tan(current_cam.fov / 2.0f); + PVector dir = new PVector(cx, cy, cz); + float[] camMat = current_cam.getCameraMat(); + dir = MatxVec3(transpose3x3(camMat), dir); + dir.normalize(); + Ray r = new Ray(current_cam.pos, dir); + // ray tracing + float[] param = new float[4]; + param[0] = Float.POSITIVE_INFINITY; + if (bvh.intersect(r, param)) { + PVector p = new PVector(param[1], param[2], param[3]); + PVector p0 = current_cam.project(p); + PVector p1 = last_cam.project(p); + accu.x += p1.x - p0.x; + accu.y += p1.y - p0.y; + accu.z += 1; + } + } + // estimate the motion vector of each block + for (int i = 0; i < r_num * c_num; i++) { + PVector mv = motion_field.get(i); + if (mv.z > 0) { + motion_field.set(i, new PVector(mv.x / mv.z, mv.y / mv.z, 0)); + } + } + } + + void render() { + int r_num = height / block_size, c_num = width / block_size; + for (int i = 0; i < r_num; i++) + for (int j = 0; j < c_num; j++) { + PVector mv = motion_field.get(i * c_num + j); + float ox = j * block_size + 0.5f * block_size; + float oy = i * block_size + 0.5f * block_size; + stroke(255, 0, 0); + line(ox, oy, ox + mv.x, oy + mv.y); + } + } + + void save(String path) { + int r_num = height / block_size; + int c_num = width / block_size; + String[] mvs = new String[r_num]; + for (int i = 0; i < r_num; i++) { + mvs[i] = ""; + for (int j = 0; j < c_num; j++) { + PVector mv = motion_field.get(i * c_num + j); + mvs[i] += str(mv.x) + "," + str(mv.y); + if (j != c_num - 1) mvs[i] += ";"; + } + } + saveStrings(path, mvs); + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde new file mode 100644 index 00000000000..714a6f3a0b3 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/PointCloud.pde @@ -0,0 +1,138 @@ +class PointCloud { + ArrayList points; // array to save points + IntList point_colors; // array to save points color + PVector cloud_mass; + float[] depth; + boolean[] real; + PointCloud() { + // initialize + points = new ArrayList(); + point_colors = new IntList(); + cloud_mass = new PVector(0, 0, 0); + depth = new float[width * height]; + real = new boolean[width * height]; + } + + void generate(PImage rgb_img, PImage depth_img, Transform trans) { + if (depth_img.width != width || depth_img.height != height || + rgb_img.width != width || rgb_img.height != height) { + println("rgb and depth file dimension should be same with window 
size"); + exit(); + } + // clear depth and real + for (int i = 0; i < width * height; i++) { + depth[i] = 0; + real[i] = false; + } + for (int v = 0; v < height; v++) + for (int u = 0; u < width; u++) { + // get depth value (red channel) + color depth_px = depth_img.get(u, v); + depth[v * width + u] = depth_px & 0x0000FFFF; + if (int(depth[v * width + u]) != 0) { + real[v * width + u] = true; + } + point_colors.append(rgb_img.get(u, v)); + } + for (int v = 0; v < height; v++) + for (int u = 0; u < width; u++) { + if (int(depth[v * width + u]) == 0) { + interpolateDepth(v, u); + } + // add transformed pixel as well as pixel color to the list + PVector pos = trans.transform(u, v, int(depth[v * width + u])); + points.add(pos); + // accumulate z value + cloud_mass = PVector.add(cloud_mass, pos); + } + } + void fillInDepthAlongPath(float d, Node node) { + node = node.parent; + while (node != null) { + int i = node.row; + int j = node.col; + if (depth[i * width + j] == 0) { + depth[i * width + j] = d; + } + node = node.parent; + } + } + // interpolate + void interpolateDepth(int row, int col) { + if (row < 0 || row >= height || col < 0 || col >= width || + int(depth[row * width + col]) != 0) { + return; + } + ArrayList queue = new ArrayList(); + queue.add(new Node(row, col, null)); + boolean[] visited = new boolean[width * height]; + for (int i = 0; i < width * height; i++) visited[i] = false; + visited[row * width + col] = true; + // Using BFS to Find the Nearest Neighbor + while (queue.size() > 0) { + // pop + Node node = queue.get(0); + queue.remove(0); + int i = node.row; + int j = node.col; + // if current position have a real depth + if (depth[i * width + j] != 0 && real[i * width + j]) { + fillInDepthAlongPath(depth[i * width + j], node); + break; + } else { + // search unvisited 8 neighbors + for (int r = max(0, i - 1); r < min(height, i + 2); r++) { + for (int c = max(0, j - 1); c < min(width, j + 2); c++) { + if (!visited[r * width + c]) { + visited[r * width + c] = true; + queue.add(new Node(r, c, node)); + } + } + } + } + } + } + // get point cloud size + int size() { return points.size(); } + // get ith position + PVector getPosition(int i) { + if (i >= points.size()) { + println("point position: index " + str(i) + " exceeds"); + exit(); + } + return points.get(i); + } + // get ith color + color getColor(int i) { + if (i >= point_colors.size()) { + println("point color: index " + str(i) + " exceeds"); + exit(); + } + return point_colors.get(i); + } + // get cloud center + PVector getCloudCenter() { + if (points.size() > 0) { + return PVector.div(cloud_mass, points.size()); + } + return new PVector(0, 0, 0); + } + // merge two clouds + void merge(PointCloud point_cloud) { + for (int i = 0; i < point_cloud.size(); i++) { + points.add(point_cloud.getPosition(i)); + point_colors.append(point_cloud.getColor(i)); + } + cloud_mass = PVector.add(cloud_mass, point_cloud.cloud_mass); + } +} + +class Node { + int row, col; + Node parent; + Node(int row, int col, Node parent) { + this.row = row; + this.col = col; + this.parent = parent; + } +} diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde new file mode 100644 index 00000000000..ef4be691c29 --- /dev/null +++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde @@ -0,0 +1,61 @@ +// Triangle +class Triangle { + // position + PVector p1, p2, p3; + // color + color c1, c2, c3; + BoundingBox bbx; + 
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde
new file mode 100644
index 00000000000..ef4be691c29
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Ray_Tracing.pde
@@ -0,0 +1,61 @@
+// Triangle
+class Triangle {
+  // positions
+  PVector p1, p2, p3;
+  // colors
+  color c1, c2, c3;
+  BoundingBox bbx;
+  Triangle(PVector p1, PVector p2, PVector p3, color c1, color c2, color c3) {
+    this.p1 = p1;
+    this.p2 = p2;
+    this.p3 = p3;
+    this.c1 = c1;
+    this.c2 = c2;
+    this.c3 = c3;
+    bbx = new BoundingBox();
+    bbx.create(this);
+  }
+  // check whether a ray intersects the triangle
+  boolean intersect(Ray r, float[] param) {
+    PVector p21 = PVector.sub(p2, p1);
+    PVector p31 = PVector.sub(p3, p1);
+    PVector po1 = PVector.sub(r.ori, p1);
+
+    PVector dxp31 = r.dir.cross(p31);
+    PVector po1xp21 = po1.cross(p21);
+    float denom = p21.dot(dxp31);
+    float t = p31.dot(po1xp21) / denom;
+    float alpha = po1.dot(dxp31) / denom;
+    float beta = r.dir.dot(po1xp21) / denom;
+
+    boolean res = t > 0 && alpha > 0 && alpha < 1 && beta > 0 && beta < 1 &&
+                  alpha + beta < 1;
+    // depth test
+    if (res && t < param[0]) {
+      param[0] = t;
+      param[1] = alpha * p1.x + beta * p2.x + (1 - alpha - beta) * p3.x;
+      param[2] = alpha * p1.y + beta * p2.y + (1 - alpha - beta) * p3.y;
+      param[3] = alpha * p1.z + beta * p2.z + (1 - alpha - beta) * p3.z;
+    }
+    return res;
+  }
+  void render() {
+    beginShape(TRIANGLES);
+    fill(c1);
+    vertex(p1.x, p1.y, p1.z);
+    fill(c2);
+    vertex(p2.x, p2.y, p2.z);
+    fill(c3);
+    vertex(p3.x, p3.y, p3.z);
+    endShape();
+  }
+}
+// Ray
+class Ray {
+  // origin and direction
+  PVector ori, dir;
+  Ray(PVector ori, PVector dir) {
+    this.ori = ori;
+    this.dir = dir;
+  }
+}
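Triangle.intersect() is the classic Moller-Trumbore ray/triangle test written out with explicit cross products. The equivalent in NumPy terms, as an illustrative sketch (the eps guard for near-parallel rays is an addition not present above, where a zero denominator simply fails the NaN comparisons):

    import numpy as np

    def ray_triangle(orig, direc, p1, p2, p3, eps=1e-9):
      e1, e2 = p2 - p1, p3 - p1
      pvec = np.cross(direc, e2)   # dxp31 above
      det = e1.dot(pvec)           # denom above
      if abs(det) < eps:           # near-parallel ray: no stable hit
        return None
      tvec = orig - p1             # po1 above
      qvec = np.cross(tvec, e1)    # po1xp21 above
      u = tvec.dot(pvec) / det     # alpha above
      v = direc.dot(qvec) / det    # beta above
      t = e2.dot(qvec) / det
      if t > 0 and u > 0 and v > 0 and u + v < 1:
        return orig + t * direc    # hit point
      return None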
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde
new file mode 100644
index 00000000000..cf79ab71418
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Scene.pde
@@ -0,0 +1,59 @@
+class Scene {
+  PointCloud point_cloud;
+  ArrayList<Triangle> mesh;
+  BVH bvh;
+  MotionField motion_field;
+  Camera last_cam;
+  Camera current_cam;
+  int frame_count;
+
+  Scene(Camera camera, PointCloud point_cloud, MotionField motion_field) {
+    this.point_cloud = point_cloud;
+    this.motion_field = motion_field;
+    mesh = new ArrayList<Triangle>();
+    for (int v = 0; v < height - 1; v++)
+      for (int u = 0; u < width - 1; u++) {
+        PVector p1 = point_cloud.getPosition(v * width + u);
+        PVector p2 = point_cloud.getPosition(v * width + u + 1);
+        PVector p3 = point_cloud.getPosition((v + 1) * width + u + 1);
+        PVector p4 = point_cloud.getPosition((v + 1) * width + u);
+        color c1 = point_cloud.getColor(v * width + u);
+        color c2 = point_cloud.getColor(v * width + u + 1);
+        color c3 = point_cloud.getColor((v + 1) * width + u + 1);
+        color c4 = point_cloud.getColor((v + 1) * width + u);
+        mesh.add(new Triangle(p1, p2, p3, c1, c2, c3));
+        mesh.add(new Triangle(p3, p4, p1, c3, c4, c1));
+      }
+    bvh = new BVH(mesh);
+    last_cam = camera.copy();
+    current_cam = camera;
+    frame_count = 0;
+  }
+
+  void run() {
+    last_cam = current_cam.copy();
+    current_cam.run();
+    motion_field.update(last_cam, current_cam, point_cloud, bvh);
+    frame_count += 1;
+  }
+
+  void render(boolean show_motion_field) {
+    // render the mesh
+    current_cam.open();
+    noStroke();
+    for (int i = 0; i < mesh.size(); i++) {
+      Triangle t = mesh.get(i);
+      t.render();
+    }
+    if (show_motion_field) {
+      current_cam.close();
+      motion_field.render();
+    }
+  }
+
+  void save(String path) { saveFrame(path + "_" + str(frame_count) + ".png"); }
+
+  void saveMotionField(String path) {
+    motion_field.save(path + "_" + str(frame_count) + ".txt");
+  }
+}
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde
new file mode 100644
index 00000000000..af2204e8cf2
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Transform.pde
@@ -0,0 +1,82 @@
+class Transform {
+  float[] inv_rot;   // inverse of the rotation matrix
+  PVector inv_mov;   // inverse of the movement vector
+  float focal;       // the focal distance of the real camera
+  int w, h;          // the width and height of the frame
+  float normalizer;  // normalization factor of depth
+  Transform(float tx, float ty, float tz, float qx, float qy, float qz,
+            float qw, float fov, int w, int h, float normalizer) {
+    // currently we do not use the real camera's position and quaternion;
+    // we may use them in the future when combining all frames
+    float[] rot = quaternion2Mat3x3(qx, qy, qz, qw);
+    inv_rot = transpose3x3(rot);
+    inv_mov = new PVector(-tx, -ty, -tz);
+    this.focal = 0.5f * h / tan(fov / 2.0);
+    this.w = w;
+    this.h = h;
+    this.normalizer = normalizer;
+  }
+
+  PVector transform(int i, int j, float d) {
+    // transfer from the camera view to the world view
+    float z = d / normalizer;
+    float x = (i - w / 2.0f) * z / focal;
+    float y = (j - h / 2.0f) * z / focal;
+    return new PVector(x, y, z);
+  }
+}
+
+// get the rotation matrix from a rotation axis and angle
+float[] getRotationMat3x3(float angle, float ax, float ay, float az) {
+  float[] mat = new float[9];
+  float c = cos(angle);
+  float s = sin(angle);
+  mat[0] = c + ax * ax * (1 - c);
+  mat[1] = ax * ay * (1 - c) - az * s;
+  mat[2] = ax * az * (1 - c) + ay * s;
+  mat[3] = ay * ax * (1 - c) + az * s;
+  mat[4] = c + ay * ay * (1 - c);
+  mat[5] = ay * az * (1 - c) - ax * s;
+  mat[6] = az * ax * (1 - c) - ay * s;
+  mat[7] = az * ay * (1 - c) + ax * s;
+  mat[8] = c + az * az * (1 - c);
+  return mat;
+}
+
+// get the rotation matrix from a quaternion
+float[] quaternion2Mat3x3(float qx, float qy, float qz, float qw) {
+  float[] mat = new float[9];
+  mat[0] = 1 - 2 * qy * qy - 2 * qz * qz;
+  mat[1] = 2 * qx * qy - 2 * qz * qw;
+  mat[2] = 2 * qx * qz + 2 * qy * qw;
+  mat[3] = 2 * qx * qy + 2 * qz * qw;
+  mat[4] = 1 - 2 * qx * qx - 2 * qz * qz;
+  mat[5] = 2 * qy * qz - 2 * qx * qw;
+  mat[6] = 2 * qx * qz - 2 * qy * qw;
+  mat[7] = 2 * qy * qz + 2 * qx * qw;
+  mat[8] = 1 - 2 * qx * qx - 2 * qy * qy;
+  return mat;
+}
+
+// transpose a 3x3 matrix
+float[] transpose3x3(float[] mat) {
+  float[] Tmat = new float[9];
+  for (int i = 0; i < 3; i++)
+    for (int j = 0; j < 3; j++) {
+      Tmat[i * 3 + j] = mat[j * 3 + i];
+    }
+  return Tmat;
+}
+
+// multiply a 3x3 matrix by a vector
+PVector MatxVec3(float[] mat, PVector v) {
+  float[] vec = v.array();
+  float[] res = new float[3];
+  for (int i = 0; i < 3; i++) {
+    res[i] = 0.0f;
+    for (int j = 0; j < 3; j++) {
+      res[i] += mat[i * 3 + j] * vec[j];
+    }
+  }
+  return new PVector(res[0], res[1], res[2]);
+}
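The quaternion-to-matrix conversion and the pinhole back-projection above translate directly to NumPy. A sketch under the same conventions (focal = 0.5 * h / tan(fov / 2), depth divided by the normalizer; the function names are illustrative):

    import numpy as np

    def quat_to_mat(qx, qy, qz, qw):
      # same formula as quaternion2Mat3x3(), row-major
      return np.array([
          [1 - 2 * qy * qy - 2 * qz * qz, 2 * qx * qy - 2 * qz * qw,
           2 * qx * qz + 2 * qy * qw],
          [2 * qx * qy + 2 * qz * qw, 1 - 2 * qx * qx - 2 * qz * qz,
           2 * qy * qz - 2 * qx * qw],
          [2 * qx * qz - 2 * qy * qw, 2 * qy * qz + 2 * qx * qw,
           1 - 2 * qx * qx - 2 * qy * qy],
      ])

    def back_project(u, v, d, w, h, fov, normalizer):
      # pinhole model used by Transform.transform()
      z = d / normalizer
      focal = 0.5 * h / np.tan(fov / 2.0)
      return np.array([(u - w / 2.0) * z / focal,
                       (v - h / 2.0) * z / focal, z])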
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
new file mode 100644
index 00000000000..19d124a0b34
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/Util.pde
@@ -0,0 +1,28 @@
+// show grids
+void showGrids(int block_size) {
+  ortho(-width, 0, -height, 0);
+  camera(0, 0, 0, 0, 0, 1, 0, 1, 0);
+  stroke(0, 0, 255);
+  for (int i = 0; i < height; i += block_size) {
+    line(0, i, width, i);
+  }
+  for (int i = 0; i < width; i += block_size) {
+    line(i, 0, i, height);
+  }
+}
+
+// save the point cloud information
+void savePointCloud(PointCloud point_cloud, String file_name) {
+  String[] positions = new String[point_cloud.points.size()];
+  String[] colors = new String[point_cloud.points.size()];
+  for (int i = 0; i < point_cloud.points.size(); i++) {
+    PVector point = point_cloud.getPosition(i);
+    color point_color = point_cloud.getColor(i);
+    positions[i] = str(point.x) + ' ' + str(point.y) + ' ' + str(point.z);
+    colors[i] = str(((point_color >> 16) & 0xFF) / 255.0) + ' ' +
+                str(((point_color >> 8) & 0xFF) / 255.0) + ' ' +
+                str((point_color & 0xFF) / 255.0);
+  }
+  saveStrings(file_name + "_pos.txt", positions);
+  saveStrings(file_name + "_color.txt", colors);
+}
diff --git a/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
new file mode 100644
index 00000000000..22a495432df
--- /dev/null
+++ b/libs/libvpx/tools/3D-Reconstruction/sketch_3D_reconstruction/sketch_3D_reconstruction.pde
@@ -0,0 +1,74 @@
+/*The dataset is from
+ *Computer Vision Group, TUM Department of Informatics,
+ *Technical University of Munich
+ *https://vision.in.tum.de/data/datasets/rgbd-dataset/download#freiburg1_xyz
+ */
+Scene scene;
+void setup() {
+  size(640, 480, P3D);
+  // default settings
+  int frame_no = 0;            // frame number
+  float fov = PI / 3;          // field of view
+  int block_size = 8;          // block size
+  float normalizer = 5000.0f;  // depth normalizer
+  // initialize
+  PointCloud point_cloud = new PointCloud();
+  // synchronized rgb, depth and ground truth
+  String head = "../data/";
+  String[] rgb_depth_gt = loadStrings(head + "rgb_depth_groundtruth.txt");
+  // read in the rgb and depth image file paths as well as the corresponding
+  // camera position and quaternion
+  String[] info = split(rgb_depth_gt[frame_no], ' ');
+  String rgb_path = head + info[1];
+  String depth_path = head + info[3];
+  float tx = float(info[7]), ty = float(info[8]),
+        tz = float(info[9]);  // real camera position
+  float qx = float(info[10]), qy = float(info[11]), qz = float(info[12]),
+        qw = float(info[13]);  // quaternion
+
+  // build the transformer
+  Transform trans =
+      new Transform(tx, ty, tz, qx, qy, qz, qw, fov, width, height, normalizer);
+  PImage rgb = loadImage(rgb_path);
+  PImage depth = loadImage(depth_path);
+  // generate the point cloud
+  point_cloud.generate(rgb, depth, trans);
+  // initialize the camera
+  Camera camera = new Camera(fov, new PVector(0, 0, 0), new PVector(0, 0, 1),
+                             new PVector(0, 1, 0));
+  // initialize the motion field
+  MotionField motion_field = new MotionField(block_size);
+  // initialize the scene
+  scene = new Scene(camera, point_cloud, motion_field);
+}
+boolean inter = false;
+void draw() {
+  background(0);
+  // camera controls:
+  //   drag the mouse to rotate the camera
+  //   w: go forward
+  //   s: go backward
+  //   a: go left
+  //   d: go right
+  //   up arrow: go up
+  //   down arrow: go down
+  //   +: increase move speed
+  //   -: decrease move speed
+  //   r: rotate the camera
+  //   b: reset to the initial position
+  scene.run();
+  // press 'o' to turn on interpolation
+  if (keyPressed && key == 'o') {
+    inter = true;
+  }
+  scene.render(false);  // true: render the motion field; false: do not
+  // save the frame with no motion field
+  scene.save("../data/frame/raw");
+  background(0);
+  scene.render(true);
+  showGrids(scene.motion_field.block_size);
+  // save the frame with the motion field
+  scene.save("../data/frame/raw_mv");
+  scene.saveMotionField("../data/frame/mv");
+}
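For reference, setup() above consumes one line of the synchronized rgb_depth_groundtruth.txt by fixed field index. An illustrative stand-alone parser with the same layout assumptions (not part of the patch):

    def parse_sync_line(line):
      f = line.split(' ')
      return {
          'rgb_path': f[1],
          'depth_path': f[3],
          'position': tuple(float(x) for x in f[7:10]),     # tx, ty, tz
          'quaternion': tuple(float(x) for x in f[10:14]),  # qx, qy, qz, qw
      }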
diff --git a/libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py b/libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py
new file mode 100644
index 00000000000..513faa435f9
--- /dev/null
+++ b/libs/libvpx/tools/non_greedy_mv/non_greedy_mv.py
@@ -0,0 +1,186 @@
+import sys
+import matplotlib.pyplot as plt
+from matplotlib.collections import LineCollection
+from matplotlib import colors as mcolors
+import numpy as np
+import math
+
+
+def draw_mv_ls(axis, mv_ls, mode=0):
+  colors = np.array([(1., 0., 0., 1.)])
+  segs = np.array([
+      np.array([[ptr[0], ptr[1]], [ptr[0] + ptr[2], ptr[1] + ptr[3]]])
+      for ptr in mv_ls
+  ])
+  line_segments = LineCollection(
+      segs, linewidths=(1.,), colors=colors, linestyle='solid')
+  axis.add_collection(line_segments)
+  if mode == 0:
+    axis.scatter(mv_ls[:, 0], mv_ls[:, 1], s=2, c='b')
+  else:
+    axis.scatter(
+        mv_ls[:, 0] + mv_ls[:, 2], mv_ls[:, 1] + mv_ls[:, 3], s=2, c='b')
+
+
+def draw_pred_block_ls(axis, mv_ls, bs, mode=0):
+  colors = np.array([(0., 0., 0., 1.)])
+  segs = []
+  for ptr in mv_ls:
+    if mode == 0:
+      x = ptr[0]
+      y = ptr[1]
+    else:
+      x = ptr[0] + ptr[2]
+      y = ptr[1] + ptr[3]
+    x_ls = [x, x + bs, x + bs, x, x]
+    y_ls = [y, y, y + bs, y + bs, y]
+
+    segs.append(np.column_stack([x_ls, y_ls]))
+  line_segments = LineCollection(
+      segs, linewidths=(.5,), colors=colors, linestyle='solid')
+  axis.add_collection(line_segments)
+
+
+def read_frame(fp, no_swap=0):
+  plane = [None, None, None]
+  for i in range(3):
+    line = fp.readline()
+    word_ls = line.split()
+    word_ls = [int(item) for item in word_ls]
+    rows = word_ls[0]
+    cols = word_ls[1]
+
+    line = fp.readline()
+    word_ls = line.split()
+    word_ls = [int(item) for item in word_ls]
+
+    plane[i] = np.array(word_ls).reshape(rows, cols)
+    if i > 0:
+      plane[i] = plane[i].repeat(2, axis=0).repeat(2, axis=1)
+  plane = np.array(plane)
+  if no_swap == 0:
+    plane = np.swapaxes(np.swapaxes(plane, 0, 1), 1, 2)
+  return plane
+
+
+def yuv_to_rgb(yuv):
+  #mat = np.array([
+  #    [1.164, 0     , 1.596 ],
+  #    [1.164, -0.391, -0.813],
+  #    [1.164, 2.018 , 0     ]]
+  #)
+  #c = np.array([[ -16 , -16 , -16 ],
+  #              [ 0   , -128, -128],
+  #              [ -128, -128, 0   ]])
+
+  mat = np.array([[1, 0, 1.4075], [1, -0.3445, -0.7169], [1, 1.7790, 0]])
+  c = np.array([[0, 0, 0], [0, -128, -128], [-128, -128, 0]])
+  mat_c = np.dot(mat, c)
+  v = np.array([mat_c[0, 0], mat_c[1, 1], mat_c[2, 2]])
+  mat = mat.transpose()
+  rgb = np.dot(yuv, mat) + v
+  rgb = rgb.astype(int)
+  rgb = rgb.clip(0, 255)
+  return rgb / 255.
+
+
+def read_feature_score(fp, mv_rows, mv_cols):
+  line = fp.readline()
+  word_ls = line.split()
+  feature_score = np.array([math.log(float(v) + 1, 2) for v in word_ls])
+  feature_score = feature_score.reshape(mv_rows, mv_cols)
+  return feature_score
+
+
+def read_mv_mode_arr(fp, mv_rows, mv_cols):
+  line = fp.readline()
+  word_ls = line.split()
+  mv_mode_arr = np.array([int(v) for v in word_ls])
+  mv_mode_arr = mv_mode_arr.reshape(mv_rows, mv_cols)
+  return mv_mode_arr
+
+
+def read_frame_dpl_stats(fp):
+  line = fp.readline()
+  word_ls = line.split()
+  frame_idx = int(word_ls[1])
+  mi_rows = int(word_ls[3])
+  mi_cols = int(word_ls[5])
+  bs = int(word_ls[7])
+  ref_frame_idx = int(word_ls[9])
+  rf_idx = int(word_ls[11])
+  gf_frame_offset = int(word_ls[13])
+  ref_gf_frame_offset = int(word_ls[15])
+  mi_size = bs // 8
+  mv_ls = []
+  mv_rows = int(math.ceil(mi_rows * 1. / mi_size))
+  mv_cols = int(math.ceil(mi_cols * 1. / mi_size))
+  for i in range(mv_rows * mv_cols):
+    line = fp.readline()
+    word_ls = line.split()
+    row = int(word_ls[0]) * 8.
+    col = int(word_ls[1]) * 8.
+    mv_row = int(word_ls[2]) / 8.
+    mv_col = int(word_ls[3]) / 8.
+    mv_ls.append([col, row, mv_col, mv_row])
+  mv_ls = np.array(mv_ls)
+  feature_score = read_feature_score(fp, mv_rows, mv_cols)
+  mv_mode_arr = read_mv_mode_arr(fp, mv_rows, mv_cols)
+  img = yuv_to_rgb(read_frame(fp))
+  ref = yuv_to_rgb(read_frame(fp))
+  return (rf_idx, frame_idx, ref_frame_idx, gf_frame_offset,
+          ref_gf_frame_offset, mv_ls, img, ref, bs, feature_score,
+          mv_mode_arr)
+
+
+def read_dpl_stats_file(filename, frame_num=0):
+  fp = open(filename)
+  line = fp.readline()
+  data_ls = []
+  while line:
+    if line[0] == '=':
+      data_ls.append(read_frame_dpl_stats(fp))
+    line = fp.readline()
+    if frame_num > 0 and len(data_ls) == frame_num:
+      break
+  return data_ls
+
+
+if __name__ == '__main__':
+  filename = sys.argv[1]
+  data_ls = read_dpl_stats_file(filename, frame_num=5)
+  for (rf_idx, frame_idx, ref_frame_idx, gf_frame_offset, ref_gf_frame_offset,
+       mv_ls, img, ref, bs, feature_score, mv_mode_arr) in data_ls:
+    fig, axes = plt.subplots(2, 2)
+
+    axes[0][0].imshow(img)
+    draw_mv_ls(axes[0][0], mv_ls)
+    draw_pred_block_ls(axes[0][0], mv_ls, bs, mode=0)
+    #axes[0].grid(color='k', linestyle='-')
+    axes[0][0].set_ylim(img.shape[0], 0)
+    axes[0][0].set_xlim(0, img.shape[1])
+
+    if ref is not None:
+      axes[0][1].imshow(ref)
+      draw_mv_ls(axes[0][1], mv_ls, mode=1)
+      draw_pred_block_ls(axes[0][1], mv_ls, bs, mode=1)
+      #axes[1].grid(color='k', linestyle='-')
+      axes[0][1].set_ylim(ref.shape[0], 0)
+      axes[0][1].set_xlim(0, ref.shape[1])
+
+    axes[1][0].imshow(feature_score)
+    #feature_score_arr = feature_score.flatten()
+    #feature_score_max = feature_score_arr.max()
+    #feature_score_min = feature_score_arr.min()
+    #step = (feature_score_max - feature_score_min) / 20.
+    #feature_score_bins = np.arange(feature_score_min, feature_score_max, step)
+    #axes[1][1].hist(feature_score_arr, bins=feature_score_bins)
+    im = axes[1][1].imshow(mv_mode_arr)
+    #axes[1][1].figure.colorbar(im, ax=axes[1][1])
+
+    print(rf_idx, frame_idx, ref_frame_idx, gf_frame_offset,
+          ref_gf_frame_offset, len(mv_ls))
+
+    flatten_mv_mode = mv_mode_arr.flatten()
+    zero_mv_count = sum(flatten_mv_mode == 0)
+    new_mv_count = sum(flatten_mv_mode == 1)
+    ref_mv_count = sum(flatten_mv_mode == 2) + sum(flatten_mv_mode == 3)
+    print(zero_mv_count, new_mv_count, ref_mv_count)
+    plt.show()
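The plane dump format that read_frame() consumes is plain text: per plane, one "rows cols" line followed by one line of sample values. An illustrative generator for a synthetic frame in that format (file name and sizes invented for the example):

    import numpy as np

    def write_frame(fp, planes):
      # one "rows cols" line, then one line with all samples, per plane
      for p in planes:
        fp.write('%d %d\n' % p.shape)
        fp.write(' '.join(str(v) for v in p.flatten()) + '\n')

    # hypothetical 8x8 frame: Y full size, U and V at quarter resolution
    with open('synthetic_frame.txt', 'w') as fp:
      y = np.zeros((8, 8), dtype=int)
      u = np.zeros((4, 4), dtype=int)
      v = np.zeros((4, 4), dtype=int)
      write_frame(fp, [y, u, v])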
diff --git a/libs/libvpx/tools/set_analyzer_env.sh b/libs/libvpx/tools/set_analyzer_env.sh
new file mode 100644
index 00000000000..4bdbba6523c
--- /dev/null
+++ b/libs/libvpx/tools/set_analyzer_env.sh
@@ -0,0 +1,142 @@
+## Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+##
+## Use of this source code is governed by a BSD-style license
+## that can be found in the LICENSE file in the root of the source
+## tree. An additional intellectual property rights grant can be found
+## in the file PATENTS. All contributing project authors may
+## be found in the AUTHORS file in the root of the source tree.
+##
+## Sourcing this file sets environment variables to simplify setting up
+## sanitizer builds and testing.
+
+sanitizer="${1}"
+
+case "${sanitizer}" in
+  address) ;;
+  cfi) ;;
+  integer) ;;
+  memory) ;;
+  thread) ;;
+  undefined) ;;
+  clear)
+    echo "Clearing environment:"
+    set -x
+    unset CC CXX LD AR
+    unset CFLAGS CXXFLAGS LDFLAGS
+    unset ASAN_OPTIONS MSAN_OPTIONS TSAN_OPTIONS UBSAN_OPTIONS
+    set +x
+    return
+    ;;
+  *)
+    echo "Usage: source set_analyzer_env.sh [<sanitizer>|clear]"
+    echo "  Supported sanitizers:"
+    echo "    address cfi integer memory thread undefined"
+    return 1
+    ;;
+esac
+
+if [ ! $(which clang) ]; then
+  # TODO(johannkoenig): Support gcc analyzers.
+  echo "ERROR: 'clang' must be in your PATH"
+  return 1
+fi
+
+# Warnings.
+if [ "${sanitizer}" = "undefined" -o "${sanitizer}" = "integer" ]; then
+  echo "WARNING: When building the ${sanitizer} sanitizer for 32 bit targets"
+  echo "you must run:"
+  echo "export LDFLAGS=\"\${LDFLAGS} --rtlib=compiler-rt -lgcc_s\""
+  echo "See http://llvm.org/bugs/show_bug.cgi?id=17693 for details."
+fi
+
+if [ "${sanitizer}" = "undefined" ]; then
+  major_version=$(clang --version | head -n 1 \
+    | grep -o -E "[[:digit:]]\.[[:digit:]]\.[[:digit:]]" | cut -f1 -d.)
+  if [ ${major_version} -eq 5 ]; then
+    echo "WARNING: clang v5 has a problem with vp9 x86_64 high bit depth"
+    echo "configurations. It can take ~40 minutes to compile"
+    echo "vpx_dsp/x86/fwd_txfm_sse2.c"
+    echo "clang v4 did not have this issue."
+  fi
+fi
+
+echo "It is recommended to configure with '--enable-debug' to improve stack"
+echo "traces. On Mac builds, run 'dsymutil' on the output binaries (vpxenc,"
+echo "test_libvpx, etc) to link the stack traces to source code lines."
+
+# Build configuration.
+cflags="-fsanitize=${sanitizer}"
+ldflags="-fsanitize=${sanitizer}"
+
+# http://code.google.com/p/webm/issues/detail?id=570
+cflags="${cflags} -fno-strict-aliasing"
+# Useful backtraces.
+cflags="${cflags} -fno-omit-frame-pointer"
+# Exact backtraces.
+cflags="${cflags} -fno-optimize-sibling-calls"
+
+if [ "${sanitizer}" = "cfi" ]; then
+  # https://clang.llvm.org/docs/ControlFlowIntegrity.html
+  cflags="${cflags} -fno-sanitize-trap=cfi -flto -fvisibility=hidden"
+  ldflags="${ldflags} -fno-sanitize-trap=cfi -flto -fuse-ld=gold"
+  export AR="llvm-ar"
+fi
+
+# TODO(http://crbug.com/webm/1615): -fsanitize=implicit-integer-truncation
+# causes conversion warnings in many of the x86 intrinsics and elsewhere.
+if [ "${sanitizer}" = "integer" ]; then
+  major_version=$(clang --version | head -n 1 \
+    | grep -o -E "[[:digit:]]\.[[:digit:]]\.[[:digit:]]" | cut -f1 -d.)
+  if [ ${major_version} -ge 7 ]; then
+    cflags="${cflags} -fno-sanitize=implicit-integer-truncation"
+    ldflags="${ldflags} -fno-sanitize=implicit-integer-truncation"
+  fi
+fi
+
+set -x
+export CC="clang"
+export CXX="clang++"
+export LD="clang++"
+
+export CFLAGS="${cflags}"
+export CXXFLAGS="${cflags}"
+export LDFLAGS="${ldflags}"
+set +x
+
+# Execution configuration.
+sanitizer_options=""
+sanitizer_options="${sanitizer_options}:handle_segv=1"
+sanitizer_options="${sanitizer_options}:handle_abort=1"
+sanitizer_options="${sanitizer_options}:handle_sigfpe=1"
+sanitizer_options="${sanitizer_options}:fast_unwind_on_fatal=1"
+sanitizer_options="${sanitizer_options}:allocator_may_return_null=1"
+
+case "${sanitizer}" in
+  address)
+    sanitizer_options="${sanitizer_options}:detect_stack_use_after_return=1"
+    sanitizer_options="${sanitizer_options}:max_uar_stack_size_log=17"
+    set -x
+    export ASAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+  cfi)
+    # No environment settings
+    ;;
+  memory)
+    set -x
+    export MSAN_OPTIONS="${sanitizer_options}"
+    set +x
+    ;;
+  thread)
+    # The thread sanitizer uses an entirely independent set of options.
+ set -x + export TSAN_OPTIONS="halt_on_error=1" + set +x + ;; + undefined|integer) + sanitizer_options="${sanitizer_options}:print_stacktrace=1" + set -x + export UBSAN_OPTIONS="${sanitizer_options}" + set +x + ;; +esac diff --git a/libs/libvpx/tools/tiny_ssim.c b/libs/libvpx/tools/tiny_ssim.c index 5e8ca02b49d..ff4634ade4f 100644 --- a/libs/libvpx/tools/tiny_ssim.c +++ b/libs/libvpx/tools/tiny_ssim.c @@ -34,6 +34,10 @@ static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, unsigned int row, col; uint64_t total_sse = 0; int diff; + if (orig == NULL || recon == NULL) { + assert(0); + return 0; + } for (row = 0; row < rows; row++) { for (col = 0; col < cols; col++) { @@ -46,13 +50,18 @@ static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, } return total_sse; } -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH + static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, int recon_stride, unsigned int cols, unsigned int rows) { unsigned int row, col; uint64_t total_sse = 0; int diff; + if (orig == NULL || recon == NULL) { + assert(0); + return 0; + } for (row = 0; row < rows; row++) { for (col = 0; col < cols; col++) { @@ -91,40 +100,43 @@ typedef struct input_file { int w; int h; int bit_depth; + int frame_size; } input_file_t; // Open a file and determine if its y4m or raw. If y4m get the header. static int open_input_file(const char *file_name, input_file_t *input, int w, int h, int bit_depth) { char y4m_buf[4]; - size_t r1; + input->w = w; + input->h = h; + input->bit_depth = bit_depth; input->type = RAW_YUV; input->buf = NULL; input->file = strcmp(file_name, "-") ? fopen(file_name, "rb") : stdin; if (input->file == NULL) return -1; - r1 = fread(y4m_buf, 1, 4, input->file); - if (r1 == 4) { - if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; - switch (input->type) { - case Y4M: - y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); - input->w = input->y4m.pic_w; - input->h = input->y4m.pic_h; - input->bit_depth = input->y4m.bit_depth; - // Y4M alloc's its own buf. Init this to avoid problems if we never - // read frames. - memset(&input->img, 0, sizeof(input->img)); - break; - case RAW_YUV: - fseek(input->file, 0, SEEK_SET); - input->w = w; - input->h = h; - if (bit_depth < 9) - input->buf = malloc(w * h * 3 / 2); - else - input->buf = malloc(w * h * 3); - break; - } + if (fread(y4m_buf, 1, 4, input->file) != 4) return -1; + if (memcmp(y4m_buf, "YUV4", 4) == 0) input->type = Y4M; + switch (input->type) { + case Y4M: + y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); + input->w = input->y4m.pic_w; + input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; + // Y4M alloc's its own buf. Init this to avoid problems if we never + // read frames. 
+ memset(&input->img, 0, sizeof(input->img)); + break; + case RAW_YUV: + fseek(input->file, 0, SEEK_SET); + input->w = w; + input->h = h; + // handle odd frame sizes + input->frame_size = w * h + ((w + 1) / 2) * ((h + 1) / 2) * 2; + if (bit_depth > 8) { + input->frame_size *= 2; + } + input->buf = malloc(input->frame_size); + break; } return 0; } @@ -150,15 +162,15 @@ static size_t read_input_file(input_file_t *in, unsigned char **y, break; case RAW_YUV: if (bd < 9) { - r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; *u = in->buf + in->w * in->h; - *v = in->buf + 5 * in->w * in->h / 4; + *v = *u + ((1 + in->w) / 2) * ((1 + in->h) / 2); } else { - r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + r1 = fread(in->buf, in->frame_size, 1, in->file); *y = in->buf; - *u = in->buf + in->w * in->h / 2; - *v = *u + in->w * in->h / 2; + *u = in->buf + (in->w * in->h) * 2; + *v = *u + 2 * ((1 + in->w) / 2) * ((1 + in->h) / 2); } break; } @@ -166,24 +178,15 @@ static size_t read_input_file(input_file_t *in, unsigned char **y, return r1; } -void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { +static void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { int i, j; - for (i = 0; i < 16; i++, s += sp, r += rp) { - for (j = 0; j < 16; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; - } + if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL || + sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) { + assert(0); + return; } -} -void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { - int i, j; for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; @@ -195,10 +198,17 @@ void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, } } -void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t *sum_s, uint32_t *sum_r, + uint32_t *sum_sq_s, uint32_t *sum_sq_r, + uint32_t *sum_sxr) { int i, j; + if (s == NULL || r == NULL || sum_s == NULL || sum_r == NULL || + sum_sq_s == NULL || sum_sq_r == NULL || sum_sxr == NULL) { + assert(0); + return; + } for (i = 0; i < 8; i++, s += sp, r += rp) { for (j = 0; j < 8; j++) { *sum_s += s[j]; @@ -209,11 +219,12 @@ void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, } } } +#endif // CONFIG_VP9_HIGHBITDEPTH static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { - int64_t ssim_n, ssim_d; + double ssim_n, ssim_d; int64_t c1 = 0, c2 = 0; if (bd == 8) { // scale the constants by number of pixels @@ -229,14 +240,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr 
- 2.0 * sum_s * sum_r + c2); - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); - return ssim_n * 1.0 / ssim_d; + return ssim_n / ssim_d; } static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { @@ -245,14 +256,15 @@ static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); } +#if CONFIG_VP9_HIGHBITDEPTH static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, - int rp, uint32_t bd, uint32_t shift) { + int rp, uint32_t bd) { uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), - sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, bd); } +#endif // CONFIG_VP9_HIGHBITDEPTH // We are using a 8x8 moving window with starting location of each 8x8 window // on the 4x4 pixel grid. Such arrangement allows the windows to overlap @@ -276,9 +288,10 @@ static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, return ssim_total; } +#if CONFIG_VP9_HIGHBITDEPTH static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, int stride_img2, int width, - int height, uint32_t bd, uint32_t shift) { + int height, uint32_t bd) { int i, j; int samples = 0; double ssim_total = 0; @@ -287,9 +300,9 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, for (i = 0; i <= height - 8; i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { for (j = 0; j <= width - 8; j += 4) { - double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, - CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, - shift); + double v = + highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd); ssim_total += v; samples++; } @@ -297,277 +310,7 @@ static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, ssim_total /= samples; return ssim_total; } - -// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity -// -// Re working out the math -> -// -// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / -// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) -// -// mean(x) = sum(x) / n -// -// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) -// -// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) -// -// ssim(x,y) = -// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / -// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * -// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ -// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) -// -// factoring out n*n -// -// ssim(x,y) = -// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / -// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * -// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) -// -// Replace c1 with n*n * c1 for the final step that leads to this code: -// The final step scales by 12 bits so we don't lose precision in the constants. - -static double ssimv_similarity(const Ssimv *sv, int64_t n) { - // Scale the constants by number of pixels. 
- const int64_t c1 = (cc1 * n * n) >> 12; - const int64_t c2 = (cc2 * n * n) >> 12; - - const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / - (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); - - // Since these variables are unsigned sums, convert to double so - // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / - (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); - - return l * v; -} - -// The first term of the ssim metric is a luminance factor. -// -// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) -// -// This luminance factor is super sensitive to the dark side of luminance -// values and completely insensitive on the white side. check out 2 sets -// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 -// 2*250*252/ (250^2+252^2) => .99999997 -// -// As a result in this tweaked version of the calculation in which the -// luminance is taken as percentage off from peak possible. -// -// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count -// -static double ssimv_similarity2(const Ssimv *sv, int64_t n) { - // Scale the constants by number of pixels. - const int64_t c1 = (cc1 * n * n) >> 12; - const int64_t c2 = (cc2 * n * n) >> 12; - - const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; - const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); - - // Since these variables are unsigned, sums convert to double so - // math is done in double arithmetic. - const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / - (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + - n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); - - return l * v; -} -static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, Ssimv *sv) { - ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, - &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); -} - -double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, - int img2_pitch, int width, int height, Ssimv *sv2, - Metrics *m, int do_inconsistency) { - double dssim_total = 0; - double ssim_total = 0; - double ssim2_total = 0; - double inconsistency_total = 0; - int i, j; - int c = 0; - double norm; - double old_ssim_total = 0; - - // We can sample points as frequently as we like start with 1 per 4x4. - for (i = 0; i < height; - i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { - for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0, 0, 0, 0, 0, 0 }; - double ssim; - double ssim2; - double dssim; - uint32_t var_new; - uint32_t var_old; - uint32_t mean_new; - uint32_t mean_old; - double ssim_new; - double ssim_old; - - // Not sure there's a great way to handle the edge pixels - // in ssim when using a window. Seems biased against edge pixels - // however you handle this. This uses only samples that are - // fully in the frame. - if (j + 8 <= width && i + 8 <= height) { - ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); - } - - ssim = ssimv_similarity(&sv, 64); - ssim2 = ssimv_similarity2(&sv, 64); - - sv.ssim = ssim2; - - // dssim is calculated to use as an actual error metric and - // is scaled up to the same range as sum square error. - // Since we are subsampling every 16th point maybe this should be - // *16 ? - dssim = 255 * 255 * (1 - ssim2) / 2; - - // Here I introduce a new error metric: consistency-weighted - // SSIM-inconsistency. This metric isolates frames where the - // SSIM 'suddenly' changes, e.g. 
if one frame in every 8 is much - // sharper or blurrier than the others. Higher values indicate a - // temporally inconsistent SSIM. There are two ideas at work: - // - // 1) 'SSIM-inconsistency': the total inconsistency value - // reflects how much SSIM values are changing between this - // source / reference frame pair and the previous pair. - // - // 2) 'consistency-weighted': weights de-emphasize areas in the - // frame where the scene content has changed. Changes in scene - // content are detected via changes in local variance and local - // mean. - // - // Thus the overall measure reflects how inconsistent the SSIM - // values are, over consistent regions of the frame. - // - // The metric has three terms: - // - // term 1 -> uses change in scene Variance to weight error score - // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) - // larger changes from one frame to the next mean we care - // less about consistency. - // - // term 2 -> uses change in local scene luminance to weight error - // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) - // larger changes from one frame to the next mean we care - // less about consistency. - // - // term3 -> measures inconsistency in ssim scores between frames - // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). - // - // This term compares the ssim score for the same location in 2 - // subsequent frames. - var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; - var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; - mean_new = sv.sum_s; - mean_old = sv2[c].sum_s; - ssim_new = sv.ssim; - ssim_old = sv2[c].ssim; - - if (do_inconsistency) { - // We do the metric once for every 4x4 block in the image. Since - // we are scaling the error to SSE for use in a psnr calculation - // 1.0 = 4x4x255x255 the worst error we can possibly have. - static const double kScaling = 4. * 4 * 255 * 255; - - // The constants have to be non 0 to avoid potential divide by 0 - // issues other than that they affect kind of a weighting between - // the terms. No testing of what the right terms should be has been - // done. - static const double c1 = 1, c2 = 1, c3 = 1; - - // This measures how much consistent variance is in two consecutive - // source frames. 1.0 means they have exactly the same variance. - const double variance_term = - (2.0 * var_old * var_new + c1) / - (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); - - // This measures how consistent the local mean are between two - // consecutive frames. 1.0 means they have exactly the same mean. - const double mean_term = - (2.0 * mean_old * mean_new + c2) / - (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); - - // This measures how consistent the ssims of two - // consecutive frames is. 1.0 means they are exactly the same. - double ssim_term = - pow((2.0 * ssim_old * ssim_new + c3) / - (ssim_old * ssim_old + ssim_new * ssim_new + c3), - 5); - - double this_inconsistency; - - // Floating point math sometimes makes this > 1 by a tiny bit. - // We want the metric to scale between 0 and 1.0 so we can convert - // it to an snr scaled value. - if (ssim_term > 1) ssim_term = 1; - - // This converts the consistency metric to an inconsistency metric - // ( so we can scale it like psnr to something like sum square error. - // The reason for the variance and mean terms is the assumption that - // if there are big changes in the source we shouldn't penalize - // inconsistency in ssim scores a bit less as it will be less visible - // to the user. 
- this_inconsistency = (1 - ssim_term) * variance_term * mean_term; - - this_inconsistency *= kScaling; - inconsistency_total += this_inconsistency; - } - sv2[c] = sv; - ssim_total += ssim; - ssim2_total += ssim2; - dssim_total += dssim; - - old_ssim_total += ssim_old; - } - old_ssim_total += 0; - } - - norm = 1. / (width / 4) / (height / 4); - ssim_total *= norm; - ssim2_total *= norm; - m->ssim2 = ssim2_total; - m->ssim = ssim_total; - if (old_ssim_total == 0) inconsistency_total = 0; - - m->ssimc = inconsistency_total; - - m->dssim = dssim_total; - return inconsistency_total; -} - -double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, - const YV12_BUFFER_CONFIG *dest, double *weight, - uint32_t bd, uint32_t in_bd) { - double a, b, c; - double ssimv; - uint32_t shift = 0; - - assert(bd >= in_bd); - shift = bd - in_bd; - - a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, - dest->y_stride, source->y_crop_width, source->y_crop_height, - in_bd, shift); - - b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, - dest->uv_stride, source->uv_crop_width, - source->uv_crop_height, in_bd, shift); - - ssimv = a * .8 + .1 * (b + c); - - *weight = 1; - - return ssimv; -} +#endif // CONFIG_VP9_HIGHBITDEPTH int main(int argc, char *argv[]) { FILE *framestats = NULL; @@ -583,13 +326,14 @@ int main(int argc, char *argv[]) { input_file_t in[2]; double peak = 255.0; + memset(in, 0, sizeof(in)); + if (argc < 2) { fprintf(stderr, "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" "[WxH tl_skip={0,1,3} frame_stats_file bits]\n", argv[0]); - return_value = 1; - goto clean_up; + return 1; } if (argc > 3) { @@ -601,7 +345,7 @@ int main(int argc, char *argv[]) { } if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { - fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); + fprintf(stderr, "File %s can't be opened or parsed!\n", argv[1]); goto clean_up; } @@ -613,7 +357,7 @@ int main(int argc, char *argv[]) { } if (bit_depth == 10) peak = 1023.0; - if (bit_depth == 12) peak = 4095; + if (bit_depth == 12) peak = 4095.0; if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); @@ -628,9 +372,19 @@ int main(int argc, char *argv[]) { goto clean_up; } - // Number of frames to skip from file1.yuv for every frame used. Normal values - // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding - // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + if (in[0].bit_depth != in[1].bit_depth) { + fprintf(stderr, + "Failing: Image bit depths don't match or are unspecified!\n"); + return_value = 1; + goto clean_up; + } + + bit_depth = in[0].bit_depth; + + // Number of frames to skip from file1.yuv for every frame used. Normal + // values 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL + // encoding in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer + // encoding. 
if (argc > 4) { sscanf(argv[4], "%d", &tl_skip); if (argc > 5) { @@ -644,12 +398,6 @@ int main(int argc, char *argv[]) { } } - if (w & 1 || h & 1) { - fprintf(stderr, "Invalid size %dx%d\n", w, h); - return_value = 1; - goto clean_up; - } - while (1) { size_t r1, r2; unsigned char *y[2], *u[2], *v[2]; @@ -683,7 +431,7 @@ int main(int argc, char *argv[]) { psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ } else { \ ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ - w, w, h, bit_depth, bit_depth - 8); \ + w, w, h, bit_depth); \ psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ CAST_TO_SHORTPTR(buf1), w, w, h); \ } @@ -691,7 +439,7 @@ int main(int argc, char *argv[]) { #define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ ssim = ssim2(buf0, buf1, w, w, w, h); \ psnr = calc_plane_error(buf0, w, buf1, w, w, h); -#endif +#endif // CONFIG_VP9_HIGHBITDEPTH if (n_frames == allocated_frames) { allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2; @@ -703,8 +451,10 @@ int main(int argc, char *argv[]) { psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv)); } psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h); - psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], w / 2, h / 2); - psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], w / 2, h / 2); + psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2, + (h + 1) / 2); + psnr_and_ssim(ssimv[n_frames], psnrv[n_frames], v[0], v[1], (w + 1) / 2, + (h + 1) / 2); n_frames++; } diff --git a/libs/libvpx/tools_common.c b/libs/libvpx/tools_common.c index 6f14c255617..59978b7f93a 100644 --- a/libs/libvpx/tools_common.c +++ b/libs/libvpx/tools_common.c @@ -46,6 +46,14 @@ va_end(ap); \ } while (0) +#if CONFIG_ENCODERS +/* Swallow warnings about unused results of fread/fwrite */ +static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { + return fread(ptr, size, nmemb, stream); +} +#define fread wrap_fread +#endif + FILE *set_binary_mode(FILE *stream) { (void)stream; #if defined(_WIN32) || defined(__OS2__) @@ -200,8 +208,6 @@ const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc) { #endif // CONFIG_DECODERS -// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part -// of vpx_image_t support int vpx_img_plane_width(const vpx_image_t *img, int plane) { if (plane > 0 && img->x_chroma_shift > 0) return (img->d_w + 1) >> img->x_chroma_shift; @@ -266,6 +272,88 @@ double sse_to_psnr(double samples, double peak, double sse) { } } +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { + FILE *f = input_ctx->file; + y4m_input *y4m = &input_ctx->y4m; + int shortread = 0; + + if (input_ctx->file_type == FILE_TYPE_Y4M) { + if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; + } else { + shortread = read_yuv_frame(input_ctx, img); + } + + return !shortread; +} + +int file_is_y4m(const char detect[4]) { + if (memcmp(detect, "YUV4", 4) == 0) { + return 1; + } + return 0; +} + +int fourcc_is_ivf(const char detect[4]) { + if (memcmp(detect, "DKIF", 4) == 0) { + return 1; + } + return 0; +} + +void open_input_file(struct VpxInputContext *input) { + /* Parse certain options from the input file, if possible */ + input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") + : set_binary_mode(stdin); + + if (!input->file) fatal("Failed to open input file"); + + if (!fseeko(input->file, 0, SEEK_END)) { + /* Input file is seekable. 
Figure out how long it is, so we can get + * progress info. + */ + input->length = ftello(input->file); + rewind(input->file); + } + + /* Default to 1:1 pixel aspect ratio. */ + input->pixel_aspect_ratio.numerator = 1; + input->pixel_aspect_ratio.denominator = 1; + + /* For RAW input sources, these bytes will be applied on the first frame + * in read_frame(). + */ + input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); + input->detect.position = 0; + + if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { + if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, + input->only_i420) >= 0) { + input->file_type = FILE_TYPE_Y4M; + input->width = input->y4m.pic_w; + input->height = input->y4m.pic_h; + input->pixel_aspect_ratio.numerator = input->y4m.par_n; + input->pixel_aspect_ratio.denominator = input->y4m.par_d; + input->framerate.numerator = input->y4m.fps_n; + input->framerate.denominator = input->y4m.fps_d; + input->fmt = input->y4m.vpx_fmt; + input->bit_depth = input->y4m.bit_depth; + } else { + fatal("Unsupported Y4M stream."); + } + } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { + fatal("IVF is not supported as input."); + } else { + input->file_type = FILE_TYPE_RAW; + } +} + +void close_input_file(struct VpxInputContext *input) { + fclose(input->file); + if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); +} +#endif + // TODO(debargha): Consolidate the functions below into a separate file. #if CONFIG_VP9_HIGHBITDEPTH static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, @@ -459,3 +547,225 @@ void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift) { } } #endif // CONFIG_VP9_HIGHBITDEPTH + +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2) { + uint32_t l_w = img1->d_w; + uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + uint32_t i; + int match = 1; + + match &= (img1->fmt == img2->fmt); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); +#if CONFIG_VP9_HIGHBITDEPTH + if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { + l_w *= 2; + c_w *= 2; + } +#endif + + for (i = 0; i < img1->d_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], + img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], + l_w) == 0); + + for (i = 0; i < c_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], + img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], + c_w) == 0); + + for (i = 0; i < c_h; ++i) + match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], + img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], + c_w) == 0); + + return match; +} + +#define mmin(a, b) ((a) < (b) ?
(a) : (b)) + +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]) { + uint16_t *plane1, *plane2; + uint32_t stride1, stride2; + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; + stride1 = img1->stride[VPX_PLANE_Y] / 2; + stride2 = img2->stride[VPX_PLANE_Y] / 2; + for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(plane1 + (i + k) * stride1 + j + l); + yloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; + stride1 = img1->stride[VPX_PLANE_U] / 2; + stride2 = img2->stride[VPX_PLANE_U] / 2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(plane1 + (i + k) * stride1 + j + l); + uloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } + + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; + plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; + stride1 = img1->stride[VPX_PLANE_V] / 2; + stride2 = img2->stride[VPX_PLANE_V] / 2; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(plane1 + (i + k) * stride1 + j + l) != + *(plane2 + (i + k) * stride2 + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(plane1 + (i + k) * stride1 + j + l); + vloc[3] = *(plane2 + (i + k) * stride2 + j + l); + match = 0; + break; + } + } + } + } + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]) { + const uint32_t bsize = 64; + const uint32_t bsizey = bsize >> img1->y_chroma_shift; + const uint32_t bsizex = bsize >> img1->x_chroma_shift; + const uint32_t c_w = + (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; + const uint32_t c_h = + (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; + int match = 1; + uint32_t i, j; + yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; + for (i = 0, match = 1; match && i < 
img1->d_h; i += bsize) { + for (j = 0; match && j < img1->d_w; j += bsize) { + int k, l; + const int si = mmin(i + bsize, img1->d_h) - i; + const int sj = mmin(j + bsize, img1->d_w) - j; + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != + *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { + yloc[0] = i + k; + yloc[1] = j + l; + yloc[2] = *(img1->planes[VPX_PLANE_Y] + + (i + k) * img1->stride[VPX_PLANE_Y] + j + l); + yloc[3] = *(img2->planes[VPX_PLANE_Y] + + (i + k) * img2->stride[VPX_PLANE_Y] + j + l); + match = 0; + break; + } + } + } + } + } + + uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l) != + *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { + uloc[0] = i + k; + uloc[1] = j + l; + uloc[2] = *(img1->planes[VPX_PLANE_U] + + (i + k) * img1->stride[VPX_PLANE_U] + j + l); + uloc[3] = *(img2->planes[VPX_PLANE_U] + + (i + k) * img2->stride[VPX_PLANE_U] + j + l); + match = 0; + break; + } + } + } + } + } + vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; + for (i = 0, match = 1; match && i < c_h; i += bsizey) { + for (j = 0; match && j < c_w; j += bsizex) { + int k, l; + const int si = mmin(i + bsizey, c_h - i); + const int sj = mmin(j + bsizex, c_w - j); + for (k = 0; match && k < si; ++k) { + for (l = 0; match && l < sj; ++l) { + if (*(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l) != + *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { + vloc[0] = i + k; + vloc[1] = j + l; + vloc[2] = *(img1->planes[VPX_PLANE_V] + + (i + k) * img1->stride[VPX_PLANE_V] + j + l); + vloc[3] = *(img2->planes[VPX_PLANE_V] + + (i + k) * img2->stride[VPX_PLANE_V] + j + l); + match = 0; + break; + } + } + } + } + } +} diff --git a/libs/libvpx/tools_common.h b/libs/libvpx/tools_common.h index e41de3195f5..4526d9f165c 100644 --- a/libs/libvpx/tools_common.h +++ b/libs/libvpx/tools_common.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef TOOLS_COMMON_H_ -#define TOOLS_COMMON_H_ +#ifndef VPX_TOOLS_COMMON_H_ +#define VPX_TOOLS_COMMON_H_ #include <stdio.h> @@ -33,6 +33,7 @@ typedef int64_t FileOffset; #define ftello ftello64 typedef off64_t FileOffset; #elif CONFIG_OS_SUPPORT +#include <sys/types.h> /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM * executables (.axf) with RVCT.
*/ @@ -144,8 +145,6 @@ const VpxInterface *get_vpx_decoder_by_index(int i); const VpxInterface *get_vpx_decoder_by_name(const char *name); const VpxInterface *get_vpx_decoder_by_fourcc(uint32_t fourcc); -// TODO(dkovalev): move this function to vpx_image.{c, h}, so it will be part -// of vpx_image_t support int vpx_img_plane_width(const vpx_image_t *img, int plane); int vpx_img_plane_height(const vpx_image_t *img, int plane); void vpx_img_write(const vpx_image_t *img, FILE *file); @@ -153,14 +152,31 @@ int vpx_img_read(vpx_image_t *img, FILE *file); double sse_to_psnr(double samples, double peak, double mse); +#if CONFIG_ENCODERS +int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img); +int file_is_y4m(const char detect[4]); +int fourcc_is_ivf(const char detect[4]); +void open_input_file(struct VpxInputContext *input); +void close_input_file(struct VpxInputContext *input); +#endif + #if CONFIG_VP9_HIGHBITDEPTH void vpx_img_upshift(vpx_image_t *dst, vpx_image_t *src, int input_shift); void vpx_img_downshift(vpx_image_t *dst, vpx_image_t *src, int down_shift); void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src); #endif +int compare_img(const vpx_image_t *const img1, const vpx_image_t *const img2); +#if CONFIG_VP9_HIGHBITDEPTH +void find_mismatch_high(const vpx_image_t *const img1, + const vpx_image_t *const img2, int yloc[4], int uloc[4], + int vloc[4]); +#endif +void find_mismatch(const vpx_image_t *const img1, const vpx_image_t *const img2, + int yloc[4], int uloc[4], int vloc[4]); + #ifdef __cplusplus } /* extern "C" */ #endif -#endif // TOOLS_COMMON_H_ +#endif // VPX_TOOLS_COMMON_H_ diff --git a/libs/libvpx/usage_cx.dox b/libs/libvpx/usage_cx.dox index 92b0d34ef4d..b2220cfddeb 100644 --- a/libs/libvpx/usage_cx.dox +++ b/libs/libvpx/usage_cx.dox @@ -8,6 +8,8 @@ \ref usage_deadline. + \if samples \ref samples + \endif */ diff --git a/libs/libvpx/usage_dx.dox b/libs/libvpx/usage_dx.dox index 883ce24926f..85063f705b6 100644 --- a/libs/libvpx/usage_dx.dox +++ b/libs/libvpx/usage_dx.dox @@ -11,7 +11,9 @@ \ref usage_postproc based on the amount of free CPU time. For more information on the deadline parameter, see \ref usage_deadline. + \if samples \ref samples + \endif \section usage_cb Callback Based Decoding diff --git a/libs/libvpx/video_common.h b/libs/libvpx/video_common.h index 44b27a83901..77eb9fac0cd 100644 --- a/libs/libvpx/video_common.h +++ b/libs/libvpx/video_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VIDEO_COMMON_H_ -#define VIDEO_COMMON_H_ +#ifndef VPX_VIDEO_COMMON_H_ +#define VPX_VIDEO_COMMON_H_ #include "./tools_common.h" @@ -20,4 +20,4 @@ typedef struct { struct VpxRational time_base; } VpxVideoInfo; -#endif // VIDEO_COMMON_H_ +#endif // VPX_VIDEO_COMMON_H_ diff --git a/libs/libvpx/video_reader.c b/libs/libvpx/video_reader.c index a0ba2521c61..16822eff3c1 100644 --- a/libs/libvpx/video_reader.c +++ b/libs/libvpx/video_reader.c @@ -30,17 +30,37 @@ VpxVideoReader *vpx_video_reader_open(const char *filename) { char header[32]; VpxVideoReader *reader = NULL; FILE *const file = fopen(filename, "rb"); - if (!file) return NULL; // Can't open file + if (!file) { + fprintf(stderr, "%s can't be opened.\n", filename); // Can't open file + return NULL; + } - if (fread(header, 1, 32, file) != 32) return NULL; // Can't read file header + if (fread(header, 1, 32, file) != 32) { + fprintf(stderr, "File header on %s can't be read.\n", + filename); // Can't read file header + return NULL; + } + if (memcmp(kIVFSignature, header, 4) != 0) { + fprintf(stderr, "The IVF signature on %s is wrong.\n", + filename); // Wrong IVF signature - if (memcmp(kIVFSignature, header, 4) != 0) - return NULL; // Wrong IVF signature + return NULL; + } + if (mem_get_le16(header + 4) != 0) { + fprintf(stderr, "%s uses the wrong IVF version.\n", + filename); // Wrong IVF version - if (mem_get_le16(header + 4) != 0) return NULL; // Wrong IVF version + return NULL; + } reader = calloc(1, sizeof(*reader)); - if (!reader) return NULL; // Can't allocate VpxVideoReader + if (!reader) { + fprintf( + stderr, + "Can't allocate VpxVideoReader\n"); // Can't allocate VpxVideoReader + + return NULL; + } reader->file = file; reader->info.codec_fourcc = mem_get_le32(header + 8); diff --git a/libs/libvpx/video_reader.h b/libs/libvpx/video_reader.h index 73c25b00a7d..1f5c8088bb8 100644 --- a/libs/libvpx/video_reader.h +++ b/libs/libvpx/video_reader.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VIDEO_READER_H_ -#define VIDEO_READER_H_ +#ifndef VPX_VIDEO_READER_H_ +#define VPX_VIDEO_READER_H_ #include "./video_common.h" @@ -48,4 +48,4 @@ const VpxVideoInfo *vpx_video_reader_get_info(VpxVideoReader *reader); } // extern "C" #endif -#endif // VIDEO_READER_H_ +#endif // VPX_VIDEO_READER_H_ diff --git a/libs/libvpx/video_writer.c b/libs/libvpx/video_writer.c index 56d428b0720..6e9a848bc3d 100644 --- a/libs/libvpx/video_writer.c +++ b/libs/libvpx/video_writer.c @@ -37,11 +37,15 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, if (container == kContainerIVF) { VpxVideoWriter *writer = NULL; FILE *const file = fopen(filename, "wb"); - if (!file) return NULL; - + if (!file) { + fprintf(stderr, "%s can't be written to.\n", filename); + return NULL; + } writer = malloc(sizeof(*writer)); - if (!writer) return NULL; - + if (!writer) { + fprintf(stderr, "Can't allocate VpxVideoWriter.\n"); + return NULL; + } writer->frame_count = 0; writer->info = *info; writer->file = file; @@ -50,7 +54,7 @@ VpxVideoWriter *vpx_video_writer_open(const char *filename, return writer; } - + fprintf(stderr, "VpxVideoWriter supports only IVF.\n"); return NULL; } diff --git a/libs/libvpx/video_writer.h b/libs/libvpx/video_writer.h index a769811c440..b4d242b920f 100644 --- a/libs/libvpx/video_writer.h +++ b/libs/libvpx/video_writer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VIDEO_WRITER_H_ -#define VIDEO_WRITER_H_ +#ifndef VPX_VIDEO_WRITER_H_ +#define VPX_VIDEO_WRITER_H_ #include "./video_common.h" @@ -41,4 +41,4 @@ int vpx_video_writer_write_frame(VpxVideoWriter *writer, const uint8_t *buffer, } // extern "C" #endif -#endif // VIDEO_WRITER_H_ +#endif // VPX_VIDEO_WRITER_H_ diff --git a/libs/libvpx/vp8/common/alloccommon.h b/libs/libvpx/vp8/common/alloccommon.h index 5d0840c670c..2d376bbac31 100644 --- a/libs/libvpx/vp8/common/alloccommon.h +++ b/libs/libvpx/vp8/common/alloccommon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ALLOCCOMMON_H_ -#define VP8_COMMON_ALLOCCOMMON_H_ +#ifndef VPX_VP8_COMMON_ALLOCCOMMON_H_ +#define VPX_VP8_COMMON_ALLOCCOMMON_H_ #include "onyxc_int.h" @@ -21,10 +21,10 @@ void vp8_create_common(VP8_COMMON *oci); void vp8_remove_common(VP8_COMMON *oci); void vp8_de_alloc_frame_buffers(VP8_COMMON *oci); int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height); -void vp8_setup_version(VP8_COMMON *oci); +void vp8_setup_version(VP8_COMMON *cm); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_ALLOCCOMMON_H_ +#endif // VPX_VP8_COMMON_ALLOCCOMMON_H_ diff --git a/libs/libvpx/vp8/common/arm/loopfilter_arm.c b/libs/libvpx/vp8/common/arm/loopfilter_arm.c index e12f65a0429..48a19720483 100644 --- a/libs/libvpx/vp8/common/arm/loopfilter_arm.c +++ b/libs/libvpx/vp8/common/arm/loopfilter_arm.c @@ -8,28 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include "./vpx_config.h" +#include "./vp8_rtcd.h" +#include "vp8/common/arm/loopfilter_arm.h" #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" -typedef void loopfilter_y_neon(unsigned char *src, int pitch, - unsigned char blimit, unsigned char limit, - unsigned char thresh); -typedef void loopfilter_uv_neon(unsigned char *u, int pitch, - unsigned char blimit, unsigned char limit, - unsigned char thresh, unsigned char *v); - -extern loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; -extern loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; - -extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; -extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; -extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; -extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; - /* NEON loopfilter functions */ /* Horizontal MB filtering */ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, diff --git a/libs/libvpx/vp8/common/arm/loopfilter_arm.h b/libs/libvpx/vp8/common/arm/loopfilter_arm.h new file mode 100644 index 00000000000..6cf660d2289 --- /dev/null +++ b/libs/libvpx/vp8/common/arm/loopfilter_arm.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ +#define VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ + +typedef void loopfilter_y_neon(unsigned char *src, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh); +typedef void loopfilter_uv_neon(unsigned char *u, int pitch, + unsigned char blimit, unsigned char limit, + unsigned char thresh, unsigned char *v); + +loopfilter_y_neon vp8_loop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_loop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_loop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_loop_filter_vertical_edge_uv_neon; + +loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; +loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; +loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; +loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; + +#endif // VPX_VP8_COMMON_ARM_LOOPFILTER_ARM_H_ diff --git a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c index 8520ab5ca01..590956dde1d 100644 --- a/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/bilinearpredict_neon.c @@ -10,7 +10,9 @@ #include #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 }, diff --git a/libs/libvpx/vp8/common/arm/neon/copymem_neon.c b/libs/libvpx/vp8/common/arm/neon/copymem_neon.c index c1d293b58d8..c89b47d628b 100644 --- a/libs/libvpx/vp8/common/arm/neon/copymem_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/copymem_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_copy_mem8x4_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) { uint8x8_t vtmp; diff --git a/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c b/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c index 6edff3c69f3..791aaea2ae6 100644 --- a/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/dequantizeb_neon.c @@ -10,6 +10,7 @@ #include +#include "./vp8_rtcd.h" #include "vp8/common/blockd.h" void vp8_dequantize_b_neon(BLOCKD *d, short *DQC) { diff --git a/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c index d61dde86cf5..5c26ce67a43 100644 --- a/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/idct_blk_neon.c @@ -8,15 +8,226 @@ * be found in the AUTHORS file in the root of the source tree. 
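Note: the new loopfilter_arm.h above declares the NEON filters through the loopfilter_y_neon/loopfilter_uv_neon typedefs instead of repeating each prototype with extern. This relies on a C rule worth spelling out: a typedef of a function type can declare functions directly. A standalone illustration:

/* A function-type typedef may declare a function, not only a pointer. */
typedef void filter_fn(unsigned char *src, int pitch);

filter_fn blur_row; /* same as: void blur_row(unsigned char *, int); */

void blur_row(unsigned char *src, int pitch) {
  (void)src; /* the definition must still be spelled out in full */
  (void)pitch;
}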
*/ -#include "vpx_config.h" -#include "vp8_rtcd.h" +#include -/* place these declarations here because we don't want to maintain them - * outside of this scope - */ -void idct_dequant_full_2x_neon(short *q, short *dq, unsigned char *dst, - int stride); -void idct_dequant_0_2x_neon(short *q, short dq, unsigned char *dst, int stride); +#include "./vp8_rtcd.h" + +static void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, + int stride) { + unsigned char *dst0; + int i, a0, a1; + int16x8x2_t q2Add; + int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); + uint8x8_t d2u8, d4u8; + uint16x8_t q1u16, q2u16; + + a0 = ((q[0] * dq) + 4) >> 3; + a1 = ((q[16] * dq) + 4) >> 3; + q[0] = q[16] = 0; + q2Add.val[0] = vdupq_n_s16((int16_t)a0); + q2Add.val[1] = vdupq_n_s16((int16_t)a1); + + for (i = 0; i < 2; i++, dst += 4) { + dst0 = dst; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); + dst0 += stride; + d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); + dst0 += stride; + d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); + + q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d2s32)); + q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), + vreinterpret_u8_s32(d4s32)); + + d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); + d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); + + d2s32 = vreinterpret_s32_u8(d2u8); + d4s32 = vreinterpret_s32_u8(d4u8); + + dst0 = dst; + vst1_lane_s32((int32_t *)dst0, d2s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d2s32, 1); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst0, d4s32, 1); + } +} + +static const int16_t cospi8sqrt2minus1 = 20091; +static const int16_t sinpi8sqrt2 = 17734; +// because the lowest bit in 0x8a8c is 0, we can pre-shift this + +static void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, + unsigned char *dst, int stride) { + unsigned char *dst0, *dst1; + int32x2_t d28, d29, d30, d31; + int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; + int16x8_t qEmpty = vdupq_n_s16(0); + int32x4x2_t q2tmp0, q2tmp1; + int16x8x2_t q2tmp2, q2tmp3; + int16x4_t dLow0, dLow1, dHigh0, dHigh1; + + d28 = d29 = d30 = d31 = vdup_n_s32(0); + + // load dq + q0 = vld1q_s16(dq); + dq += 8; + q1 = vld1q_s16(dq); + + // load q + q2 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q3 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q4 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + q += 8; + q5 = vld1q_s16(q); + vst1q_s16(q, qEmpty); + + // load src from dst + dst0 = dst; + dst1 = dst + 4; + d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); + dst0 += stride; + d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); + dst1 += stride; + d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); + dst0 += stride; + d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); + dst1 += stride; + + d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); + dst0 += stride; + d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); + dst1 += stride; + d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); + d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); + + q2 = vmulq_s16(q2, q0); + q3 = vmulq_s16(q3, q1); + q4 = vmulq_s16(q4, q0); + q5 = vmulq_s16(q5, q1); + + // vswp + dLow0 = vget_low_s16(q2); + dHigh0 = vget_high_s16(q2); + dLow1 = vget_low_s16(q4); + dHigh1 = vget_high_s16(q4); + q2 = vcombine_s16(dLow0, dLow1); + q4 = vcombine_s16(dHigh0, dHigh1); + + dLow0 = vget_low_s16(q3); + dHigh0 
= vget_high_s16(q3); + dLow1 = vget_low_s16(q5); + dHigh1 = vget_high_s16(q5); + q3 = vcombine_s16(dLow0, dLow1); + q5 = vcombine_s16(dHigh0, dHigh1); + + q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); + q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); + q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); + q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); + + q10 = vqaddq_s16(q2, q3); + q11 = vqsubq_s16(q2, q3); + + q8 = vshrq_n_s16(q8, 1); + q9 = vshrq_n_s16(q9, 1); + + q4 = vqaddq_s16(q4, q8); + q5 = vqaddq_s16(q5, q9); + + q2 = vqsubq_s16(q6, q5); + q3 = vqaddq_s16(q7, q4); + + q4 = vqaddq_s16(q10, q3); + q5 = vqaddq_s16(q11, q2); + q6 = vqsubq_s16(q11, q2); + q7 = vqsubq_s16(q10, q3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + // loop 2 + q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); + q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); + q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); + q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); + + q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); + q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); + + q10 = vshrq_n_s16(q10, 1); + q11 = vshrq_n_s16(q11, 1); + + q10 = vqaddq_s16(q2tmp2.val[1], q10); + q11 = vqaddq_s16(q2tmp3.val[1], q11); + + q8 = vqsubq_s16(q8, q11); + q9 = vqaddq_s16(q9, q10); + + q4 = vqaddq_s16(q2, q9); + q5 = vqaddq_s16(q3, q8); + q6 = vqsubq_s16(q3, q8); + q7 = vqsubq_s16(q2, q9); + + q4 = vrshrq_n_s16(q4, 3); + q5 = vrshrq_n_s16(q5, 3); + q6 = vrshrq_n_s16(q6, 3); + q7 = vrshrq_n_s16(q7, 3); + + q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); + q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); + q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), + vreinterpretq_s16_s32(q2tmp1.val[0])); + q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), + vreinterpretq_s16_s32(q2tmp1.val[1])); + + q4 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); + q5 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); + q6 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); + q7 = vreinterpretq_s16_u16( + vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); + + d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); + d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); + d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); + d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); + + dst0 = dst; + dst1 = dst + 4; + vst1_lane_s32((int32_t *)dst0, d28, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d28, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d29, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d29, 1); + dst1 += stride; + + vst1_lane_s32((int32_t *)dst0, d30, 0); + dst0 += stride; + vst1_lane_s32((int32_t *)dst1, d30, 1); + dst1 += stride; + vst1_lane_s32((int32_t *)dst0, d31, 0); + vst1_lane_s32((int32_t *)dst1, d31, 1); +} void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs) { @@ -43,42 +254,42 @@ void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int 
stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { if (((short *)(eobs))[0]) { if (((short *)eobs)[0] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstu, stride); + idct_dequant_full_2x_neon(q, dq, dst_u, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstu, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); } q += 32; - dstu += 4 * stride; + dst_u += 4 * stride; if (((short *)(eobs))[1]) { if (((short *)eobs)[1] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstu, stride); + idct_dequant_full_2x_neon(q, dq, dst_u, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstu, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_u, stride); } q += 32; if (((short *)(eobs))[2]) { if (((short *)eobs)[2] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstv, stride); + idct_dequant_full_2x_neon(q, dq, dst_v, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstv, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); } q += 32; - dstv += 4 * stride; + dst_v += 4 * stride; if (((short *)(eobs))[3]) { if (((short *)eobs)[3] & 0xfefe) - idct_dequant_full_2x_neon(q, dq, dstv, stride); + idct_dequant_full_2x_neon(q, dq, dst_v, stride); else - idct_dequant_0_2x_neon(q, dq[0], dstv, stride); + idct_dequant_0_2x_neon(q, dq[0], dst_v, stride); } } diff --git a/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c deleted file mode 100644 index c83102a5cc6..00000000000 --- a/libs/libvpx/vp8/common/arm/neon/idct_dequant_0_2x_neon.c +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
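Note on the kernels relocated above (and the standalone files deleted below): idct_dequant_0_2x_neon() covers pairs of 4x4 blocks whose only nonzero coefficient is the DC term (q[0] and q[16]), and the eobs test `& 0xfefe` clears bit 0 of each packed per-block end-of-block byte, so it is nonzero exactly when some block has eob > 1 and needs the full transform. A scalar model of the DC-only path for a single 4x4 block (illustrative sketch, not part of the patch):

/* Scalar model: with only the DC coefficient set, the 4x4 inverse
 * transform collapses to one rounded value added to every pixel,
 * saturated to 8 bits, as the NEON version does with vqmovun. */
static void idct_dequant_0_sketch(short *q, short dq, unsigned char *dst,
                                  int stride) {
  const int a = ((q[0] * dq) + 4) >> 3; /* dequantize, round, shift */
  int r, c;
  q[0] = 0; /* the NEON version clears the coefficient too */
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = dst[r * stride + c] + a;
      dst[r * stride + c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}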
- */ - -#include <arm_neon.h> - -void idct_dequant_0_2x_neon(int16_t *q, int16_t dq, unsigned char *dst, - int stride) { - unsigned char *dst0; - int i, a0, a1; - int16x8x2_t q2Add; - int32x2_t d2s32 = vdup_n_s32(0), d4s32 = vdup_n_s32(0); - uint8x8_t d2u8, d4u8; - uint16x8_t q1u16, q2u16; - - a0 = ((q[0] * dq) + 4) >> 3; - a1 = ((q[16] * dq) + 4) >> 3; - q[0] = q[16] = 0; - q2Add.val[0] = vdupq_n_s16((int16_t)a0); - q2Add.val[1] = vdupq_n_s16((int16_t)a1); - - for (i = 0; i < 2; i++, dst += 4) { - dst0 = dst; - d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); - dst0 += stride; - d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); - dst0 += stride; - d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); - dst0 += stride; - d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); - - q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), - vreinterpret_u8_s32(d2s32)); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), - vreinterpret_u8_s32(d4s32)); - - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); - d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - - d2s32 = vreinterpret_s32_u8(d2u8); - d4s32 = vreinterpret_s32_u8(d4u8); - - dst0 = dst; - vst1_lane_s32((int32_t *)dst0, d2s32, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d2s32, 1); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d4s32, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst0, d4s32, 1); - } - return; -} diff --git a/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c b/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c deleted file mode 100644 index f30671cc3f1..00000000000 --- a/libs/libvpx/vp8/common/arm/neon/idct_dequant_full_2x_neon.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree.
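Note: the constants in the file deleted here (re-added above as statics) are Q16 fixed-point values: sin(pi/8)*sqrt(2) = 0.541196 scales to 35468 = 0x8a8c, and cos(pi/8)*sqrt(2) - 1 = 0.306563 scales to 20091. vqdmulhq_n_s16(a, k) returns the high half of 2*a*k, so 35468, which does not fit in int16_t, is stored pre-halved as 17734 (that is what the "lowest bit in 0x8a8c is 0" comment means), while the 20091 products are compensated by the explicit vshrq_n_s16(.., 1). A quick arithmetic check (illustrative):

#include <assert.h>
#include <math.h>

void check_idct_constants(void) {
  const double pi = 3.14159265358979323846;
  assert((int)(sin(pi / 8) * sqrt(2.0) * 65536 + 0.5) == 35468);
  assert((int)((cos(pi / 8) * sqrt(2.0) - 1.0) * 65536 + 0.5) == 20091);
  assert(35468 / 2 == 17734 && (0x8a8c & 1) == 0);
}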
- */ - -#include - -static const int16_t cospi8sqrt2minus1 = 20091; -static const int16_t sinpi8sqrt2 = 17734; -// because the lowest bit in 0x8a8c is 0, we can pre-shift this - -void idct_dequant_full_2x_neon(int16_t *q, int16_t *dq, unsigned char *dst, - int stride) { - unsigned char *dst0, *dst1; - int32x2_t d28, d29, d30, d31; - int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; - int16x8_t qEmpty = vdupq_n_s16(0); - int32x4x2_t q2tmp0, q2tmp1; - int16x8x2_t q2tmp2, q2tmp3; - int16x4_t dLow0, dLow1, dHigh0, dHigh1; - - d28 = d29 = d30 = d31 = vdup_n_s32(0); - - // load dq - q0 = vld1q_s16(dq); - dq += 8; - q1 = vld1q_s16(dq); - - // load q - q2 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q3 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q4 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - q += 8; - q5 = vld1q_s16(q); - vst1q_s16(q, qEmpty); - - // load src from dst - dst0 = dst; - dst1 = dst + 4; - d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); - dst0 += stride; - d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); - dst1 += stride; - d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); - dst0 += stride; - d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); - dst1 += stride; - - d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); - dst0 += stride; - d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); - dst1 += stride; - d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); - d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); - - q2 = vmulq_s16(q2, q0); - q3 = vmulq_s16(q3, q1); - q4 = vmulq_s16(q4, q0); - q5 = vmulq_s16(q5, q1); - - // vswp - dLow0 = vget_low_s16(q2); - dHigh0 = vget_high_s16(q2); - dLow1 = vget_low_s16(q4); - dHigh1 = vget_high_s16(q4); - q2 = vcombine_s16(dLow0, dLow1); - q4 = vcombine_s16(dHigh0, dHigh1); - - dLow0 = vget_low_s16(q3); - dHigh0 = vget_high_s16(q3); - dLow1 = vget_low_s16(q5); - dHigh1 = vget_high_s16(q5); - q3 = vcombine_s16(dLow0, dLow1); - q5 = vcombine_s16(dHigh0, dHigh1); - - q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); - q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); - q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); - q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); - - q10 = vqaddq_s16(q2, q3); - q11 = vqsubq_s16(q2, q3); - - q8 = vshrq_n_s16(q8, 1); - q9 = vshrq_n_s16(q9, 1); - - q4 = vqaddq_s16(q4, q8); - q5 = vqaddq_s16(q5, q9); - - q2 = vqsubq_s16(q6, q5); - q3 = vqaddq_s16(q7, q4); - - q4 = vqaddq_s16(q10, q3); - q5 = vqaddq_s16(q11, q2); - q6 = vqsubq_s16(q11, q2); - q7 = vqsubq_s16(q10, q3); - - q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); - q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); - q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), - vreinterpretq_s16_s32(q2tmp1.val[0])); - q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), - vreinterpretq_s16_s32(q2tmp1.val[1])); - - // loop 2 - q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); - q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); - q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); - q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); - - q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); - q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); - - q10 = vshrq_n_s16(q10, 1); - q11 = vshrq_n_s16(q11, 1); - - q10 = vqaddq_s16(q2tmp2.val[1], q10); - q11 = vqaddq_s16(q2tmp3.val[1], q11); - - q8 = vqsubq_s16(q8, q11); - q9 = vqaddq_s16(q9, q10); - - q4 = vqaddq_s16(q2, q9); - q5 = vqaddq_s16(q3, q8); - q6 = vqsubq_s16(q3, q8); - q7 = vqsubq_s16(q2, q9); - - q4 = vrshrq_n_s16(q4, 3); - q5 = vrshrq_n_s16(q5, 3); - q6 = 
vrshrq_n_s16(q6, 3); - q7 = vrshrq_n_s16(q7, 3); - - q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); - q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); - q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), - vreinterpretq_s16_s32(q2tmp1.val[0])); - q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), - vreinterpretq_s16_s32(q2tmp1.val[1])); - - q4 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); - q5 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); - q6 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); - q7 = vreinterpretq_s16_u16( - vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); - - d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); - d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); - d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); - d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); - - dst0 = dst; - dst1 = dst + 4; - vst1_lane_s32((int32_t *)dst0, d28, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d28, 1); - dst1 += stride; - vst1_lane_s32((int32_t *)dst0, d29, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d29, 1); - dst1 += stride; - - vst1_lane_s32((int32_t *)dst0, d30, 0); - dst0 += stride; - vst1_lane_s32((int32_t *)dst1, d30, 1); - dst1 += stride; - vst1_lane_s32((int32_t *)dst0, d31, 0); - vst1_lane_s32((int32_t *)dst1, d31, 1); - return; -} diff --git a/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c b/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c index 6c4bcc134b2..91600bfc005 100644 --- a/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/iwalsh_neon.c @@ -10,6 +10,8 @@ #include +#include "./vp8_rtcd.h" + void vp8_short_inv_walsh4x4_neon(int16_t *input, int16_t *mb_dqcoeff) { int16x8_t q0s16, q1s16, q2s16, q3s16; int16x4_t d4s16, d5s16, d6s16, d7s16; diff --git a/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c b/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c index a1682197057..df983b23a3d 100644 --- a/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( unsigned char *s, int p, const unsigned char *blimit) { diff --git a/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c index 80a222d2480..fbc83ae290d 100644 --- a/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx_ports/arm.h" #ifdef VPX_INCOMPATIBLE_GCC diff --git a/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c b/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c index 65eec300ff3..fafaf2d451c 100644 --- a/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/mbloopfilter_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" static INLINE void vp8_mbloop_filter_neon(uint8x16_t qblimit, // mblimit uint8x16_t qlimit, // limit diff --git a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c 
b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c index aa2567df792..48e86d32781 100644 --- a/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/sixtappredict_neon.c @@ -11,6 +11,7 @@ #include #include #include "./vpx_config.h" +#include "./vp8_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_ports/mem.h" diff --git a/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c b/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c index d7286739dad..ebc004a0487 100644 --- a/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c +++ b/libs/libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c @@ -9,7 +9,9 @@ */ #include + #include "./vpx_config.h" +#include "vp8/common/arm/loopfilter_arm.h" #include "vpx_ports/arm.h" static INLINE void vp8_loop_filter_neon(uint8x16_t qblimit, // flimit diff --git a/libs/libvpx/vp8/common/blockd.c b/libs/libvpx/vp8/common/blockd.c index f47c5bae158..22905c10a6f 100644 --- a/libs/libvpx/vp8/common/blockd.c +++ b/libs/libvpx/vp8/common/blockd.c @@ -11,9 +11,9 @@ #include "blockd.h" #include "vpx_mem/vpx_mem.h" -const unsigned char vp8_block2left[25] = { - 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; -const unsigned char vp8_block2above[25] = { - 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8 -}; +const unsigned char vp8_block2left[25] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, + 2, 2, 2, 3, 3, 3, 3, 4, 4, + 5, 5, 6, 6, 7, 7, 8 }; +const unsigned char vp8_block2above[25] = { 0, 1, 2, 3, 0, 1, 2, 3, 0, + 1, 2, 3, 0, 1, 2, 3, 4, 5, + 4, 5, 6, 7, 6, 7, 8 }; diff --git a/libs/libvpx/vp8/common/blockd.h b/libs/libvpx/vp8/common/blockd.h index 1a3aad16afd..f8d1539739b 100644 --- a/libs/libvpx/vp8/common/blockd.h +++ b/libs/libvpx/vp8/common/blockd.h @@ -8,11 +8,12 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_BLOCKD_H_ -#define VP8_COMMON_BLOCKD_H_ +#ifndef VPX_VP8_COMMON_BLOCKD_H_ +#define VPX_VP8_COMMON_BLOCKD_H_ void vpx_log(const char *format, ...); +#include "vpx/internal/vpx_codec_internal.h" #include "vpx_config.h" #include "vpx_scale/yv12config.h" #include "mv.h" @@ -37,7 +38,9 @@ extern "C" { #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 -typedef struct { int r, c; } POS; +typedef struct { + int r, c; +} POS; #define PLANE_TYPE_Y_NO_DC 0 #define PLANE_TYPE_Y2 1 @@ -180,6 +183,9 @@ typedef struct { unsigned int low_res_ref_frames[MAX_REF_FRAMES]; // The video frame counter value for the key frame, for lowest resolution. unsigned int key_frame_counter_value; + // Flags to signal skipped encoding of previous and base layer stream. + unsigned int skip_encoding_prev_stream; + unsigned int skip_encoding_base_stream; LOWER_RES_MB_INFO *mb_info; } LOWER_RES_FRAME_INFO; #endif @@ -196,8 +202,9 @@ typedef struct blockd { union b_mode_info bmi; } BLOCKD; -typedef void (*vp8_subpix_fn_t)(unsigned char *src, int src_pitch, int xofst, - int yofst, unsigned char *dst, int dst_pitch); +typedef void (*vp8_subpix_fn_t)(unsigned char *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, + unsigned char *dst_ptr, int dst_pitch); typedef struct macroblockd { DECLARE_ALIGNED(16, unsigned char, predictor[384]); @@ -283,6 +290,8 @@ typedef struct macroblockd { int corrupted; + struct vpx_internal_error_info error_info; + #if ARCH_X86 || ARCH_X86_64 /* This is an intermediate buffer currently used in sub-pixel motion search * to keep a copy of the reference area. 
This buffer can be used for other @@ -299,4 +308,4 @@ extern void vp8_setup_block_dptrs(MACROBLOCKD *x); } // extern "C" #endif -#endif // VP8_COMMON_BLOCKD_H_ +#endif // VPX_VP8_COMMON_BLOCKD_H_ diff --git a/libs/libvpx/vp8/common/coefupdateprobs.h b/libs/libvpx/vp8/common/coefupdateprobs.h index 9b01bba312d..b342096b552 100644 --- a/libs/libvpx/vp8/common/coefupdateprobs.h +++ b/libs/libvpx/vp8/common/coefupdateprobs.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_COEFUPDATEPROBS_H_ -#define VP8_COMMON_COEFUPDATEPROBS_H_ +#ifndef VPX_VP8_COMMON_COEFUPDATEPROBS_H_ +#define VPX_VP8_COMMON_COEFUPDATEPROBS_H_ #ifdef __cplusplus extern "C" { @@ -194,4 +194,4 @@ const vp8_prob vp8_coef_update_probs } // extern "C" #endif -#endif // VP8_COMMON_COEFUPDATEPROBS_H_ +#endif // VPX_VP8_COMMON_COEFUPDATEPROBS_H_ diff --git a/libs/libvpx/vp8/common/common.h b/libs/libvpx/vp8/common/common.h index bbfc4f39348..2c30e8d6c51 100644 --- a/libs/libvpx/vp8/common/common.h +++ b/libs/libvpx/vp8/common/common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_COMMON_H_ -#define VP8_COMMON_COMMON_H_ +#ifndef VPX_VP8_COMMON_COMMON_H_ +#define VPX_VP8_COMMON_COMMON_H_ #include @@ -31,18 +31,18 @@ extern "C" { /* Use this for variably-sized arrays. */ -#define vp8_copy_array(Dest, Src, N) \ - { \ - assert(sizeof(*Dest) == sizeof(*Src)); \ - memcpy(Dest, Src, N * sizeof(*Src)); \ +#define vp8_copy_array(Dest, Src, N) \ + { \ + assert(sizeof(*(Dest)) == sizeof(*(Src))); \ + memcpy(Dest, Src, (N) * sizeof(*(Src))); \ } -#define vp8_zero(Dest) memset(&Dest, 0, sizeof(Dest)); +#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest)); -#define vp8_zero_array(Dest, N) memset(Dest, 0, N * sizeof(*Dest)); +#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest))); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_COMMON_H_ +#endif // VPX_VP8_COMMON_COMMON_H_ diff --git a/libs/libvpx/vp8/common/default_coef_probs.h b/libs/libvpx/vp8/common/default_coef_probs.h index 8c861ac8763..b25e4a45a3c 100644 --- a/libs/libvpx/vp8/common/default_coef_probs.h +++ b/libs/libvpx/vp8/common/default_coef_probs.h @@ -6,10 +6,10 @@ * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
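Note: the common.h hunk above is not pure reformatting: wrapping Dest and N in parentheses is what makes the macros safe for expression arguments. A minimal demonstration of the hazard removed (hypothetical names):

#include <string.h>

#define ZERO_ARRAY_UNSAFE(Dest, N) memset(Dest, 0, N * sizeof(*Dest))
#define ZERO_ARRAY_SAFE(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)))

void zero_demo(short *buf, int n) {
  ZERO_ARRAY_UNSAFE(buf, n + 1); /* expands to n + 1 * sizeof(short): wrong */
  ZERO_ARRAY_SAFE(buf, n + 1);   /* expands to (n + 1) * sizeof(short)      */
}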
-*/ + */ -#ifndef VP8_COMMON_DEFAULT_COEF_PROBS_H_ -#define VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#ifndef VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#define VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ #ifdef __cplusplus extern "C" { @@ -157,4 +157,4 @@ static const vp8_prob default_coef_probs } // extern "C" #endif -#endif // VP8_COMMON_DEFAULT_COEF_PROBS_H_ +#endif // VPX_VP8_COMMON_DEFAULT_COEF_PROBS_H_ diff --git a/libs/libvpx/vp8/common/entropy.c b/libs/libvpx/vp8/common/entropy.c index f61fa9e8e4b..fc4a3539fdf 100644 --- a/libs/libvpx/vp8/common/entropy.c +++ b/libs/libvpx/vp8/common/entropy.c @@ -28,9 +28,9 @@ DECLARE_ALIGNED(16, const unsigned char, vp8_norm[256]) = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -DECLARE_ALIGNED(16, const unsigned char, vp8_coef_bands[16]) = { - 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7 -}; +DECLARE_ALIGNED(16, const unsigned char, + vp8_coef_bands[16]) = { 0, 1, 2, 3, 6, 4, 5, 6, + 6, 6, 6, 6, 6, 6, 6, 7 }; DECLARE_ALIGNED(16, const unsigned char, vp8_prev_token_class[MAX_ENTROPY_TOKENS]) = { @@ -41,9 +41,9 @@ DECLARE_ALIGNED(16, const int, vp8_default_zig_zag1d[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15, }; -DECLARE_ALIGNED(16, const short, vp8_default_inv_zig_zag[16]) = { - 1, 2, 6, 7, 3, 5, 8, 13, 4, 9, 12, 14, 10, 11, 15, 16 -}; +DECLARE_ALIGNED(16, const short, + vp8_default_inv_zig_zag[16]) = { 1, 2, 6, 7, 3, 5, 8, 13, + 4, 9, 12, 14, 10, 11, 15, 16 }; /* vp8_default_zig_zag_mask generated with: @@ -129,9 +129,9 @@ static const vp8_tree_index cat2[4] = { 2, 2, 0, 0 }; static const vp8_tree_index cat3[6] = { 2, 2, 4, 4, 0, 0 }; static const vp8_tree_index cat4[8] = { 2, 2, 4, 4, 6, 6, 0, 0 }; static const vp8_tree_index cat5[10] = { 2, 2, 4, 4, 6, 6, 8, 8, 0, 0 }; -static const vp8_tree_index cat6[22] = { - 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 0, 0 -}; +static const vp8_tree_index cat6[22] = { 2, 2, 4, 4, 6, 6, 8, 8, + 10, 10, 12, 12, 14, 14, 16, 16, + 18, 18, 20, 20, 0, 0 }; const vp8_extra_bit_struct vp8_extra_bits[12] = { { 0, 0, 0, 0 }, { 0, 0, 0, 1 }, { 0, 0, 0, 2 }, diff --git a/libs/libvpx/vp8/common/entropy.h b/libs/libvpx/vp8/common/entropy.h index d088560011b..fbdb7bcfca8 100644 --- a/libs/libvpx/vp8/common/entropy.h +++ b/libs/libvpx/vp8/common/entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
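Note: the entropy.c changes above only reflow the tables; the values are unchanged. As a cross-check, vp8_default_inv_zig_zag is the 1-based inverse of vp8_default_zig_zag1d, which a throwaway test can confirm (tables copied from the hunk above):

#include <assert.h>

static void check_zigzag_tables(void) {
  static const int zz[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                              9, 12, 13, 10, 7, 11, 14, 15 };
  static const short inv[16] = { 1, 2, 6,  7,  3,  5,  8,  13,
                                 4, 9, 12, 14, 10, 11, 15, 16 };
  int i;
  for (i = 0; i < 16; ++i) assert(inv[zz[i]] == i + 1);
}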
*/ -#ifndef VP8_COMMON_ENTROPY_H_ -#define VP8_COMMON_ENTROPY_H_ +#ifndef VPX_VP8_COMMON_ENTROPY_H_ +#define VPX_VP8_COMMON_ENTROPY_H_ #include "treecoder.h" #include "blockd.h" @@ -105,4 +105,4 @@ void vp8_coef_tree_initialize(void); } // extern "C" #endif -#endif // VP8_COMMON_ENTROPY_H_ +#endif // VPX_VP8_COMMON_ENTROPY_H_ diff --git a/libs/libvpx/vp8/common/entropymode.c b/libs/libvpx/vp8/common/entropymode.c index 239492a8cb8..f61e0c2e2bc 100644 --- a/libs/libvpx/vp8/common/entropymode.c +++ b/libs/libvpx/vp8/common/entropymode.c @@ -75,9 +75,9 @@ const vp8_tree_index vp8_ymode_tree[8] = { -DC_PRED, 2, 4, 6, -V_PRED, -H_PRED, -TM_PRED, -B_PRED }; -const vp8_tree_index vp8_kf_ymode_tree[8] = { - -B_PRED, 2, 4, 6, -DC_PRED, -V_PRED, -H_PRED, -TM_PRED -}; +const vp8_tree_index vp8_kf_ymode_tree[8] = { -B_PRED, 2, 4, + 6, -DC_PRED, -V_PRED, + -H_PRED, -TM_PRED }; const vp8_tree_index vp8_uv_mode_tree[6] = { -DC_PRED, 2, -V_PRED, 4, -H_PRED, -TM_PRED }; @@ -99,6 +99,6 @@ void vp8_init_mbmode_probs(VP8_COMMON *x) { memcpy(x->fc.sub_mv_ref_prob, sub_mv_ref_prob, sizeof(sub_mv_ref_prob)); } -void vp8_default_bmode_probs(vp8_prob p[VP8_BINTRAMODES - 1]) { - memcpy(p, vp8_bmode_prob, sizeof(vp8_bmode_prob)); +void vp8_default_bmode_probs(vp8_prob dest[VP8_BINTRAMODES - 1]) { + memcpy(dest, vp8_bmode_prob, sizeof(vp8_bmode_prob)); } diff --git a/libs/libvpx/vp8/common/entropymode.h b/libs/libvpx/vp8/common/entropymode.h index b3fad19be0e..c772cece573 100644 --- a/libs/libvpx/vp8/common/entropymode.h +++ b/libs/libvpx/vp8/common/entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPYMODE_H_ -#define VP8_COMMON_ENTROPYMODE_H_ +#ifndef VPX_VP8_COMMON_ENTROPYMODE_H_ +#define VPX_VP8_COMMON_ENTROPYMODE_H_ #include "onyxc_int.h" #include "treecoder.h" @@ -85,4 +85,4 @@ void vp8_kf_default_bmode_probs( } // extern "C" #endif -#endif // VP8_COMMON_ENTROPYMODE_H_ +#endif // VPX_VP8_COMMON_ENTROPYMODE_H_ diff --git a/libs/libvpx/vp8/common/entropymv.h b/libs/libvpx/vp8/common/entropymv.h index 63730009033..40039f5b2c3 100644 --- a/libs/libvpx/vp8/common/entropymv.h +++ b/libs/libvpx/vp8/common/entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ENTROPYMV_H_ -#define VP8_COMMON_ENTROPYMV_H_ +#ifndef VPX_VP8_COMMON_ENTROPYMV_H_ +#define VPX_VP8_COMMON_ENTROPYMV_H_ #include "treecoder.h" @@ -46,4 +46,4 @@ extern const MV_CONTEXT vp8_mv_update_probs[2], vp8_default_mv_context[2]; } // extern "C" #endif -#endif // VP8_COMMON_ENTROPYMV_H_ +#endif // VPX_VP8_COMMON_ENTROPYMV_H_ diff --git a/libs/libvpx/vp8/common/extend.c b/libs/libvpx/vp8/common/extend.c index 2d67b516bef..f4dbce2cd53 100644 --- a/libs/libvpx/vp8/common/extend.c +++ b/libs/libvpx/vp8/common/extend.c @@ -20,8 +20,7 @@ static void copy_and_extend_plane(unsigned char *s, /* source */ int et, /* extend top border */ int el, /* extend left border */ int eb, /* extend bottom border */ - int er /* extend right border */ - ) { + int er) { /* extend right border */ int i; unsigned char *src_ptr1, *src_ptr2; unsigned char *dest_ptr1, *dest_ptr2; diff --git a/libs/libvpx/vp8/common/extend.h b/libs/libvpx/vp8/common/extend.h index 7da5ce31dac..586a38a4f37 100644 --- a/libs/libvpx/vp8/common/extend.h +++ b/libs/libvpx/vp8/common/extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
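Note: the reflowed vp8_kf_ymode_tree above is a flat binary tree in the usual libvpx encoding: a positive entry is the offset of the next node pair, a non-positive entry is a leaf holding the negated token (hence the literal -DC_PRED, -B_PRED and so on). A sketch of the generic reader loop such tables feed (bit source abstracted; see treecoder.h for the real one):

typedef signed char vp8_tree_index;

/* Start at node 0; each decoded bit selects t[i] or t[i + 1]; stop at
 * a non-positive entry and return the negated token. */
static int treed_read_sketch(const vp8_tree_index *t,
                             int (*read_bit)(void *), void *ctx) {
  vp8_tree_index i = 0;
  while ((i = t[i + read_bit(ctx)]) > 0) {
  }
  return -i;
}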
*/ -#ifndef VP8_COMMON_EXTEND_H_ -#define VP8_COMMON_EXTEND_H_ +#ifndef VPX_VP8_COMMON_EXTEND_H_ +#define VPX_VP8_COMMON_EXTEND_H_ #include "vpx_scale/yv12config.h" @@ -29,4 +29,4 @@ void vp8_copy_and_extend_frame_with_rect(YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // VP8_COMMON_EXTEND_H_ +#endif // VPX_VP8_COMMON_EXTEND_H_ diff --git a/libs/libvpx/vp8/common/filter.h b/libs/libvpx/vp8/common/filter.h index f1d5ece4a5f..6acee22b21b 100644 --- a/libs/libvpx/vp8/common/filter.h +++ b/libs/libvpx/vp8/common/filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_FILTER_H_ -#define VP8_COMMON_FILTER_H_ +#ifndef VPX_VP8_COMMON_FILTER_H_ +#define VPX_VP8_COMMON_FILTER_H_ #include "vpx_ports/mem.h" @@ -28,4 +28,4 @@ extern DECLARE_ALIGNED(16, const short, vp8_sub_pel_filters[8][6]); } // extern "C" #endif -#endif // VP8_COMMON_FILTER_H_ +#endif // VPX_VP8_COMMON_FILTER_H_ diff --git a/libs/libvpx/vp8/common/findnearmv.c b/libs/libvpx/vp8/common/findnearmv.c index f40d2c6bde5..6889fdeddef 100644 --- a/libs/libvpx/vp8/common/findnearmv.c +++ b/libs/libvpx/vp8/common/findnearmv.c @@ -21,19 +21,20 @@ const unsigned char vp8_mbsplit_offset[4][16] = { Note that we only consider one 4x4 subblock from each candidate 16x16 macroblock. */ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, - int_mv *nearby, int_mv *best_mv, int cnt[4], + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], int refframe, int *ref_frame_sign_bias) { const MODE_INFO *above = here - xd->mode_info_stride; const MODE_INFO *left = here - 1; const MODE_INFO *aboveleft = above - 1; int_mv near_mvs[4]; int_mv *mv = near_mvs; - int *cntx = cnt; + int *cntx = near_mv_ref_cnts; enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; /* Zero accumulators */ mv[0].as_int = mv[1].as_int = mv[2].as_int = 0; - cnt[0] = cnt[1] = cnt[2] = cnt[3] = 0; + near_mv_ref_cnts[0] = near_mv_ref_cnts[1] = near_mv_ref_cnts[2] = + near_mv_ref_cnts[3] = 0; /* Process above */ if (above->mbmi.ref_frame != INTRA_FRAME) { @@ -63,7 +64,7 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, *cntx += 2; } else { - cnt[CNT_INTRA] += 2; + near_mv_ref_cnts[CNT_INTRA] += 2; } } @@ -83,33 +84,34 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, *cntx += 1; } else { - cnt[CNT_INTRA] += 1; + near_mv_ref_cnts[CNT_INTRA] += 1; } } /* If we have three distinct MV's ... 
*/ - if (cnt[CNT_SPLITMV]) { + if (near_mv_ref_cnts[CNT_SPLITMV]) { /* See if above-left MV can be merged with NEAREST */ - if (mv->as_int == near_mvs[CNT_NEAREST].as_int) cnt[CNT_NEAREST] += 1; + if (mv->as_int == near_mvs[CNT_NEAREST].as_int) + near_mv_ref_cnts[CNT_NEAREST] += 1; } - cnt[CNT_SPLITMV] = + near_mv_ref_cnts[CNT_SPLITMV] = ((above->mbmi.mode == SPLITMV) + (left->mbmi.mode == SPLITMV)) * 2 + (aboveleft->mbmi.mode == SPLITMV); /* Swap near and nearest if necessary */ - if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) { + if (near_mv_ref_cnts[CNT_NEAR] > near_mv_ref_cnts[CNT_NEAREST]) { int tmp; - tmp = cnt[CNT_NEAREST]; - cnt[CNT_NEAREST] = cnt[CNT_NEAR]; - cnt[CNT_NEAR] = tmp; + tmp = near_mv_ref_cnts[CNT_NEAREST]; + near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR]; + near_mv_ref_cnts[CNT_NEAR] = tmp; tmp = near_mvs[CNT_NEAREST].as_int; near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int; near_mvs[CNT_NEAR].as_int = tmp; } /* Use near_mvs[0] to store the "best" MV */ - if (cnt[CNT_NEAREST] >= cnt[CNT_INTRA]) { + if (near_mv_ref_cnts[CNT_NEAREST] >= near_mv_ref_cnts[CNT_INTRA]) { near_mvs[CNT_INTRA] = near_mvs[CNT_NEAREST]; } diff --git a/libs/libvpx/vp8/common/findnearmv.h b/libs/libvpx/vp8/common/findnearmv.h index c1eaa269800..d7db9544aa3 100644 --- a/libs/libvpx/vp8/common/findnearmv.h +++ b/libs/libvpx/vp8/common/findnearmv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_FINDNEARMV_H_ -#define VP8_COMMON_FINDNEARMV_H_ +#ifndef VPX_VP8_COMMON_FINDNEARMV_H_ +#define VPX_VP8_COMMON_FINDNEARMV_H_ #include "./vpx_config.h" #include "mv.h" @@ -70,7 +70,7 @@ static INLINE unsigned int vp8_check_mv_bounds(int_mv *mv, int mb_to_left_edge, } void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest, - int_mv *nearby, int_mv *best, int near_mv_ref_cts[4], + int_mv *nearby, int_mv *best_mv, int near_mv_ref_cnts[4], int refframe, int *ref_frame_sign_bias); int vp8_find_near_mvs_bias(MACROBLOCKD *xd, const MODE_INFO *here, @@ -148,4 +148,4 @@ static INLINE B_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, int b, } // extern "C" #endif -#endif // VP8_COMMON_FINDNEARMV_H_ +#endif // VPX_VP8_COMMON_FINDNEARMV_H_ diff --git a/libs/libvpx/vp8/common/header.h b/libs/libvpx/vp8/common/header.h index 1df01fc6fa5..e64e2419085 100644 --- a/libs/libvpx/vp8/common/header.h +++ b/libs/libvpx/vp8/common/header.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_HEADER_H_ -#define VP8_COMMON_HEADER_H_ +#ifndef VPX_VP8_COMMON_HEADER_H_ +#define VPX_VP8_COMMON_HEADER_H_ #ifdef __cplusplus extern "C" { @@ -45,4 +45,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_COMMON_HEADER_H_ +#endif // VPX_VP8_COMMON_HEADER_H_ diff --git a/libs/libvpx/vp8/common/idct_blk.c b/libs/libvpx/vp8/common/idct_blk.c index ff9f3eb7f21..ebe1774f56e 100644 --- a/libs/libvpx/vp8/common/idct_blk.c +++ b/libs/libvpx/vp8/common/idct_blk.c @@ -12,12 +12,6 @@ #include "vp8_rtcd.h" #include "vpx_mem/vpx_mem.h" -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, - int stride); -void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred, - int pred_stride, unsigned char *dst_ptr, - int dst_stride); - void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs) { int i, j; @@ -39,40 +33,40 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, } } -void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dstu, - unsigned char *dstv, int stride, +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) { - vp8_dequant_idct_add_c(q, dq, dstu, stride); + vp8_dequant_idct_add_c(q, dq, dst_u, stride); } else { - vp8_dc_only_idct_add_c(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_u, stride, dst_u, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) { - vp8_dequant_idct_add_c(q, dq, dstv, stride); + vp8_dequant_idct_add_c(q, dq, dst_v, stride); } else { - vp8_dc_only_idct_add_c(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_c(q[0] * dq[0], dst_v, stride, dst_v, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/libs/libvpx/vp8/common/invtrans.h b/libs/libvpx/vp8/common/invtrans.h index c7af32fb675..aed7bb0600b 100644 --- a/libs/libvpx/vp8/common/invtrans.h +++ b/libs/libvpx/vp8/common/invtrans.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_INVTRANS_H_ -#define VP8_COMMON_INVTRANS_H_ +#ifndef VPX_VP8_COMMON_INVTRANS_H_ +#define VPX_VP8_COMMON_INVTRANS_H_ #include "./vpx_config.h" #include "vp8_rtcd.h" @@ -54,4 +54,4 @@ static INLINE void vp8_inverse_transform_mby(MACROBLOCKD *xd) { } // extern "C" #endif -#endif // VP8_COMMON_INVTRANS_H_ +#endif // VPX_VP8_COMMON_INVTRANS_H_ diff --git a/libs/libvpx/vp8/common/loopfilter.h b/libs/libvpx/vp8/common/loopfilter.h index 7484563e06a..0733046e5af 100644 --- a/libs/libvpx/vp8/common/loopfilter.h +++ b/libs/libvpx/vp8/common/loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
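Note: the prototypes deleted from idct_blk.c above were duplicates; vp8_rtcd.h, included at the top of that file, is the generated single source of truth for these signatures. Very roughly, the generated header provides something like the following per function (a sketch of the pattern only, not the actual generated file):

/* Sketch: per-arch declarations plus a dispatch alias. */
void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest,
                            int stride);
#if HAVE_NEON
void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *dest,
                               int stride);
#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
#else
#define vp8_dequant_idct_add vp8_dequant_idct_add_c
#endif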
*/ -#ifndef VP8_COMMON_LOOPFILTER_H_ -#define VP8_COMMON_LOOPFILTER_H_ +#ifndef VPX_VP8_COMMON_LOOPFILTER_H_ +#define VPX_VP8_COMMON_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "vpx_config.h" @@ -100,4 +100,4 @@ void vp8_loop_filter_row_simple(struct VP8Common *cm, } // extern "C" #endif -#endif // VP8_COMMON_LOOPFILTER_H_ +#endif // VPX_VP8_COMMON_LOOPFILTER_H_ diff --git a/libs/libvpx/vp8/common/loopfilter_filters.c b/libs/libvpx/vp8/common/loopfilter_filters.c index 188e290ca7f..61a55d3c92f 100644 --- a/libs/libvpx/vp8/common/loopfilter_filters.c +++ b/libs/libvpx/vp8/common/loopfilter_filters.c @@ -270,28 +270,32 @@ static void vp8_simple_filter(signed char mask, uc *op1, uc *op0, uc *oq0, *op0 = u ^ 0x80; } -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *s, int p, +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, + int y_stride, const unsigned char *blimit) { signed char mask = 0; int i = 0; do { - mask = vp8_simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], s[0 * p], - s[1 * p]); - vp8_simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2 * y_stride], + y_ptr[-1 * y_stride], y_ptr[0 * y_stride], + y_ptr[1 * y_stride]); + vp8_simple_filter(mask, y_ptr - 2 * y_stride, y_ptr - 1 * y_stride, y_ptr, + y_ptr + 1 * y_stride); + ++y_ptr; } while (++i < 16); } -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit) { signed char mask = 0; int i = 0; do { - mask = vp8_simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - vp8_simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; + mask = vp8_simple_filter_mask(blimit[0], y_ptr[-2], y_ptr[-1], y_ptr[0], + y_ptr[1]); + vp8_simple_filter(mask, y_ptr - 2, y_ptr - 1, y_ptr, y_ptr + 1); + y_ptr += y_stride; } while (++i < 16); } diff --git a/libs/libvpx/vp8/common/mfqe.c b/libs/libvpx/vp8/common/mfqe.c index b6f8146b844..1fe7363f177 100644 --- a/libs/libvpx/vp8/common/mfqe.c +++ b/libs/libvpx/vp8/common/mfqe.c @@ -18,6 +18,7 @@ #include "./vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "vp8/common/common.h" #include "vp8/common/postproc.h" #include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" @@ -211,6 +212,7 @@ static int qualify_inter_mb(const MODE_INFO *mode_info_context, int *map) { { 0, 1, 4, 5 }, { 2, 3, 6, 7 }, { 8, 9, 12, 13 }, { 10, 11, 14, 15 } }; int i, j; + vp8_zero(*map); for (i = 0; i < 4; ++i) { map[i] = 1; for (j = 0; j < 4 && map[j]; ++j) { @@ -233,7 +235,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { FRAME_TYPE frame_type = cm->frame_type; /* Point at base of Mb MODE_INFO list has motion vectors etc */ - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mb_row; int mb_col; int totmap, map[4]; diff --git a/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c index 899dc10ad96..eae852d5928 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c @@ -35,41 +35,41 @@ void vp8_dequant_idct_add_y_block_dspr2(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_dspr2(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) - 
vp8_dequant_idct_add_dspr2(q, dq, dstu, stride); + vp8_dequant_idct_add_dspr2(q, dq, dst_u, stride); else { - vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_u, stride, dst_u, stride); ((int *)q)[0] = 0; } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; ++i) { for (j = 0; j < 2; ++j) { if (*eobs++ > 1) - vp8_dequant_idct_add_dspr2(q, dq, dstv, stride); + vp8_dequant_idct_add_dspr2(q, dq, dst_v, stride); else { - vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_dspr2(q[0] * dq[0], dst_v, stride, dst_v, stride); ((int *)q)[0] = 0; } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index d2c34425156..21446fb4132 100644 --- a/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/libs/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -934,8 +934,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, s4 = s3 + p; /* load quad-byte vectors - * memory is 4 byte aligned - */ + * memory is 4 byte aligned + */ p2 = *((uint32_t *)(s1 - 4)); p6 = *((uint32_t *)(s1)); p1 = *((uint32_t *)(s2 - 4)); @@ -990,8 +990,8 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, :); /* if (p1 - p4 == 0) and (p2 - p3 == 0) - * mask will be zero and filtering is not needed - */ + * mask will be zero and filtering is not needed + */ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) { vp8_filter_mask_vec_mips(limit, flimit, p1, p2, pm1, p0, p3, p4, p5, p6, thresh, &hev, &mask); @@ -2102,8 +2102,8 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, s4 = s3 + p; /* load quad-byte vectors - * memory is 4 byte aligned - */ + * memory is 4 byte aligned + */ p2 = *((uint32_t *)(s1 - 4)); p6 = *((uint32_t *)(s1)); p1 = *((uint32_t *)(s2 - 4)); diff --git a/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c index f6020ab4688..4fd6854c528 100644 --- a/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c +++ b/libs/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -12,7 +12,7 @@ #include "vpx_mem/vpx_mem.h" void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, - int stride, int8_t *eobs) { + int stride, char *eobs) { int i, j; for (i = 0; i < 4; i++) { @@ -32,40 +32,39 @@ void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, } } -void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int stride, - int8_t *eobs) { +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int stride, char *eobs) { int i, j; for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (*eobs++ > 1) { - vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + vp8_dequant_idct_add_mmi(q, dq, dst_u, stride); } else { - vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_u, stride, dst_u, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstu += 4; + dst_u += 4; } - dstu += 4 * stride - 8; + dst_u += 4 * stride - 8; } for (i = 0; i < 2; i++) { for (j = 0; j < 2; j++) { if (*eobs++ > 1) { - vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + vp8_dequant_idct_add_mmi(q, dq, dst_v, stride); } else { - 
vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst_v, stride, dst_v, stride); memset(q, 0, 2 * sizeof(q[0])); } q += 16; - dstv += 4; + dst_v += 4; } - dstv += 4 * stride - 8; + dst_v += 4 * stride - 8; } } diff --git a/libs/libvpx/vp8/common/mips/msa/idct_msa.c b/libs/libvpx/vp8/common/mips/msa/idct_msa.c index 3d516d0f81a..efad0c29f8a 100644 --- a/libs/libvpx/vp8/common/mips/msa/idct_msa.c +++ b/libs/libvpx/vp8/common/mips/msa/idct_msa.c @@ -134,7 +134,7 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); } -void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { +void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dqcoeff) { v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1; const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; @@ -157,22 +157,22 @@ void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { ADD2(tmp0, 3, tmp1, 3, out0, out1); out0 >>= 3; out1 >>= 3; - mb_dq_coeff[0] = __msa_copy_s_h(out0, 0); - mb_dq_coeff[16] = __msa_copy_s_h(out0, 4); - mb_dq_coeff[32] = __msa_copy_s_h(out1, 0); - mb_dq_coeff[48] = __msa_copy_s_h(out1, 4); - mb_dq_coeff[64] = __msa_copy_s_h(out0, 1); - mb_dq_coeff[80] = __msa_copy_s_h(out0, 5); - mb_dq_coeff[96] = __msa_copy_s_h(out1, 1); - mb_dq_coeff[112] = __msa_copy_s_h(out1, 5); - mb_dq_coeff[128] = __msa_copy_s_h(out0, 2); - mb_dq_coeff[144] = __msa_copy_s_h(out0, 6); - mb_dq_coeff[160] = __msa_copy_s_h(out1, 2); - mb_dq_coeff[176] = __msa_copy_s_h(out1, 6); - mb_dq_coeff[192] = __msa_copy_s_h(out0, 3); - mb_dq_coeff[208] = __msa_copy_s_h(out0, 7); - mb_dq_coeff[224] = __msa_copy_s_h(out1, 3); - mb_dq_coeff[240] = __msa_copy_s_h(out1, 7); + mb_dqcoeff[0] = __msa_copy_s_h(out0, 0); + mb_dqcoeff[16] = __msa_copy_s_h(out0, 4); + mb_dqcoeff[32] = __msa_copy_s_h(out1, 0); + mb_dqcoeff[48] = __msa_copy_s_h(out1, 4); + mb_dqcoeff[64] = __msa_copy_s_h(out0, 1); + mb_dqcoeff[80] = __msa_copy_s_h(out0, 5); + mb_dqcoeff[96] = __msa_copy_s_h(out1, 1); + mb_dqcoeff[112] = __msa_copy_s_h(out1, 5); + mb_dqcoeff[128] = __msa_copy_s_h(out0, 2); + mb_dqcoeff[144] = __msa_copy_s_h(out0, 6); + mb_dqcoeff[160] = __msa_copy_s_h(out1, 2); + mb_dqcoeff[176] = __msa_copy_s_h(out1, 6); + mb_dqcoeff[192] = __msa_copy_s_h(out0, 3); + mb_dqcoeff[208] = __msa_copy_s_h(out0, 7); + mb_dqcoeff[224] = __msa_copy_s_h(out1, 3); + mb_dqcoeff[240] = __msa_copy_s_h(out1, 7); } static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, @@ -359,27 +359,27 @@ void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq, uint8_t *dst, } } -void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu, - uint8_t *dstv, int32_t stride, +void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dst_u, + uint8_t *dst_v, int32_t stride, char *eobs) { int16_t *eobs_h = (int16_t *)eobs; if (eobs_h[0]) { if (eobs_h[0] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstu, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); } } q += 32; - dstu += (stride * 4); + dst_u += (stride * 4); if (eobs_h[1]) { if (eobs_h[1] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_u, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstu, stride); + 
dequant_idct_addconst_2x_msa(q, dq, dst_u, stride); } } @@ -387,20 +387,20 @@ void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq, uint8_t *dstu, if (eobs_h[2]) { if (eobs_h[2] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstv, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); } } q += 32; - dstv += (stride * 4); + dst_v += (stride * 4); if (eobs_h[3]) { if (eobs_h[3] & 0xfefe) { - dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride); + dequant_idct4x4_addblk_2x_msa(q, dq, dst_v, stride); } else { - dequant_idct_addconst_2x_msa(q, dq, dstv, stride); + dequant_idct_addconst_2x_msa(q, dq, dst_v, stride); } } } diff --git a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index 6bec3adec39..14f83799ffd 100644 --- a/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/libs/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ -#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ +#define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ #include @@ -1757,4 +1757,4 @@ \ tmp1_m; \ }) -#endif /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */ +#endif // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ diff --git a/libs/libvpx/vp8/common/modecont.c b/libs/libvpx/vp8/common/modecont.c index d6ad9bb99ae..bab410374f6 100644 --- a/libs/libvpx/vp8/common/modecont.c +++ b/libs/libvpx/vp8/common/modecont.c @@ -11,28 +11,16 @@ #include "entropy.h" const int vp8_mode_contexts[6][4] = { - { - /* 0 */ - 7, 1, 1, 143, - }, - { - /* 1 */ - 14, 18, 14, 107, - }, - { - /* 2 */ - 135, 64, 57, 68, - }, - { - /* 3 */ - 60, 56, 128, 65, - }, - { - /* 4 */ - 159, 134, 128, 34, - }, - { - /* 5 */ - 234, 188, 128, 28, - }, + { /* 0 */ + 7, 1, 1, 143 }, + { /* 1 */ + 14, 18, 14, 107 }, + { /* 2 */ + 135, 64, 57, 68 }, + { /* 3 */ + 60, 56, 128, 65 }, + { /* 4 */ + 159, 134, 128, 34 }, + { /* 5 */ + 234, 188, 128, 28 }, }; diff --git a/libs/libvpx/vp8/common/modecont.h b/libs/libvpx/vp8/common/modecont.h index b58c7dc2d3d..031f74f2ff6 100644 --- a/libs/libvpx/vp8/common/modecont.h +++ b/libs/libvpx/vp8/common/modecont.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_MODECONT_H_ -#define VP8_COMMON_MODECONT_H_ +#ifndef VPX_VP8_COMMON_MODECONT_H_ +#define VPX_VP8_COMMON_MODECONT_H_ #ifdef __cplusplus extern "C" { @@ -21,4 +21,4 @@ extern const int vp8_mode_contexts[6][4]; } // extern "C" #endif -#endif // VP8_COMMON_MODECONT_H_ +#endif // VPX_VP8_COMMON_MODECONT_H_ diff --git a/libs/libvpx/vp8/common/mv.h b/libs/libvpx/vp8/common/mv.h index b6d2147af87..4cde12f201a 100644 --- a/libs/libvpx/vp8/common/mv.h +++ b/libs/libvpx/vp8/common/mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_MV_H_ -#define VP8_COMMON_MV_H_ +#ifndef VPX_VP8_COMMON_MV_H_ +#define VPX_VP8_COMMON_MV_H_ #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -30,4 +30,4 @@ typedef union int_mv { } // extern "C" #endif -#endif // VP8_COMMON_MV_H_ +#endif // VPX_VP8_COMMON_MV_H_ diff --git a/libs/libvpx/vp8/common/onyx.h b/libs/libvpx/vp8/common/onyx.h index 72fba2ec56b..05c72df3faa 100644 --- a/libs/libvpx/vp8/common/onyx.h +++ b/libs/libvpx/vp8/common/onyx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYX_H_ -#define VP8_COMMON_ONYX_H_ +#ifndef VPX_VP8_COMMON_ONYX_H_ +#define VPX_VP8_COMMON_ONYX_H_ #ifdef __cplusplus extern "C" { @@ -247,38 +247,38 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); -int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags, +int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); -int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags, + int64_t end_time); +int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags, size_t *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); -int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest, +int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); -int vp8_use_as_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_update_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_get_reference(struct VP8_COMP *comp, +int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_get_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_set_reference(struct VP8_COMP *comp, +int vp8_set_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_update_entropy(struct VP8_COMP *comp, int update); -int vp8_set_roimap(struct VP8_COMP *comp, unsigned char *map, unsigned int rows, +int vp8_update_entropy(struct VP8_COMP *cpi, int update); +int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); -int vp8_set_active_map(struct VP8_COMP *comp, unsigned char *map, +int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *comp, VPX_SCALING horiz_mode, +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); -int vp8_get_quantizer(struct VP8_COMP *c); +int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus } #endif -#endif // VP8_COMMON_ONYX_H_ +#endif // VPX_VP8_COMMON_ONYX_H_ diff --git a/libs/libvpx/vp8/common/onyxc_int.h b/libs/libvpx/vp8/common/onyxc_int.h index 9a12c7fb67d..ef8d0076209 100644 --- a/libs/libvpx/vp8/common/onyxc_int.h +++ b/libs/libvpx/vp8/common/onyxc_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_ONYXC_INT_H_ -#define VP8_COMMON_ONYXC_INT_H_ +#ifndef VPX_VP8_COMMON_ONYXC_INT_H_ +#define VPX_VP8_COMMON_ONYXC_INT_H_ #include "vpx_config.h" #include "vp8_rtcd.h" @@ -174,4 +174,4 @@ typedef struct VP8Common { } // extern "C" #endif -#endif // VP8_COMMON_ONYXC_INT_H_ +#endif // VPX_VP8_COMMON_ONYXC_INT_H_ diff --git a/libs/libvpx/vp8/common/onyxd.h b/libs/libvpx/vp8/common/onyxd.h index d3c1b0e972c..801ef87b20f 100644 --- a/libs/libvpx/vp8/common/onyxd.h +++ b/libs/libvpx/vp8/common/onyxd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_ONYXD_H_ -#define VP8_COMMON_ONYXD_H_ +#ifndef VPX_VP8_COMMON_ONYXD_H_ +#define VPX_VP8_COMMON_ONYXD_H_ /* Create/destroy static data structures. */ #ifdef __cplusplus @@ -41,23 +41,23 @@ void vp8dx_set_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst, int x); int vp8dx_get_setting(struct VP8D_COMP *comp, VP8D_SETTING oxst); -int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, - const uint8_t *dest, int64_t time_stamp); -int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, +int vp8dx_receive_compressed_data(struct VP8D_COMP *pbi, size_t size, + const uint8_t *source, int64_t time_stamp); +int vp8dx_get_raw_frame(struct VP8D_COMP *pbi, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); -vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, +vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *comp, +vpx_codec_err_t vp8dx_set_reference(struct VP8D_COMP *pbi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8dx_get_quantizer(const struct VP8D_COMP *c); +int vp8dx_get_quantizer(const struct VP8D_COMP *pbi); #ifdef __cplusplus } #endif -#endif // VP8_COMMON_ONYXD_H_ +#endif // VPX_VP8_COMMON_ONYXD_H_ diff --git a/libs/libvpx/vp8/common/postproc.c b/libs/libvpx/vp8/common/postproc.c index d67ee8a57d8..2ed19c4fd57 100644 --- a/libs/libvpx/vp8/common/postproc.c +++ b/libs/libvpx/vp8/common/postproc.c @@ -65,7 +65,7 @@ void vp8_deblock(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065; int ppl = (int)(level + .5); - const MODE_INFO *mode_info_context = cm->show_frame_mi; + const MODE_INFO *mode_info_context = cm->mi; int mbr, mbc; /* The pixel thresholds are adjusted according to if or not the macroblock @@ -151,124 +151,6 @@ void vp8_de_noise(VP8_COMMON *cm, YV12_BUFFER_CONFIG *source, } #endif // CONFIG_POSTPROC -/* Blend the macro block with a solid colored square. Leave the - * edges unblended to give distinction to macro blocks in areas - * filled with the same color block. 
- */ -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - y += 2 * stride + 2; - for (i = 0; i < 12; ++i) { - for (j = 0; j < 12; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - u += stride + 1; - v += stride + 1; - - for (i = 0; i < 6; ++i) { - for (j = 0; j < 6; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - -/* Blend only the edge of the macro block. Leave center - * unblended to allow for other visualizations to be layered. - */ -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 16; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - for (i = 0; i < 12; ++i) { - y[0] = (y[0] * alpha + y1_const) >> 16; - y[1] = (y[1] * alpha + y1_const) >> 16; - y[14] = (y[14] * alpha + y1_const) >> 16; - y[15] = (y[15] * alpha + y1_const) >> 16; - y += stride; - } - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 16; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (j = 0; j < 8; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - - for (i = 0; i < 6; ++i) { - u[0] = (u[0] * alpha + u1_const) >> 16; - v[0] = (v[0] * alpha + v1_const) >> 16; - - u[7] = (u[7] * alpha + u1_const) >> 16; - v[7] = (v[7] * alpha + v1_const) >> 16; - - u += stride; - v += stride; - } - - for (j = 0; j < 8; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } -} - -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, - int y_1, int u_1, int v_1, int alpha, int stride) { - int i, j; - int y1_const = y_1 * ((1 << 16) - alpha); - int u1_const = u_1 * ((1 << 16) - alpha); - int v1_const = v_1 * ((1 << 16) - alpha); - - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) { - y[j] = (y[j] * alpha + y1_const) >> 16; - } - y += stride; - } - - stride >>= 1; - - for (i = 0; i < 2; ++i) { - for (j = 0; j < 2; ++j) { - u[j] = (u[j] * alpha + u1_const) >> 16; - v[j] = (v[j] * alpha + v1_const) >> 16; - } - u += stride; - v += stride; - } -} - #if CONFIG_POSTPROC int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags) { @@ -325,7 +207,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vpx_clear_system_state(); if ((flags & VP8D_MFQE) && oci->postproc_state.last_frame_valid && - oci->current_video_frame >= 2 && + oci->current_video_frame > 10 && oci->postproc_state.last_base_qindex < 60 && oci->base_qindex - oci->postproc_state.last_base_qindex >= 20) { vp8_multiframe_quality_enhance(oci); diff --git a/libs/libvpx/vp8/common/postproc.h b/libs/libvpx/vp8/common/postproc.h index 7be112b1635..a14f5f1df1d 100644 --- a/libs/libvpx/vp8/common/postproc.h +++ b/libs/libvpx/vp8/common/postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_POSTPROC_H_ -#define VP8_COMMON_POSTPROC_H_ +#ifndef VPX_VP8_COMMON_POSTPROC_H_ +#define VPX_VP8_COMMON_POSTPROC_H_ #include "vpx_ports/mem.h" struct postproc_state { @@ -27,13 +27,13 @@ struct postproc_state { extern "C" { #endif int vp8_post_proc_frame(struct VP8Common *oci, YV12_BUFFER_CONFIG *dest, - vp8_ppflags_t *flags); + vp8_ppflags_t *ppflags); -void vp8_de_noise(struct VP8Common *oci, YV12_BUFFER_CONFIG *source, +void vp8_de_noise(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag, int uvfilter); -void vp8_deblock(struct VP8Common *oci, YV12_BUFFER_CONFIG *source, +void vp8_deblock(struct VP8Common *cm, YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int q, int low_var_thresh, int flag); #define MFQE_PRECISION 4 @@ -43,4 +43,4 @@ void vp8_multiframe_quality_enhance(struct VP8Common *cm); } // extern "C" #endif -#endif // VP8_COMMON_POSTPROC_H_ +#endif // VPX_VP8_COMMON_POSTPROC_H_ diff --git a/libs/libvpx/vp8/common/ppflags.h b/libs/libvpx/vp8/common/ppflags.h index 96e3af6c9c1..bdf08734b96 100644 --- a/libs/libvpx/vp8/common/ppflags.h +++ b/libs/libvpx/vp8/common/ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_PPFLAGS_H_ -#define VP8_COMMON_PPFLAGS_H_ +#ifndef VPX_VP8_COMMON_PPFLAGS_H_ +#define VPX_VP8_COMMON_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -36,4 +36,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_COMMON_PPFLAGS_H_ +#endif // VPX_VP8_COMMON_PPFLAGS_H_ diff --git a/libs/libvpx/vp8/common/quant_common.h b/libs/libvpx/vp8/common/quant_common.h index ff4203df870..049840a2725 100644 --- a/libs/libvpx/vp8/common/quant_common.h +++ b/libs/libvpx/vp8/common/quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_QUANT_COMMON_H_ -#define VP8_COMMON_QUANT_COMMON_H_ +#ifndef VPX_VP8_COMMON_QUANT_COMMON_H_ +#define VPX_VP8_COMMON_QUANT_COMMON_H_ #include "string.h" #include "blockd.h" @@ -30,4 +30,4 @@ extern int vp8_ac_uv_quant(int QIndex, int Delta); } // extern "C" #endif -#endif // VP8_COMMON_QUANT_COMMON_H_ +#endif // VPX_VP8_COMMON_QUANT_COMMON_H_ diff --git a/libs/libvpx/vp8/common/reconinter.c b/libs/libvpx/vp8/common/reconinter.c index 48892c9b8e5..2cb0709318b 100644 --- a/libs/libvpx/vp8/common/reconinter.c +++ b/libs/libvpx/vp8/common/reconinter.c @@ -333,6 +333,13 @@ void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, _16x16mv.as_mv.row &= x->fullpixel_mask; _16x16mv.as_mv.col &= x->fullpixel_mask; + if (2 * _16x16mv.as_mv.col < (x->mb_to_left_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.col > x->mb_to_right_edge + (18 << 3) || + 2 * _16x16mv.as_mv.row < (x->mb_to_top_edge - (19 << 3)) || + 2 * _16x16mv.as_mv.row > x->mb_to_bottom_edge + (18 << 3)) { + return; + } + pre_stride >>= 1; offset = (_16x16mv.as_mv.row >> 3) * pre_stride + (_16x16mv.as_mv.col >> 3); uptr = x->pre.u_buffer + offset; diff --git a/libs/libvpx/vp8/common/reconinter.h b/libs/libvpx/vp8/common/reconinter.h index 4cdd4fee0fd..974e7ce7547 100644 --- a/libs/libvpx/vp8/common/reconinter.h +++ b/libs/libvpx/vp8/common/reconinter.h @@ -8,30 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_RECONINTER_H_ -#define VP8_COMMON_RECONINTER_H_ +#ifndef VPX_VP8_COMMON_RECONINTER_H_ +#define VPX_VP8_COMMON_RECONINTER_H_ #ifdef __cplusplus extern "C" { #endif -extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); -extern void vp8_build_inter16x16_predictors_mb( - MACROBLOCKD *x, unsigned char *dst_y, unsigned char *dst_u, - unsigned char *dst_v, int dst_ystride, int dst_uvstride); +void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, int dst_ystride, + int dst_uvstride); -extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, - unsigned char *dst_y, - int dst_ystride); -extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, - unsigned char *base_pre, - int pre_stride, vp8_subpix_fn_t sppf); +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x, unsigned char *dst_y, + int dst_ystride); +void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, unsigned char *base_pre, + int pre_stride, vp8_subpix_fn_t sppf); -extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); -extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x); +void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_COMMON_RECONINTER_H_ +#endif // VPX_VP8_COMMON_RECONINTER_H_ diff --git a/libs/libvpx/vp8/common/reconintra.h b/libs/libvpx/vp8/common/reconintra.h index fd7c725f35d..029ac00a24b 100644 --- a/libs/libvpx/vp8/common/reconintra.h +++ b/libs/libvpx/vp8/common/reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_RECONINTRA_H_ -#define VP8_COMMON_RECONINTRA_H_ +#ifndef VPX_VP8_COMMON_RECONINTRA_H_ +#define VPX_VP8_COMMON_RECONINTRA_H_ #include "vp8/common/blockd.h" @@ -32,4 +32,4 @@ void vp8_init_intra_predictors(void); } // extern "C" #endif -#endif // VP8_COMMON_RECONINTRA_H_ +#endif // VPX_VP8_COMMON_RECONINTRA_H_ diff --git a/libs/libvpx/vp8/common/reconintra4x4.h b/libs/libvpx/vp8/common/reconintra4x4.h index e17fc58c012..3618ec5cbeb 100644 --- a/libs/libvpx/vp8/common/reconintra4x4.h +++ b/libs/libvpx/vp8/common/reconintra4x4.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_RECONINTRA4X4_H_ -#define VP8_COMMON_RECONINTRA4X4_H_ +#ifndef VPX_VP8_COMMON_RECONINTRA4X4_H_ +#define VPX_VP8_COMMON_RECONINTRA4X4_H_ #include "vp8/common/blockd.h" #ifdef __cplusplus @@ -31,7 +31,7 @@ static INLINE void intra_prediction_down_copy(MACROBLOCKD *xd, *dst_ptr2 = *src_ptr; } -void vp8_intra4x4_predict(unsigned char *Above, unsigned char *yleft, +void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left); @@ -42,4 +42,4 @@ void vp8_init_intra4x4_predictors_internal(void); } // extern "C" #endif -#endif // VP8_COMMON_RECONINTRA4X4_H_ +#endif // VPX_VP8_COMMON_RECONINTRA4X4_H_ diff --git a/libs/libvpx/vp8/common/rtcd_defs.pl b/libs/libvpx/vp8/common/rtcd_defs.pl index 3df745f75a8..8452b5e8545 100644 --- a/libs/libvpx/vp8/common/rtcd_defs.pl +++ b/libs/libvpx/vp8/common/rtcd_defs.pl @@ -31,10 +31,10 @@ () # # Dequant # -add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; +add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *DQC"; specialize qw/vp8_dequantize_b mmx neon msa mmi/; -add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; +add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *dest, int stride"; specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; @@ -46,20 +46,20 @@ () # # Loopfilter # -add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; +add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"; specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/; $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; @@ -67,7 
+67,7 @@ () $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; $vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi; -add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/; $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; @@ -75,7 +75,7 @@ () $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; $vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi; -add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/; $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; @@ -83,7 +83,7 @@ () $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; $vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi; -add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; +add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit"; specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/; $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; @@ -95,31 +95,31 @@ () # IDCT # #idct16 -add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; +add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride"; specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/; #iwalsh1 -add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output"; +add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *mb_dqcoeff"; specialize qw/vp8_short_inv_walsh4x4_1 dspr2/; #iwalsh16 -add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; +add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *mb_dqcoeff"; specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/; #idct1_scalar_add -add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; +add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride"; specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/; # # RECON # -add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/; -add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/; -add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; +add_proto qw/void 
vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # @@ -127,11 +127,11 @@ () # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"; + add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; specialize qw/vp8_filter_by_weight16x16 sse2 msa/; @@ -145,29 +145,29 @@ () # # Subpixel # -add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; -add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; -add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; +add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; specialize qw/vp8_bilinear_predict8x8 sse2 ssse3 neon msa/; -add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char 
*src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict8x4 mmx neon msa/; +add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; +specialize qw/vp8_bilinear_predict8x4 sse2 neon msa/; -add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict4x4 mmx neon msa/; +add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch"; +specialize qw/vp8_bilinear_predict4x4 sse2 neon msa/; # # Encoder functions below this point. @@ -177,10 +177,8 @@ () # # Block copy # -if ($opts{arch} =~ /x86/) { - add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n"; - specialize qw/vp8_copy32xn sse2 sse3/; -} +add_proto qw/void vp8_copy32xn/, "const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height"; +specialize qw/vp8_copy32xn sse2 sse3/; # # Forward DCT @@ -223,7 +221,7 @@ () $vp8_full_search_sad_sse3=vp8_full_search_sadx3; $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; -add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; +add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_refining_search_sad sse2 msa/; $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; $vp8_refining_search_sad_msa=vp8_refining_search_sadx4; diff --git a/libs/libvpx/vp8/common/setupintrarecon.h b/libs/libvpx/vp8/common/setupintrarecon.h index f3ffa16607a..903a536aed8 100644 --- a/libs/libvpx/vp8/common/setupintrarecon.h +++ b/libs/libvpx/vp8/common/setupintrarecon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_SETUPINTRARECON_H_ -#define VP8_COMMON_SETUPINTRARECON_H_ +#ifndef VPX_VP8_COMMON_SETUPINTRARECON_H_ +#define VPX_VP8_COMMON_SETUPINTRARECON_H_ #include "./vpx_config.h" #include "vpx_scale/yv12config.h" @@ -37,4 +37,4 @@ static INLINE void setup_intra_recon_left(unsigned char *y_buffer, } // extern "C" #endif -#endif // VP8_COMMON_SETUPINTRARECON_H_ +#endif // VPX_VP8_COMMON_SETUPINTRARECON_H_ diff --git a/libs/libvpx/vp8/common/swapyv12buffer.h b/libs/libvpx/vp8/common/swapyv12buffer.h index 0ee9a52ceb0..e37c471f632 100644 --- a/libs/libvpx/vp8/common/swapyv12buffer.h +++ b/libs/libvpx/vp8/common/swapyv12buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_COMMON_SWAPYV12BUFFER_H_ -#define VP8_COMMON_SWAPYV12BUFFER_H_ +#ifndef VPX_VP8_COMMON_SWAPYV12BUFFER_H_ +#define VPX_VP8_COMMON_SWAPYV12BUFFER_H_ #include "vpx_scale/yv12config.h" @@ -24,4 +24,4 @@ void vp8_swap_yv12_buffer(YV12_BUFFER_CONFIG *new_frame, } // extern "C" #endif -#endif // VP8_COMMON_SWAPYV12BUFFER_H_ +#endif // VPX_VP8_COMMON_SWAPYV12BUFFER_H_ diff --git a/libs/libvpx/vp8/common/systemdependent.h b/libs/libvpx/vp8/common/systemdependent.h index 3d44e37cf24..83a5513aaef 100644 --- a/libs/libvpx/vp8/common/systemdependent.h +++ b/libs/libvpx/vp8/common/systemdependent.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_SYSTEMDEPENDENT_H_ -#define VP8_COMMON_SYSTEMDEPENDENT_H_ +#ifndef VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ +#define VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ #include "vpx_config.h" @@ -24,4 +24,4 @@ void vp8_machine_specific_config(struct VP8Common *); } // extern "C" #endif -#endif // VP8_COMMON_SYSTEMDEPENDENT_H_ +#endif // VPX_VP8_COMMON_SYSTEMDEPENDENT_H_ diff --git a/libs/libvpx/vp8/common/threading.h b/libs/libvpx/vp8/common/threading.h index c89cf9bad74..7a637cdff22 100644 --- a/libs/libvpx/vp8/common/threading.h +++ b/libs/libvpx/vp8/common/threading.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_THREADING_H_ -#define VP8_COMMON_THREADING_H_ +#ifndef VPX_VP8_COMMON_THREADING_H_ +#define VPX_VP8_COMMON_THREADING_H_ #include "./vpx_config.h" @@ -205,4 +205,4 @@ static INLINE void vp8_atomic_spin_wait( } // extern "C" #endif -#endif // VP8_COMMON_THREADING_H_ +#endif // VPX_VP8_COMMON_THREADING_H_ diff --git a/libs/libvpx/vp8/common/treecoder.c b/libs/libvpx/vp8/common/treecoder.c index 9feb40a5a7c..f1e78f43210 100644 --- a/libs/libvpx/vp8/common/treecoder.c +++ b/libs/libvpx/vp8/common/treecoder.c @@ -12,6 +12,7 @@ #include <stdio.h> #include "vp8/common/treecoder.h" +#include "vpx/vpx_integer.h" static void tree2tok(struct vp8_token_struct *const p, vp8_tree t, int i, int v, int L) { @@ -79,7 +80,7 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ vp8_prob probs[/* n-1 */], unsigned int branch_ct[/* n-1 */][2], const unsigned int num_events[/* n */], - unsigned int Pfac, int rd) { + unsigned int Pfactor, int Round) { const int tree_len = n - 1; int t = 0; @@ -89,10 +90,10 @@ void vp8_tree_probs_from_distribution(int n, /* n = size of alphabet */ const unsigned int *const c = branch_ct[t]; const unsigned int tot = c[0] + c[1]; - assert(tot < (1 << 24)); /* no overflow below */ - if (tot) { - const unsigned int p = ((c[0] * Pfac) + (rd ? tot >> 1 : 0)) / tot; + const unsigned int p = + (unsigned int)(((uint64_t)c[0] * Pfactor) + (Round ? tot >> 1 : 0)) / + tot; probs[t] = p < 256 ? (p ? p : 1) : 255; /* agree w/old version for now */ } else { probs[t] = vp8_prob_half; diff --git a/libs/libvpx/vp8/common/treecoder.h b/libs/libvpx/vp8/common/treecoder.h index d8503cf3f8f..d7d8d0ead00 100644 --- a/libs/libvpx/vp8/common/treecoder.h +++ b/libs/libvpx/vp8/common/treecoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VP8_COMMON_TREECODER_H_ -#define VP8_COMMON_TREECODER_H_ +#ifndef VPX_VP8_COMMON_TREECODER_H_ +#define VPX_VP8_COMMON_TREECODER_H_ #ifdef __cplusplus extern "C" { @@ -32,7 +32,7 @@ typedef const bool_coder_spec c_bool_coder_spec; typedef const bool_writer c_bool_writer; typedef const bool_reader c_bool_reader; -#define vp8_complement(x) (255 - x) +#define vp8_complement(x) (255 - (x)) /* We build coding trees compactly in arrays. Each node of the tree is a pair of vp8_tree_indices. @@ -79,4 +79,4 @@ void vp8bc_tree_probs_from_distribution(int n, /* n = size of alphabet */ } // extern "C" #endif -#endif // VP8_COMMON_TREECODER_H_ +#endif // VPX_VP8_COMMON_TREECODER_H_ diff --git a/libs/libvpx/vp8/common/vp8_entropymodedata.h b/libs/libvpx/vp8/common/vp8_entropymodedata.h index 9a81ebfe624..3fc942e0509 100644 --- a/libs/libvpx/vp8/common/vp8_entropymodedata.h +++ b/libs/libvpx/vp8/common/vp8_entropymodedata.h @@ -6,10 +6,10 @@ * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. -*/ + */ -#ifndef VP8_COMMON_VP8_ENTROPYMODEDATA_H_ -#define VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#ifndef VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#define VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ #ifdef __cplusplus extern "C" { @@ -169,4 +169,4 @@ const vp8_prob } // extern "C" #endif -#endif // VP8_COMMON_VP8_ENTROPYMODEDATA_H_ +#endif // VPX_VP8_COMMON_VP8_ENTROPYMODEDATA_H_ diff --git a/libs/libvpx/vp8/common/vp8_skin_detection.h b/libs/libvpx/vp8/common/vp8_skin_detection.h index 4d27f5eb2ea..ef0e4ae4fe7 100644 --- a/libs/libvpx/vp8/common/vp8_skin_detection.h +++ b/libs/libvpx/vp8/common/vp8_skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_COMMON_SKIN_DETECTION_H_ -#define VP8_COMMON_SKIN_DETECTION_H_ +#ifndef VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ +#define VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ #include "vp8/encoder/onyx_int.h" #include "vpx/vpx_integer.h" @@ -44,4 +44,4 @@ void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); } // extern "C" #endif -#endif // VP8_COMMON_SKIN_DETECTION_H_ +#endif // VPX_VP8_COMMON_VP8_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c new file mode 100644 index 00000000000..9bf65d8045e --- /dev/null +++ b/libs/libvpx/vp8/common/x86/bilinear_filter_sse2.c @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <emmintrin.h> + +#include "./vp8_rtcd.h" +#include "./vpx_config.h" +#include "vp8/common/filter.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_ports/mem.h" + +static INLINE void horizontal_16x16(uint8_t *src, const int stride, + uint16_t *dst, const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_lo); + _mm_store_si128((__m128i *)(dst + 8), a_hi); + src += stride; + dst += 16; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 17; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i a_lo = _mm_unpacklo_epi8(a, zero); + const __m128i a_hi = _mm_unpackhi_epi8(a, zero); + const __m128i a_lo_filtered = _mm_mullo_epi16(a_lo, hfilter_0); + const __m128i a_hi_filtered = _mm_mullo_epi16(a_hi, hfilter_0); + + const __m128i b = _mm_loadu_si128((__m128i *)(src + 1)); + const __m128i b_lo = _mm_unpacklo_epi8(b, zero); + const __m128i b_hi = _mm_unpackhi_epi8(b, zero); + const __m128i b_lo_filtered = _mm_mullo_epi16(b_lo, hfilter_1); + const __m128i b_hi_filtered = _mm_mullo_epi16(b_hi, hfilter_1); + + const __m128i sum_lo = _mm_add_epi16(a_lo_filtered, b_lo_filtered); + const __m128i sum_hi = _mm_add_epi16(a_hi_filtered, b_hi_filtered); + + const __m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + _mm_store_si128((__m128i *)dst, shifted_lo); + _mm_store_si128((__m128i *)(dst + 8), shifted_hi); + src += stride; + dst += 16; + } + } +} + +static INLINE void vertical_16x16(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 16; ++h) { + const __m128i row_lo = _mm_load_si128((__m128i *)src); + const __m128i row_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i packed = _mm_packus_epi16(row_lo, row_hi); + _mm_store_si128((__m128i *)dst, packed); + src += 16; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0_lo = _mm_load_si128((__m128i *)src); + __m128i row_0_hi = _mm_load_si128((__m128i *)(src + 8)); + src += 16; + for (h = 0; h < 16; ++h) { + const __m128i row_0_lo_filtered = _mm_mullo_epi16(row_0_lo, vfilter_0); + const __m128i row_0_hi_filtered = _mm_mullo_epi16(row_0_hi, vfilter_0); + + const __m128i row_1_lo = _mm_load_si128((__m128i *)src); + const __m128i row_1_hi = _mm_load_si128((__m128i *)(src + 8)); + const __m128i row_1_lo_filtered = _mm_mullo_epi16(row_1_lo, vfilter_1); + const __m128i row_1_hi_filtered = _mm_mullo_epi16(row_1_hi, vfilter_1); + + const __m128i sum_lo = + _mm_add_epi16(row_0_lo_filtered, row_1_lo_filtered); + const __m128i sum_hi = + _mm_add_epi16(row_0_hi_filtered, row_1_hi_filtered); + + const 
__m128i compensated_lo = _mm_add_epi16(sum_lo, round_factor); + const __m128i compensated_hi = _mm_add_epi16(sum_hi, round_factor); + + const __m128i shifted_lo = + _mm_srai_epi16(compensated_lo, VP8_FILTER_SHIFT); + const __m128i shifted_hi = + _mm_srai_epi16(compensated_hi, VP8_FILTER_SHIFT); + + const __m128i packed = _mm_packus_epi16(shifted_lo, shifted_hi); + _mm_store_si128((__m128i *)dst, packed); + row_0_lo = row_1_lo; + row_0_hi = row_1_hi; + src += 16; + dst += stride; + } + } +} + +void vp8_bilinear_predict16x16_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[16 * 17]); + + assert((xoffset | yoffset) != 0); + + horizontal_16x16(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_16x16(FData, dst_ptr, dst_pitch, yoffset); +} + +static INLINE void horizontal_8xN(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset, const int height) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadl_epi64((__m128i *)src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_store_si128((__m128i *)dst, a_u16); + src += stride; + dst += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + // Filter horizontally. Rather than load the whole array and transpose, load + // 16 values (overreading) and shift to set up the second value. Do an + // "extra" 9th line so the vertical pass has the necessary context. + for (h = 0; h < height; ++h) { + const __m128i a = _mm_loadu_si128((__m128i *)src); + const __m128i b = _mm_srli_si128(a, 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_store_si128((__m128i *)dst, shifted); + src += stride; + dst += 8; + } + } +} + +static INLINE void vertical_8xN(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset, const int height) { + int h; + + if (yoffset == 0) { + for (h = 0; h < height; ++h) { + const __m128i row = _mm_load_si128((__m128i *)src); + const __m128i packed = _mm_packus_epi16(row, row); + _mm_storel_epi64((__m128i *)dst, packed); + src += 8; + dst += stride; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + __m128i row_0 = _mm_load_si128((__m128i *)src); + src += 8; + for (h = 0; h < height; ++h) { + const __m128i row_1 = _mm_load_si128((__m128i *)src); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + const __m128i packed = 
_mm_packus_epi16(shifted, shifted); + _mm_storel_epi64((__m128i *)dst, packed); + row_0 = row_1; + src += 8; + dst += stride; + } + } +} + +void vp8_bilinear_predict8x8_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 9]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 9); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 8); +} + +void vp8_bilinear_predict8x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[8 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_8xN(src_ptr, src_pixels_per_line, FData, xoffset, 5); + + vertical_8xN(FData, dst_ptr, dst_pitch, yoffset, 4); +} + +static INLINE void horizontal_4x4(uint8_t *src, const int stride, uint16_t *dst, + const int xoffset) { + int h; + const __m128i zero = _mm_setzero_si128(); + + if (xoffset == 0) { + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + _mm_storel_epi64((__m128i *)dst, a_u16); + src += stride; + dst += 4; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i hfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][0]); + const __m128i hfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[xoffset][1]); + + for (h = 0; h < 5; ++h) { + const __m128i a = load_unaligned_u32(src); + const __m128i b = load_unaligned_u32(src + 1); + const __m128i a_u16 = _mm_unpacklo_epi8(a, zero); + const __m128i b_u16 = _mm_unpacklo_epi8(b, zero); + const __m128i a_filtered = _mm_mullo_epi16(a_u16, hfilter_0); + const __m128i b_filtered = _mm_mullo_epi16(b_u16, hfilter_1); + const __m128i sum = _mm_add_epi16(a_filtered, b_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + _mm_storel_epi64((__m128i *)dst, shifted); + src += stride; + dst += 4; + } + } +} + +static INLINE void vertical_4x4(uint16_t *src, uint8_t *dst, const int stride, + const int yoffset) { + int h; + + if (yoffset == 0) { + for (h = 0; h < 4; h += 2) { + const __m128i row = _mm_load_si128((__m128i *)src); + __m128i packed = _mm_packus_epi16(row, row); + store_unaligned_u32(dst, packed); + dst += stride; + packed = _mm_srli_si128(packed, 4); + store_unaligned_u32(dst, packed); + dst += stride; + src += 8; + } + return; + } + + { + const __m128i round_factor = _mm_set1_epi16(1 << (VP8_FILTER_SHIFT - 1)); + const __m128i vfilter_0 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][0]); + const __m128i vfilter_1 = _mm_set1_epi16(vp8_bilinear_filters[yoffset][1]); + + for (h = 0; h < 4; h += 2) { + const __m128i row_0 = _mm_load_si128((__m128i *)src); + const __m128i row_1 = _mm_loadu_si128((__m128i *)(src + 4)); + const __m128i row_0_filtered = _mm_mullo_epi16(row_0, vfilter_0); + const __m128i row_1_filtered = _mm_mullo_epi16(row_1, vfilter_1); + const __m128i sum = _mm_add_epi16(row_0_filtered, row_1_filtered); + const __m128i compensated = _mm_add_epi16(sum, round_factor); + const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT); + __m128i packed = _mm_packus_epi16(shifted, shifted); + storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + packed = _mm_srli_si128(packed, 4); + dst += stride; + storeu_uint32(dst, _mm_cvtsi128_si32(packed)); + dst += stride; + src += 8; + } + } +} 
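/* [Editorial note, not part of the patch] Every pass in this file applies
 * the same two-tap filter in 16-bit lanes: out = (a * f[0] + b * f[1] + 64)
 * >> VP8_FILTER_SHIFT, where f = vp8_bilinear_filters[offset] and
 * VP8_FILTER_SHIFT is 7 (filter weight 128, matching the %define kept in
 * the assembly files below), so round_factor is 1 << 6 = 64. For offset 4
 * the pair is { 64, 64 } and the expression reduces to the rounded average
 * of the two neighbouring samples; an offset of 0 gives { 128, 0 }, which
 * is why the zero-offset branches degrade to plain copies. */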
+ +void vp8_bilinear_predict4x4_sse2(uint8_t *src_ptr, int src_pixels_per_line, + int xoffset, int yoffset, uint8_t *dst_ptr, + int dst_pitch) { + DECLARE_ALIGNED(16, uint16_t, FData[4 * 5]); + + assert((xoffset | yoffset) != 0); + + horizontal_4x4(src_ptr, src_pixels_per_line, FData, xoffset); + + vertical_4x4(FData, dst_ptr, dst_pitch, yoffset); +} diff --git a/libs/libvpx/vp8/common/x86/filter_x86.c b/libs/libvpx/vp8/common/x86/filter_x86.c deleted file mode 100644 index 2405342f02a..00000000000 --- a/libs/libvpx/vp8/common/x86/filter_x86.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vp8/common/x86/filter_x86.h" - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = { - { 128, 128, 128, 128, 0, 0, 0, 0 }, { 112, 112, 112, 112, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 32, 32, 32, 32 }, { 80, 80, 80, 80, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64 }, { 48, 48, 48, 48, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 } -}; - -DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = { - { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 }, - { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 }, - { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 }, - { 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 }, - { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 }, - { 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 }, - { 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 }, - { 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 } -}; diff --git a/libs/libvpx/vp8/common/x86/filter_x86.h b/libs/libvpx/vp8/common/x86/filter_x86.h deleted file mode 100644 index d282841bee4..00000000000 --- a/libs/libvpx/vp8/common/x86/filter_x86.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#ifndef VP8_COMMON_X86_FILTER_X86_H_ -#define VP8_COMMON_X86_FILTER_X86_H_ - -#include "vpx_ports/mem.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with - * duplicated values */ - -/* duplicated 4x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]); - -/* duplicated 8x */ -extern DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP8_COMMON_X86_FILTER_X86_H_ diff --git a/libs/libvpx/vp8/common/x86/idct_blk_sse2.c b/libs/libvpx/vp8/common/x86/idct_blk_sse2.c index 8aefb279970..897ed5b6527 100644 --- a/libs/libvpx/vp8/common/x86/idct_blk_sse2.c +++ b/libs/libvpx/vp8/common/x86/idct_blk_sse2.c @@ -42,43 +42,43 @@ void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, } void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, - unsigned char *dstu, - unsigned char *dstv, int stride, + unsigned char *dst_u, + unsigned char *dst_v, int stride, char *eobs) { if (((short *)(eobs))[0]) { if (((short *)(eobs))[0] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); } } q += 32; - dstu += stride * 4; + dst_u += stride * 4; if (((short *)(eobs))[1]) { if (((short *)(eobs))[1] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_u, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstu, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_u, stride); } } q += 32; if (((short *)(eobs))[2]) { if (((short *)(eobs))[2] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); } } q += 32; - dstv += stride * 4; + dst_v += stride * 4; if (((short *)(eobs))[3]) { if (((short *)(eobs))[3] & 0xfefe) { - vp8_idct_dequant_full_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_full_2x_sse2(q, dq, dst_v, stride); } else { - vp8_idct_dequant_0_2x_sse2(q, dq, dstv, stride); + vp8_idct_dequant_0_2x_sse2(q, dq, dst_v, stride); } } } diff --git a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm index 82d7bf91a69..0043e93b061 100644 --- a/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/libs/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -13,7 +13,7 @@ SECTION .text -;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) +;void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff) global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE sym(vp8_short_inv_walsh4x4_sse2): push rbp diff --git a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm index 1f3a2baca00..67bcd0cbd71 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -10,8 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) - %define BLOCK_HEIGHT_WIDTH 4 %define vp8_filter_weight 128 @@ -205,280 +203,6 @@ sym(vp8_filter_block1dc_v6_mmx): ret -;void bilinear_predict8x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x4_mmx) PRIVATE 
-sym(vp8_bilinear_predict8x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - mov rsi, arg(0) ;src_ptr ; - add rax, rcx - - movsxd rdx, dword ptr arg(5) ;dst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - add rsi, rdx ; next line -.next_row_8x4: - movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movq mm4, mm3 ; make a copy of current line - - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - punpckhbw mm4, mm0 ; - - pmullw mm3, mm1 ; - pmullw mm4, mm1 ; - - movq mm5, [rsi+1] ; - movq mm6, mm5 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 ; - - pmullw mm5, mm2 ; - pmullw mm6, mm2 ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - movq mm5, mm7 ; - movq mm6, mm7 ; - - punpcklbw mm5, mm0 ; - punpckhbw mm6, mm0 - - pmullw mm5, [rax] ; - pmullw mm6, [rax] ; - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - movq mm7, mm3 ; - packuswb mm7, mm4 ; - - - pmullw mm3, [rax+16] ; - pmullw mm4, [rax+16] ; - - paddw mm3, mm5 ; - paddw mm4, mm6 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw mm4, [GLOBAL(rd)] ; - psraw mm4, VP8_FILTER_SHIFT ; - - packuswb mm3, mm4 - - movq [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch - add rsi, rdx ; next line - add rdi, r8 -%endif - cmp rdi, rcx ; - jne .next_row_8x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void bilinear_predict4x4_mmx -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict4x4_mmx) PRIVATE -sym(vp8_bilinear_predict4x4_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]; - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]; - - movsxd rax, dword ptr arg(2) ;xoffset - mov rdi, arg(4) ;dst_ptr ; - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - shl rax, 5 - - add rax, rcx ; HFilter - mov rsi, arg(0) ;src_ptr ; - - movsxd rdx, dword ptr arg(5) ;ldst_pitch - movq mm1, [rax] ; - - movq mm2, [rax+16] ; - movsxd rax, 
dword ptr arg(3) ;yoffset - - pxor mm0, mm0 ; - shl rax, 5 - - add rax, rcx - lea rcx, [rdi+rdx*4] ; - - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ; - - ; get the first horizontal line done ; - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movq mm7, mm3 ; - packuswb mm7, mm0 ; - - add rsi, rdx ; next line -.next_row_4x4: - movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06 - - pmullw mm3, mm1 ; - movd mm5, [rsi+1] ; - - punpcklbw mm5, mm0 ; - pmullw mm5, mm2 ; - - paddw mm3, mm5 ; - - movq mm5, mm7 ; - punpcklbw mm5, mm0 ; - - pmullw mm5, [rax] ; - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - movq mm7, mm3 ; - - packuswb mm7, mm0 ; - - pmullw mm3, [rax+16] ; - paddw mm3, mm5 ; - - - paddw mm3, [GLOBAL(rd)] ; xmm3 += round value - psraw mm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb mm3, mm0 - movd [rdi], mm3 ; store the results in the destination - -%if ABI_IS_32BIT - add rsi, rdx ; next line - add rdi, dword ptr arg(5) ;dst_pitch ; -%else - movsxd r8, dword ptr arg(5) ;dst_pitch ; - add rsi, rdx ; next line - add rdi, r8 -%endif - - cmp rdi, rcx ; - jne .next_row_4x4 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - SECTION_RODATA align 16 rd: diff --git a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm index 6e70f6d2e80..51c015e3df0 100644 --- a/libs/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/libs/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -10,7 +10,6 @@ %include "vpx_ports/x86_abi_support.asm" -extern sym(vp8_bilinear_filters_x86_8) %define BLOCK_HEIGHT_WIDTH 4 %define VP8_FILTER_WEIGHT 128 @@ -958,419 +957,6 @@ sym(vp8_unpack_block1d16_h6_sse2): ret -;void vp8_bilinear_predict16x16_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -extern sym(vp8_bilinear_filters_x86_8) -global sym(vp8_bilinear_predict16x16_sse2) PRIVATE -sym(vp8_bilinear_predict16x16_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - movsxd rax, dword ptr arg(2) ;xoffset - - cmp rax, 0 ;skip first_pass filter if xoffset=0 - je .b16x16_sp_only - - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - - cmp rax, 0 ;skip second_pass filter if yoffset=0 - je .b16x16_fp_only - - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r8, dword ptr arg(5) ;dst_pitch -%endif - ; get the first horizontal line done - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw 
xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - add rsi, rdx ; next line -.next_row: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, [rax] - pmullw xmm6, [rax] - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - movdqa xmm7, xmm3 - packuswb xmm7, xmm4 - - pmullw xmm3, [rax+16] - pmullw xmm4, [rax+16] - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rdx ; next line -%if ABI_IS_32BIT - add rdi, DWORD PTR arg(5) ;dst_pitch -%else - add rdi, r8 -%endif - - cmp rdi, rcx - jne .next_row - - jmp .done - -.b16x16_sp_only: - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - mov rdi, arg(4) ;dst_ptr - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - add rsi, rax ; next line -.next_row_spo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - - movdqa xmm5, xmm7 - movdqa xmm6, xmm7 - - movdqa xmm4, xmm3 ; make a copy of current line - movdqa xmm7, xmm3 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm5, xmm1 - pmullw xmm6, xmm1 - pmullw xmm3, xmm2 - pmullw xmm4, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ;dst_pitch - cmp rdi, rcx - jne .next_row_spo - - jmp .done - -.b16x16_fp_only: - lea rcx, [rdi+rdx*8] - lea rcx, [rcx+rdx*8] - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - pxor xmm0, xmm0 - -.next_row_fpo: - movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 - movdqa xmm4, xmm3 ; make a copy of current line - - punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 - punpckhbw xmm4, xmm0 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm1 - - movdqu xmm5, [rsi+1] - movdqa xmm6, xmm5 - - punpcklbw xmm5, xmm0 - punpckhbw xmm6, xmm0 - - pmullw xmm5, xmm2 - pmullw xmm6, xmm2 - - paddw xmm3, xmm5 - paddw xmm4, xmm6 - - paddw 
xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - paddw xmm4, [GLOBAL(rd)] - psraw xmm4, VP8_FILTER_SHIFT - - packuswb xmm3, xmm4 - movdqa [rdi], xmm3 ; store the results in the destination - - add rsi, rax ; next line - add rdi, rdx ; dst_pitch - cmp rdi, rcx - jne .next_row_fpo - -.done: - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp8_bilinear_predict8x8_sse2 -;( -; unsigned char *src_ptr, -; int src_pixels_per_line, -; int xoffset, -; int yoffset, -; unsigned char *dst_ptr, -; int dst_pitch -;) -global sym(vp8_bilinear_predict8x8_sse2) PRIVATE -sym(vp8_bilinear_predict8x8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 144 ; reserve 144 bytes - - ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] - ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] - lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] - - mov rsi, arg(0) ;src_ptr - movsxd rdx, dword ptr arg(1) ;src_pixels_per_line - - ;Read 9-line unaligned data in and put them on stack. This gives a big - ;performance boost. - movdqu xmm0, [rsi] - lea rax, [rdx + rdx*2] - movdqu xmm1, [rsi+rdx] - movdqu xmm2, [rsi+rdx*2] - add rsi, rax - movdqu xmm3, [rsi] - movdqu xmm4, [rsi+rdx] - movdqu xmm5, [rsi+rdx*2] - add rsi, rax - movdqu xmm6, [rsi] - movdqu xmm7, [rsi+rdx] - - movdqa XMMWORD PTR [rsp], xmm0 - - movdqu xmm0, [rsi+rdx*2] - - movdqa XMMWORD PTR [rsp+16], xmm1 - movdqa XMMWORD PTR [rsp+32], xmm2 - movdqa XMMWORD PTR [rsp+48], xmm3 - movdqa XMMWORD PTR [rsp+64], xmm4 - movdqa XMMWORD PTR [rsp+80], xmm5 - movdqa XMMWORD PTR [rsp+96], xmm6 - movdqa XMMWORD PTR [rsp+112], xmm7 - movdqa XMMWORD PTR [rsp+128], xmm0 - - movsxd rax, dword ptr arg(2) ;xoffset - shl rax, 5 - add rax, rcx ;HFilter - - mov rdi, arg(4) ;dst_ptr - movsxd rdx, dword ptr arg(5) ;dst_pitch - - movdqa xmm1, [rax] - movdqa xmm2, [rax+16] - - movsxd rax, dword ptr arg(3) ;yoffset - shl rax, 5 - add rax, rcx ;VFilter - - lea rcx, [rdi+rdx*8] - - movdqa xmm5, [rax] - movdqa xmm6, [rax+16] - - pxor xmm0, xmm0 - - ; get the first horizontal line done - movdqa xmm3, XMMWORD PTR [rsp] - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm7, xmm3 - add rsp, 16 ; next line -.next_row8x8: - movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 - movdqa xmm4, xmm3 ; make a copy of current line - psrldq xmm4, 1 - - punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07 - punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08 - - pmullw xmm3, xmm1 - pmullw xmm4, xmm2 - - paddw xmm3, xmm4 - pmullw xmm7, xmm5 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - movdqa xmm4, xmm3 - - pmullw xmm3, xmm6 - paddw xmm3, xmm7 - - movdqa xmm7, xmm4 - - paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value - psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 - - packuswb xmm3, xmm0 - movq [rdi], xmm3 ; store the results in the destination - - add rsp, 16 ; next line - add rdi, rdx - - cmp rdi, rcx - jne .next_row8x8 - - ;add rsp, 144 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret 
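The MMX/SSE2 bilinear predictors removed above all implement the same two-pass scheme: a horizontal 2-tap filter into a 16-bit intermediate, then a vertical 2-tap filter, with each pass adding the rounding constant rd (64) and shifting right by VP8_FILTER_SHIFT (7) so that taps summing to VP8_FILTER_WEIGHT (128) normalize back to pixel range. A minimal scalar sketch of that computation, useful when reading the deleted assembly (names and the width/height parameterization are illustrative; the tree's portable equivalents are the vp8_bilinear_predict*_c functions in vp8/common/filter.c):

    /* Two-tap bilinear taps for 1/8-pel offset o in 0..7: {128 - 16*o, 16*o}.
     * vp8_bilinear_filters_x86_8 stores the same taps replicated eight times
     * per entry (32 bytes each, hence the `shl rax, 5` indexing above). */
    static void bilinear_predict_sketch(const unsigned char *src, int src_stride,
                                        int xoffset, int yoffset,
                                        unsigned char *dst, int dst_stride,
                                        int width, int height) {
      const int hf1 = xoffset << 4, hf0 = 128 - hf1;
      const int vf1 = yoffset << 4, vf0 = 128 - vf1;
      unsigned short tmp[17 * 16]; /* height + 1 intermediate rows, width <= 16 */
      int r, c;

      /* First pass: horizontal filter, producing height + 1 rows. */
      for (r = 0; r < height + 1; ++r) {
        for (c = 0; c < width; ++c)
          tmp[r * width + c] =
              (unsigned short)((src[c] * hf0 + src[c + 1] * hf1 + 64) >> 7);
        src += src_stride;
      }

      /* Second pass: vertical filter over the intermediate rows. Each pass
       * already yields values in 0..255, which is why the assembly can use
       * packuswb without an explicit clamp. */
      for (r = 0; r < height; ++r) {
        for (c = 0; c < width; ++c)
          dst[c] = (unsigned char)((tmp[r * width + c] * vf0 +
                                    tmp[(r + 1) * width + c] * vf1 + 64) >> 7);
        dst += dst_stride;
      }
    }

The removed routines additionally special-case xoffset == 0 and yoffset == 0 (the .b16x16_sp_only and .b16x16_fp_only paths) to skip whichever pass degenerates to the {128, 0} tap pair.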
- - SECTION_RODATA align 16 rd: diff --git a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c index b9d087e20df..7fb83c2d5e2 100644 --- a/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c +++ b/libs/libvpx/vp8/common/x86/vp8_asm_stubs.c @@ -11,7 +11,6 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "vpx_ports/mem.h" -#include "filter_x86.h" extern const short vp8_six_tap_x86[8][6 * 8]; @@ -95,9 +94,7 @@ void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned short, FData2[24 * 24]); /* Temp data bufffer used in filtering */ @@ -236,9 +233,7 @@ extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr, void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, - int dst_pitch - - ) { + int dst_pitch) { DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]); if (xoffset) { @@ -351,8 +346,8 @@ void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, yoffset); } else { /* ssse3 second-pass only function couldn't handle (xoffset==0 && - * yoffset==0) case correctly. Add copy function here to guarantee - * six-tap function handles all possible offsets. */ + * yoffset==0) case correctly. Add copy function here to guarantee + * six-tap function handles all possible offsets. */ int r; for (r = 0; r < 4; ++r) { diff --git a/libs/libvpx/vp8/decoder/dboolhuff.h b/libs/libvpx/vp8/decoder/dboolhuff.h index 04c027cd786..f2a18f0d90d 100644 --- a/libs/libvpx/vp8/decoder/dboolhuff.h +++ b/libs/libvpx/vp8/decoder/dboolhuff.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DBOOLHUFF_H_ -#define VP8_DECODER_DBOOLHUFF_H_ +#ifndef VPX_VP8_DECODER_DBOOLHUFF_H_ +#define VPX_VP8_DECODER_DBOOLHUFF_H_ #include #include @@ -76,7 +76,7 @@ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { } { - register int shift = vp8_norm[range]; + const unsigned char shift = vp8_norm[(unsigned char)range]; range <<= shift; value <<= shift; count -= shift; @@ -127,4 +127,4 @@ static INLINE int vp8dx_bool_error(BOOL_DECODER *br) { } // extern "C" #endif -#endif // VP8_DECODER_DBOOLHUFF_H_ +#endif // VPX_VP8_DECODER_DBOOLHUFF_H_ diff --git a/libs/libvpx/vp8/decoder/decodeframe.c b/libs/libvpx/vp8/decoder/decodeframe.c index 077bd3da268..650d1d0408d 100644 --- a/libs/libvpx/vp8/decoder/decodeframe.c +++ b/libs/libvpx/vp8/decoder/decodeframe.c @@ -674,7 +674,7 @@ static unsigned int read_partition_size(VP8D_COMP *pbi, static int read_is_valid(const unsigned char *start, size_t len, const unsigned char *end) { - return (start + len > start && start + len <= end); + return len != 0 && end > start && len <= (size_t)(end - start); } static unsigned int read_available_partition_size( @@ -686,6 +686,12 @@ static unsigned int read_available_partition_size( const unsigned char *partition_size_ptr = token_part_sizes + i * 3; unsigned int partition_size = 0; ptrdiff_t bytes_left = fragment_end - fragment_start; + if (bytes_left < 0) { + vpx_internal_error( + &pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated packet or corrupt partition. No bytes left %d.", + (int)bytes_left); + } /* Calculate the length of this partition. The last partition * size is implicit. 
If the partition size can't be read, then * either use the remaining data in the buffer (for EC mode) @@ -750,6 +756,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, ptrdiff_t ext_first_part_size = token_part_sizes - pbi->fragments.ptrs[0] + 3 * (num_token_partitions - 1); + if (fragment_size < (unsigned int)ext_first_part_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); fragment_size -= (unsigned int)ext_first_part_size; if (fragment_size > 0) { pbi->fragments.sizes[0] = (unsigned int)ext_first_part_size; @@ -767,6 +776,9 @@ static void setup_token_decoder(VP8D_COMP *pbi, first_fragment_end, fragment_end, fragment_idx - 1, num_token_partitions); pbi->fragments.sizes[fragment_idx] = (unsigned int)partition_size; + if (fragment_size < (unsigned int)partition_size) + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, + "Corrupted fragment size %d", fragment_size); fragment_size -= (unsigned int)partition_size; assert(fragment_idx <= num_token_partitions); if (fragment_size > 0) { @@ -1208,7 +1220,11 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; - vp8mt_decode_mb_rows(pbi, xd); + if (vp8mt_decode_mb_rows(pbi, xd)) { + vp8_decoder_remove_threads(pbi); + pbi->restart_threads = 1; + vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, NULL); + } vp8_yv12_extend_frame_borders(yv12_fb_new); for (thread = 0; thread < pbi->decoding_thread_count; ++thread) { corrupt_tokens |= pbi->mb_row_di[thread].mbd.corrupted; diff --git a/libs/libvpx/vp8/decoder/decodemv.h b/libs/libvpx/vp8/decoder/decodemv.h index f33b07351d3..504e943d855 100644 --- a/libs/libvpx/vp8/decoder/decodemv.h +++ b/libs/libvpx/vp8/decoder/decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DECODEMV_H_ -#define VP8_DECODER_DECODEMV_H_ +#ifndef VPX_VP8_DECODER_DECODEMV_H_ +#define VPX_VP8_DECODER_DECODEMV_H_ #include "onyxd_int.h" @@ -23,4 +23,4 @@ void vp8_decode_mode_mvs(VP8D_COMP *); } // extern "C" #endif -#endif // VP8_DECODER_DECODEMV_H_ +#endif // VPX_VP8_DECODER_DECODEMV_H_ diff --git a/libs/libvpx/vp8/decoder/decoderthreading.h b/libs/libvpx/vp8/decoder/decoderthreading.h index c563cf6e93a..3d49bc83174 100644 --- a/libs/libvpx/vp8/decoder/decoderthreading.h +++ b/libs/libvpx/vp8/decoder/decoderthreading.h @@ -8,15 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_DECODER_DECODERTHREADING_H_ -#define VP8_DECODER_DECODERTHREADING_H_ +#ifndef VPX_VP8_DECODER_DECODERTHREADING_H_ +#define VPX_VP8_DECODER_DECODERTHREADING_H_ #ifdef __cplusplus extern "C" { #endif #if CONFIG_MULTITHREAD -void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd); void vp8_decoder_remove_threads(VP8D_COMP *pbi); void vp8_decoder_create_threads(VP8D_COMP *pbi); void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows); @@ -27,4 +27,4 @@ void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows); } // extern "C" #endif -#endif // VP8_DECODER_DECODERTHREADING_H_ +#endif // VPX_VP8_DECODER_DECODERTHREADING_H_ diff --git a/libs/libvpx/vp8/decoder/detokenize.h b/libs/libvpx/vp8/decoder/detokenize.h index f0b125444f0..410a431ba0f 100644 --- a/libs/libvpx/vp8/decoder/detokenize.h +++ b/libs/libvpx/vp8/decoder/detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_DETOKENIZE_H_ -#define VP8_DECODER_DETOKENIZE_H_ +#ifndef VPX_VP8_DECODER_DETOKENIZE_H_ +#define VPX_VP8_DECODER_DETOKENIZE_H_ #include "onyxd_int.h" @@ -24,4 +24,4 @@ int vp8_decode_mb_tokens(VP8D_COMP *, MACROBLOCKD *); } // extern "C" #endif -#endif // VP8_DECODER_DETOKENIZE_H_ +#endif // VPX_VP8_DECODER_DETOKENIZE_H_ diff --git a/libs/libvpx/vp8/decoder/ec_types.h b/libs/libvpx/vp8/decoder/ec_types.h index 0ab08b649ad..84feb269df0 100644 --- a/libs/libvpx/vp8/decoder/ec_types.h +++ b/libs/libvpx/vp8/decoder/ec_types.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_EC_TYPES_H_ -#define VP8_DECODER_EC_TYPES_H_ +#ifndef VPX_VP8_DECODER_EC_TYPES_H_ +#define VPX_VP8_DECODER_EC_TYPES_H_ #ifdef __cplusplus extern "C" { @@ -34,7 +34,9 @@ typedef struct { /* Structure used to hold all the overlaps of a macroblock. The overlaps of a * macroblock is further divided into block overlaps. */ -typedef struct { B_OVERLAP overlaps[16]; } MB_OVERLAP; +typedef struct { + B_OVERLAP overlaps[16]; +} MB_OVERLAP; /* Structure for keeping track of motion vectors and which reference frame they * refer to. Used for motion vector interpolation. 
@@ -48,4 +50,4 @@ typedef struct { } // extern "C" #endif -#endif // VP8_DECODER_EC_TYPES_H_ +#endif // VPX_VP8_DECODER_EC_TYPES_H_ diff --git a/libs/libvpx/vp8/decoder/error_concealment.c b/libs/libvpx/vp8/decoder/error_concealment.c index e22141492c8..85982e4de33 100644 --- a/libs/libvpx/vp8/decoder/error_concealment.c +++ b/libs/libvpx/vp8/decoder/error_concealment.c @@ -147,8 +147,8 @@ static void calculate_overlaps_mb(B_OVERLAP *b_overlaps, union b_mode_info *bmi, } } -void vp8_calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, - union b_mode_info *bmi, int b_row, int b_col) { +static void calculate_overlaps(MB_OVERLAP *overlap_ul, int mb_rows, int mb_cols, + union b_mode_info *bmi, int b_row, int b_col) { MB_OVERLAP *mb_overlap; int row, col, rel_row, rel_col; int new_row, new_col; @@ -280,9 +280,9 @@ static void calc_prev_mb_overlaps(MB_OVERLAP *overlaps, MODE_INFO *prev_mi, int sub_col; for (sub_row = 0; sub_row < 4; ++sub_row) { for (sub_col = 0; sub_col < 4; ++sub_col) { - vp8_calculate_overlaps(overlaps, mb_rows, mb_cols, - &(prev_mi->bmi[sub_row * 4 + sub_col]), - 4 * mb_row + sub_row, 4 * mb_col + sub_col); + calculate_overlaps(overlaps, mb_rows, mb_cols, + &(prev_mi->bmi[sub_row * 4 + sub_col]), + 4 * mb_row + sub_row, 4 * mb_col + sub_col); } } } diff --git a/libs/libvpx/vp8/decoder/error_concealment.h b/libs/libvpx/vp8/decoder/error_concealment.h index 89c78c14420..608a79f1891 100644 --- a/libs/libvpx/vp8/decoder/error_concealment.h +++ b/libs/libvpx/vp8/decoder/error_concealment.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_ERROR_CONCEALMENT_H_ -#define VP8_DECODER_ERROR_CONCEALMENT_H_ +#ifndef VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ +#define VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ #include "onyxd_int.h" #include "ec_types.h" @@ -38,4 +38,4 @@ void vp8_interpolate_motion(MACROBLOCKD *mb, int mb_row, int mb_col, } // extern "C" #endif -#endif // VP8_DECODER_ERROR_CONCEALMENT_H_ +#endif // VPX_VP8_DECODER_ERROR_CONCEALMENT_H_ diff --git a/libs/libvpx/vp8/decoder/onyxd_if.c b/libs/libvpx/vp8/decoder/onyxd_if.c index f516eb0c78b..c6fb51d0cb5 100644 --- a/libs/libvpx/vp8/decoder/onyxd_if.c +++ b/libs/libvpx/vp8/decoder/onyxd_if.c @@ -16,6 +16,7 @@ #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/alloccommon.h" +#include "vp8/common/common.h" #include "vp8/common/loopfilter.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" @@ -321,21 +322,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, pbi->dec_fb_ref[GOLDEN_FRAME] = &cm->yv12_fb[cm->gld_fb_idx]; pbi->dec_fb_ref[ALTREF_FRAME] = &cm->yv12_fb[cm->alt_fb_idx]; - if (setjmp(pbi->common.error.jmp)) { - /* We do not know if the missing frame(s) was supposed to update - * any of the reference buffers, but we act conservative and - * mark only the last buffer as corrupted. - */ - cm->yv12_fb[cm->lst_fb_idx].corrupted = 1; - - if (cm->fb_idx_ref_cnt[cm->new_fb_idx] > 0) { - cm->fb_idx_ref_cnt[cm->new_fb_idx]--; - } - goto decode_exit; - } - - pbi->common.error.setjmp = 1; - retcode = vp8_decode_frame(pbi); if (retcode < 0) { @@ -344,6 +330,12 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, } pbi->common.error.error_code = VPX_CODEC_ERROR; + // Propagate the error info. 
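The copy guarded just below is what makes a worker thread's failure visible at the public API: the error recorded in the per-thread error_info is mirrored into pbi->common.error, which the vpx_codec_* wrapper reports back to callers. A hedged caller-side sketch using the standard libvpx decoder API (buf and len are placeholders for one compressed frame):

    #include <stdio.h>
    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static void decode_one(const unsigned char *buf, size_t len) {
      vpx_codec_ctx_t ctx;
      if (vpx_codec_dec_init(&ctx, vpx_codec_vp8_dx(), NULL, 0) != VPX_CODEC_OK)
        return;
      if (vpx_codec_decode(&ctx, buf, (unsigned int)len, NULL, 0) !=
          VPX_CODEC_OK) {
        /* With the propagation below in place, the detail string carries the
         * worker thread's message, e.g. "Corrupted reference frame". */
        const char *detail = vpx_codec_error_detail(&ctx);
        fprintf(stderr, "decode failed: %s%s%s\n", vpx_codec_error(&ctx),
                detail ? ": " : "", detail ? detail : "");
      }
      vpx_codec_destroy(&ctx);
    }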
+ if (pbi->mb.error_info.error_code != 0) { + pbi->common.error.error_code = pbi->mb.error_info.error_code; + memcpy(pbi->common.error.detail, pbi->mb.error_info.detail, + sizeof(pbi->mb.error_info.detail)); + } goto decode_exit; } @@ -382,7 +374,6 @@ int vp8dx_receive_compressed_data(VP8D_COMP *pbi, size_t size, pbi->last_time_stamp = time_stamp; decode_exit: - pbi->common.error.setjmp = 0; vpx_clear_system_state(); return retcode; } @@ -445,7 +436,7 @@ int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf) { #if CONFIG_MULTITHREAD if (setjmp(fb->pbi[0]->common.error.jmp)) { vp8_remove_decoder_instances(fb); - memset(fb->pbi, 0, sizeof(fb->pbi)); + vp8_zero(fb->pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } @@ -471,6 +462,6 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb) { return VPX_CODEC_OK; } -int vp8dx_get_quantizer(const VP8D_COMP *cpi) { - return cpi->common.base_qindex; +int vp8dx_get_quantizer(const VP8D_COMP *pbi) { + return pbi->common.base_qindex; } diff --git a/libs/libvpx/vp8/decoder/onyxd_int.h b/libs/libvpx/vp8/decoder/onyxd_int.h index 5ecacdbb972..cf2c066d9be 100644 --- a/libs/libvpx/vp8/decoder/onyxd_int.h +++ b/libs/libvpx/vp8/decoder/onyxd_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_ONYXD_INT_H_ -#define VP8_DECODER_ONYXD_INT_H_ +#ifndef VPX_VP8_DECODER_ONYXD_INT_H_ +#define VPX_VP8_DECODER_ONYXD_INT_H_ #include "vpx_config.h" #include "vp8/common/onyxd.h" @@ -31,7 +31,9 @@ typedef struct { void *ptr2; } DECODETHREAD_DATA; -typedef struct { MACROBLOCKD mbd; } MB_ROW_DEC; +typedef struct { + MACROBLOCKD mbd; +} MB_ROW_DEC; typedef struct { int enabled; @@ -116,11 +118,17 @@ typedef struct VP8D_COMP { vpx_decrypt_cb decrypt_cb; void *decrypt_state; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when an error happens in multithreaded decoding and all threads + // are shut down.
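The restart_threads flag declared below is only half of the recovery path: the decodeframe.c hunk above tears the worker pool down with vp8_decoder_remove_threads() and sets the flag when vp8mt_decode_mb_rows() reports a corrupt frame. The consuming side is outside the hunks shown here, so the exact placement in this sketch is an assumption; the helper is the one declared in decoderthreading.h:

    /* Hypothetical sketch of the next-frame check: rebuild the thread pool
     * that was removed on error before decoding resumes. */
    #if CONFIG_MULTITHREAD
      if (pbi->restart_threads) {
        vp8_decoder_create_threads(pbi);
        pbi->restart_threads = 0;
      }
    #endif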
+ int restart_threads; +#endif } VP8D_COMP; void vp8cx_init_de_quantizer(VP8D_COMP *pbi); void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); -int vp8_decode_frame(VP8D_COMP *cpi); +int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); @@ -128,8 +136,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ @@ -137,8 +145,8 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) @@ -148,4 +156,4 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb); } // extern "C" #endif -#endif // VP8_DECODER_ONYXD_INT_H_ +#endif // VPX_VP8_DECODER_ONYXD_INT_H_ diff --git a/libs/libvpx/vp8/decoder/threading.c b/libs/libvpx/vp8/decoder/threading.c index d0213f75c12..561922de329 100644 --- a/libs/libvpx/vp8/decoder/threading.c +++ b/libs/libvpx/vp8/decoder/threading.c @@ -15,8 +15,8 @@ #endif #include "onyxd_int.h" #include "vpx_mem/vpx_mem.h" +#include "vp8/common/common.h" #include "vp8/common/threading.h" - #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" @@ -400,16 +400,32 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.u_buffer = dst_buffer[1] + recon_uvoffset; xd->dst.v_buffer = dst_buffer[2] + recon_uvoffset; - xd->pre.y_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][0] + recon_yoffset; - xd->pre.u_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][1] + recon_uvoffset; - xd->pre.v_buffer = - ref_buffer[xd->mode_info_context->mbmi.ref_frame][2] + recon_uvoffset; - /* propagate errors from reference frames */ xd->corrupted |= ref_fb_corrupted[xd->mode_info_context->mbmi.ref_frame]; + if (xd->corrupted) { + // Move current decoding macroblock to the end of row for all rows + // assigned to this thread, such that other threads won't be waiting. + for (; mb_row < pc->mb_rows; + mb_row += (pbi->decoding_thread_count + 1)) { + current_mb_col = &pbi->mt_current_mb_col[mb_row]; + vpx_atomic_store_release(current_mb_col, pc->mb_cols + nsync); + } + vpx_internal_error(&xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Corrupted reference frame"); + } + + if (xd->mode_info_context->mbmi.ref_frame >= LAST_FRAME) { + const MV_REFERENCE_FRAME ref = xd->mode_info_context->mbmi.ref_frame; + xd->pre.y_buffer = ref_buffer[ref][0] + recon_yoffset; + xd->pre.u_buffer = ref_buffer[ref][1] + recon_uvoffset; + xd->pre.v_buffer = ref_buffer[ref][2] + recon_uvoffset; + } else { + // ref_frame is INTRA_FRAME, pre buffer should not be used.
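The ref_frame >= LAST_FRAME test above leans on the ordering of vp8's reference-frame enum (as defined in vp8/common/blockd.h); only inter references have a valid ref_buffer[] entry:

    typedef enum {
      INTRA_FRAME = 0,
      LAST_FRAME = 1,
      GOLDEN_FRAME = 2,
      ALTREF_FRAME = 3,
      MAX_REF_FRAMES = 4
    } MV_REFERENCE_FRAME;

Nulling the pre pointers for intra macroblocks, as the assignments below do, turns any accidental inter-prediction read into an immediate fault instead of a silent read of stale reference pixels.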
+ xd->pre.y_buffer = 0; + xd->pre.u_buffer = 0; + xd->pre.v_buffer = 0; + } mt_decode_macroblock(pbi, xd, 0); xd->left_available = 1; @@ -557,8 +573,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count; } - /* signal end of frame decoding if this thread processed the last mb_row */ - if (last_mb_row == (pc->mb_rows - 1)) sem_post(&pbi->h_event_end_decoding); + /* signal end of decoding of current thread for current frame */ + if (last_mb_row + (int)pbi->decoding_thread_count + 1 >= pc->mb_rows) + sem_post(&pbi->h_event_end_decoding); } static THREAD_FUNCTION thread_decoding_proc(void *p_data) { @@ -576,7 +593,13 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { } else { MACROBLOCKD *xd = &mbrd->mbd; xd->left_context = &mb_row_left_context; - + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + // Signal the end of decoding for current thread. + sem_post(&pbi->h_event_end_decoding); + continue; + } + xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, ithread + 1); } } @@ -738,25 +761,28 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_yabove_row[i], - vpx_memalign( - 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (width + (VP8BORDERINPIXELS << 1)))); + vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); + } CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_uabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); + } CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); - for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR( - pbi->mt_vabove_row[i], - vpx_memalign(16, - sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); + for (i = 0; i < pc->mb_rows; ++i) { + CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + vpx_memalign(16, sizeof(unsigned char) * + (uv_width + VP8BORDERINPIXELS))); + vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); + } /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); @@ -812,7 +838,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { } } -void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { +int vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { VP8_COMMON *pc = &pbi->common; unsigned int i; int j; @@ -858,7 +884,22 @@ void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { sem_post(&pbi->h_event_start_decoding[i]); } + if (setjmp(xd->error_info.jmp)) { + xd->error_info.setjmp = 0; + xd->corrupted = 1; + // Wait for other threads to finish. This prevents other threads decoding + // the current frame while the main thread starts decoding the next frame, + // which causes a data race. 
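Two mechanisms make this wait safe. First, the release store of pc->mb_cols + nsync in the bailout above unblocks any worker still spinning on a neighbouring row's progress counter; the row-sync wait in mt_decode_mb_rows has, in simplified form, this shape (variable names abbreviated from the surrounding code):

    /* A thread decoding row r spins until the thread owning row r-1 is at
     * least nsync columns ahead. mb_col never exceeds pc->mb_cols, so a
     * stored value of pc->mb_cols + nsync releases every possible waiter. */
    while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) {
      x86_pause_hint();
      thread_sleep(0);
    }

Second, every participant (the main thread plus each worker) posts h_event_end_decoding exactly once per frame, whether it finishes its rows or bails out through its longjmp handler, so the error path below waits once per worker and the success path further down waits decoding_thread_count + 1 times without deadlocking.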
+ for (i = 0; i < pbi->decoding_thread_count; ++i) + sem_wait(&pbi->h_event_end_decoding); + return -1; + } + + xd->error_info.setjmp = 1; mt_decode_mb_rows(pbi, xd, 0); - sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + for (i = 0; i < pbi->decoding_thread_count + 1; ++i) + sem_wait(&pbi->h_event_end_decoding); /* add back for each frame */ + + return 0; } diff --git a/libs/libvpx/vp8/decoder/treereader.h b/libs/libvpx/vp8/decoder/treereader.h index dd0f0986e97..4bf938a741d 100644 --- a/libs/libvpx/vp8/decoder/treereader.h +++ b/libs/libvpx/vp8/decoder/treereader.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_DECODER_TREEREADER_H_ -#define VP8_DECODER_TREEREADER_H_ +#ifndef VPX_VP8_DECODER_TREEREADER_H_ +#define VPX_VP8_DECODER_TREEREADER_H_ #include "./vpx_config.h" #include "vp8/common/treecoder.h" @@ -30,7 +30,7 @@ typedef BOOL_DECODER vp8_reader; static INLINE int vp8_treed_read( vp8_reader *const r, /* !!! must return a 0 or 1 !!! */ vp8_tree t, const vp8_prob *const p) { - register vp8_tree_index i = 0; + vp8_tree_index i = 0; while ((i = t[i + vp8_read(r, p[i >> 1])]) > 0) { } @@ -42,4 +42,4 @@ static INLINE int vp8_treed_read( } // extern "C" #endif -#endif // VP8_DECODER_TREEREADER_H_ +#endif // VPX_VP8_DECODER_TREEREADER_H_ diff --git a/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c b/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c index c42005df6c8..6fc60805f65 100644 --- a/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/libs/libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -9,6 +9,8 @@ */ #include <arm_neon.h> + +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" static const uint16_t inv_zig_zag[16] = { 1, 2, 6, 7, 3, 5, 8, 13, @@ -26,9 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { zig_zag1 = vld1q_u16(inv_zig_zag + 8); int16x8_t x0, x1, sz0, sz1, y0, y1; uint16x8_t eob0, eob1; +#ifndef __aarch64__ uint16x4_t eob_d16; uint32x2_t eob_d32; uint32x4_t eob_q32; +#endif // __aarch64__ /* sign of z: z >> 15 */ sz0 = vshrq_n_s16(z0, 15); @@ -66,11 +70,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* select the largest value */ eob0 = vmaxq_u16(eob0, eob1); +#ifdef __aarch64__ + *d->eob = (int8_t)vmaxvq_u16(eob0); +#else eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); eob_q32 = vmovl_u16(eob_d16); eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32)); eob_d32 = vpmax_u32(eob_d32, eob_d32); + vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); +#endif // __aarch64__ + /* qcoeff = x */ vst1q_s16(d->qcoeff, x0); vst1q_s16(d->qcoeff + 8, x1); @@ -78,6 +88,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* dqcoeff = x * dequant */ vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0)); vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1)); - - vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); } diff --git a/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c b/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c index 76853e65243..99dff6b5209 100644 --- a/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c +++ b/libs/libvpx/vp8/encoder/arm/neon/shortfdct_neon.c @@ -10,6 +10,8 @@ #include <arm_neon.h> +#include "./vp8_rtcd.h" + void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, dEmptys16; diff --git a/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c b/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c
index 8d6ea4ccbe9..02056f2f900 100644 --- a/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c +++ b/libs/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c @@ -9,6 +9,8 @@ */ #include + +#include "./vp8_rtcd.h" #include "vpx_ports/arm.h" #ifdef VPX_INCOMPATIBLE_GCC diff --git a/libs/libvpx/vp8/encoder/bitstream.c b/libs/libvpx/vp8/encoder/bitstream.c index 8cacb645055..64bf0a79e93 100644 --- a/libs/libvpx/vp8/encoder/bitstream.c +++ b/libs/libvpx/vp8/encoder/bitstream.c @@ -41,13 +41,6 @@ const int vp8cx_base_skip_false_prob[128] = { unsigned __int64 Sectionbits[500]; #endif -#ifdef VP8_ENTROPY_STATS -int intra_mode_stats[10][10][10]; -static unsigned int tree_update_hist[BLOCK_TYPES][COEF_BANDS] - [PREV_COEF_CONTEXTS][ENTROPY_NODES][2]; -extern unsigned int active_section; -#endif - #ifdef MODE_STATS int count_mb_seg[4] = { 0, 0, 0, 0 }; #endif @@ -428,10 +421,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { vp8_convert_rfct_to_prob(cpi); -#ifdef VP8_ENTROPY_STATS - active_section = 1; -#endif - if (pc->mb_no_coeff_skip) { int total_mbs = pc->mb_rows * pc->mb_cols; @@ -472,10 +461,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; -#ifdef VP8_ENTROPY_STATS - active_section = 9; -#endif - if (cpi->mb.e_mbd.update_mb_segmentation_map) { write_mb_features(w, mi, &cpi->mb.e_mbd); } @@ -486,9 +471,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { if (rf == INTRA_FRAME) { vp8_write(w, 0, cpi->prob_intra_coded); -#ifdef VP8_ENTROPY_STATS - active_section = 6; -#endif write_ymode(w, mode, pc->fc.ymode_prob); if (mode == B_PRED) { @@ -522,28 +504,13 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { vp8_clamp_mv2(&best_mv, xd); vp8_mv_ref_probs(mv_ref_p, ct); - -#ifdef VP8_ENTROPY_STATS - accum_mv_refs(mode, ct); -#endif } -#ifdef VP8_ENTROPY_STATS - active_section = 3; -#endif - write_mv_ref(w, mode, mv_ref_p); switch (mode) /* new, split require MVs */ { - case NEWMV: - -#ifdef VP8_ENTROPY_STATS - active_section = 5; -#endif - - write_mv(w, &mi->mv.as_mv, &best_mv, mvc); - break; + case NEWMV: write_mv(w, &mi->mv.as_mv, &best_mv, mvc); break; case SPLITMV: { int j = 0; @@ -574,9 +541,6 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { write_sub_mv_ref(w, blockmode, vp8_sub_mv_ref_prob2[mv_contz]); if (blockmode == NEW4X4) { -#ifdef VP8_ENTROPY_STATS - active_section = 11; -#endif write_mv(w, &blockmv.as_mv, &best_mv, (const MV_CONTEXT *)mvc); } } while (++j < cpi->mb.partition_info->count); @@ -642,10 +606,6 @@ static void write_kfmodes(VP8_COMP *cpi) { const B_PREDICTION_MODE L = left_block_mode(m, i); const int bm = m->bmi[i].as_mode; -#ifdef VP8_ENTROPY_STATS - ++intra_mode_stats[A][L][bm]; -#endif - write_bmode(bc, bm, vp8_kf_bmode_prob[A][L]); } while (++i < 16); } @@ -973,10 +933,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { vp8_write(w, u, upd); #endif -#ifdef VP8_ENTROPY_STATS - ++tree_update_hist[i][j][k][t][u]; -#endif - if (u) { /* send/use new probability */ @@ -990,16 +946,6 @@ void vp8_update_coef_probs(VP8_COMP *cpi) { } while (++t < ENTROPY_NODES); -/* Accum token counts for generation of default statistics */ -#ifdef VP8_ENTROPY_STATS - t = 0; - - do { - context_counters[i][j][k][t] += cpi->coef_counts[i][j][k][t]; - } while (++t < MAX_ENTROPY_TOKENS); - -#endif - } while (++k < PREV_COEF_CONTEXTS); } while (++j < COEF_BANDS); } while (++i < BLOCK_TYPES); @@ -1097,12 +1043,18 @@ void vp8_pack_bitstream(VP8_COMP 
*cpi, unsigned char *dest, cx_data[1] = 0x01; cx_data[2] = 0x2a; + /* Pack scale and frame size into 16 bits. Store it 8 bits at a time. + * https://tools.ietf.org/html/rfc6386 + * 9.1. Uncompressed Data Chunk + * 16 bits : (2 bits Horizontal Scale << 14) | Width (14 bits) + * 16 bits : (2 bits Vertical Scale << 14) | Height (14 bits) + */ v = (pc->horiz_scale << 14) | pc->Width; - cx_data[3] = v; + cx_data[3] = v & 0xff; cx_data[4] = v >> 8; v = (pc->vert_scale << 14) | pc->Height; - cx_data[5] = v; + cx_data[5] = v & 0xff; cx_data[6] = v >> 8; extra_bytes_packed = 7; @@ -1286,15 +1238,6 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (pc->frame_type != KEY_FRAME) vp8_write_bit(bc, pc->refresh_last_frame); -#ifdef VP8_ENTROPY_STATS - - if (pc->frame_type == INTER_FRAME) - active_section = 0; - else - active_section = 7; - -#endif - vpx_clear_system_state(); #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -1308,25 +1251,13 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, vp8_update_coef_probs(cpi); #endif -#ifdef VP8_ENTROPY_STATS - active_section = 2; -#endif - /* Write out the mb_no_coeff_skip flag */ vp8_write_bit(bc, pc->mb_no_coeff_skip); if (pc->frame_type == KEY_FRAME) { write_kfmodes(cpi); - -#ifdef VP8_ENTROPY_STATS - active_section = 8; -#endif } else { pack_inter_mode_mvs(cpi); - -#ifdef VP8_ENTROPY_STATS - active_section = 1; -#endif } vp8_stop_encode(bc); @@ -1337,11 +1268,30 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, /* update frame tag */ { + /* Pack partition size, show frame, version and frame type into 24 bits. + * Store it 8 bits at a time. + * https://tools.ietf.org/html/rfc6386 + * 9.1. Uncompressed Data Chunk + * The uncompressed data chunk comprises a common (for key frames and + * interframes) 3-byte frame tag that contains four fields, as follows: + * + * 1. A 1-bit frame type (0 for key frames, 1 for interframes). + * + * 2. A 3-bit version number (0 - 3 are defined as four different + * profiles with different decoding complexity; other values may be + * defined for future variants of the VP8 data format). + * + * 3. A 1-bit show_frame flag (0 when current frame is not for display, + * 1 when current frame is for display). + * + * 4. A 19-bit field containing the size of the first data partition in + * bytes + */ int v = (oh.first_partition_length_in_bytes << 5) | (oh.show_frame << 4) | (oh.version << 1) | oh.type; - dest[0] = v; - dest[1] = v >> 8; + dest[0] = v & 0xff; + dest[1] = (v >> 8) & 0xff; dest[2] = v >> 16; } @@ -1431,50 +1381,3 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, } #endif } - -#ifdef VP8_ENTROPY_STATS -void print_tree_update_probs() { - int i, j, k, l; - FILE *f = fopen("context.c", "a"); - int Sum; - fprintf(f, "\n/* Update probabilities for token entropy tree. 
*/\n\n"); - fprintf(f, - "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] " - "[PREV_COEF_CONTEXTS] [ENTROPY_NODES] = {\n"); - - for (i = 0; i < BLOCK_TYPES; ++i) { - fprintf(f, " { \n"); - - for (j = 0; j < COEF_BANDS; ++j) { - fprintf(f, " {\n"); - - for (k = 0; k < PREV_COEF_CONTEXTS; ++k) { - fprintf(f, " {"); - - for (l = 0; l < ENTROPY_NODES; ++l) { - Sum = - tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1]; - - if (Sum > 0) { - if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0) - fprintf(f, "%3ld, ", - (tree_update_hist[i][j][k][l][0] * 255) / Sum); - else - fprintf(f, "%3ld, ", 1); - } else - fprintf(f, "%3ld, ", 128); - } - - fprintf(f, "},\n"); - } - - fprintf(f, " },\n"); - } - - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} -#endif diff --git a/libs/libvpx/vp8/encoder/bitstream.h b/libs/libvpx/vp8/encoder/bitstream.h index ed45bff9e20..ee3f3e4aab8 100644 --- a/libs/libvpx/vp8/encoder/bitstream.h +++ b/libs/libvpx/vp8/encoder/bitstream.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_BITSTREAM_H_ -#define VP8_ENCODER_BITSTREAM_H_ +#ifndef VPX_VP8_ENCODER_BITSTREAM_H_ +#define VPX_VP8_ENCODER_BITSTREAM_H_ #ifdef __cplusplus extern "C" { @@ -29,4 +29,4 @@ void vp8_update_coef_probs(struct VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_BITSTREAM_H_ +#endif // VPX_VP8_ENCODER_BITSTREAM_H_ diff --git a/libs/libvpx/vp8/encoder/block.h b/libs/libvpx/vp8/encoder/block.h index 492af0e41fd..1bc5ef75bc2 100644 --- a/libs/libvpx/vp8/encoder/block.h +++ b/libs/libvpx/vp8/encoder/block.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_BLOCK_H_ -#define VP8_ENCODER_BLOCK_H_ +#ifndef VPX_VP8_ENCODER_BLOCK_H_ +#define VPX_VP8_ENCODER_BLOCK_H_ #include "vp8/common/onyx.h" #include "vp8/common/blockd.h" @@ -165,4 +165,4 @@ typedef struct macroblock { } // extern "C" #endif -#endif // VP8_ENCODER_BLOCK_H_ +#endif // VPX_VP8_ENCODER_BLOCK_H_ diff --git a/libs/libvpx/vp8/encoder/boolhuff.c b/libs/libvpx/vp8/encoder/boolhuff.c index 04f8db93311..819c2f22a0f 100644 --- a/libs/libvpx/vp8/encoder/boolhuff.c +++ b/libs/libvpx/vp8/encoder/boolhuff.c @@ -15,10 +15,6 @@ unsigned __int64 Sectionbits[500]; #endif -#ifdef VP8_ENTROPY_STATS -unsigned int active_section = 0; -#endif - const unsigned int vp8_prob_cost[256] = { 2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046, 1023, 1000, 979, 959, 940, 922, 905, 889, 873, 858, @@ -42,26 +38,26 @@ const unsigned int vp8_prob_cost[256] = { 12, 10, 9, 7, 6, 4, 3, 1, 1 }; -void vp8_start_encode(BOOL_CODER *br, unsigned char *source, +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, unsigned char *source_end) { - br->lowvalue = 0; - br->range = 255; - br->count = -24; - br->buffer = source; - br->buffer_end = source_end; - br->pos = 0; + bc->lowvalue = 0; + bc->range = 255; + bc->count = -24; + bc->buffer = source; + bc->buffer_end = source_end; + bc->pos = 0; } -void vp8_stop_encode(BOOL_CODER *br) { +void vp8_stop_encode(BOOL_CODER *bc) { int i; - for (i = 0; i < 32; ++i) vp8_encode_bool(br, 0, 128); + for (i = 0; i < 32; ++i) vp8_encode_bool(bc, 0, 128); } -void vp8_encode_value(BOOL_CODER *br, int data, int bits) { +void vp8_encode_value(BOOL_CODER *bc, int data, int bits) { int bit; for (bit = bits - 1; bit >= 0; bit--) { - vp8_encode_bool(br, (1 & (data >> bit)), 0x80); + vp8_encode_bool(bc, (1 & (data >> bit)), 0x80); } } diff 
--git a/libs/libvpx/vp8/encoder/boolhuff.h b/libs/libvpx/vp8/encoder/boolhuff.h index d001eea9cd8..8ac0a2cc4a4 100644 --- a/libs/libvpx/vp8/encoder/boolhuff.h +++ b/libs/libvpx/vp8/encoder/boolhuff.h @@ -9,14 +9,14 @@ */ /**************************************************************************** -* -* Module Title : boolhuff.h -* -* Description : Bool Coder header file. -* -****************************************************************************/ -#ifndef VP8_ENCODER_BOOLHUFF_H_ -#define VP8_ENCODER_BOOLHUFF_H_ + * + * Module Title : boolhuff.h + * + * Description : Bool Coder header file. + * + ****************************************************************************/ +#ifndef VPX_VP8_ENCODER_BOOLHUFF_H_ +#define VPX_VP8_ENCODER_BOOLHUFF_H_ #include "vpx_ports/mem.h" #include "vpx/internal/vpx_codec_internal.h" @@ -35,11 +35,11 @@ typedef struct { struct vpx_internal_error_info *error; } BOOL_CODER; -extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, - unsigned char *buffer_end); +void vp8_start_encode(BOOL_CODER *bc, unsigned char *source, + unsigned char *source_end); -extern void vp8_encode_value(BOOL_CODER *br, int data, int bits); -extern void vp8_stop_encode(BOOL_CODER *bc); +void vp8_encode_value(BOOL_CODER *bc, int data, int bits); +void vp8_stop_encode(BOOL_CODER *bc); extern const unsigned int vp8_prob_cost[256]; DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); @@ -56,23 +56,12 @@ static int validate_buffer(const unsigned char *start, size_t len, return 0; } -static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { +static void vp8_encode_bool(BOOL_CODER *bc, int bit, int probability) { unsigned int split; - int count = br->count; - unsigned int range = br->range; - unsigned int lowvalue = br->lowvalue; - register int shift; - -#ifdef VP8_ENTROPY_STATS -#if defined(SECTIONBITS_OUTPUT) - - if (bit) - Sectionbits[active_section] += vp8_prob_cost[255 - probability]; - else - Sectionbits[active_section] += vp8_prob_cost[probability]; - -#endif -#endif + int count = bc->count; + unsigned int range = bc->range; + unsigned int lowvalue = bc->lowvalue; + int shift; split = 1 + (((range - 1) * probability) >> 8); @@ -80,7 +69,7 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { if (bit) { lowvalue += split; - range = br->range - split; + range = bc->range - split; } shift = vp8_norm[range]; @@ -92,18 +81,18 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { int offset = shift - count; if ((lowvalue << (offset - 1)) & 0x80000000) { - int x = br->pos - 1; + int x = bc->pos - 1; - while (x >= 0 && br->buffer[x] == 0xff) { - br->buffer[x] = (unsigned char)0; + while (x >= 0 && bc->buffer[x] == 0xff) { + bc->buffer[x] = (unsigned char)0; x--; } - br->buffer[x] += 1; + bc->buffer[x] += 1; } - validate_buffer(br->buffer + br->pos, 1, br->buffer_end, br->error); - br->buffer[br->pos++] = (lowvalue >> (24 - offset)); + validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error); + bc->buffer[bc->pos++] = (lowvalue >> (24 - offset)); lowvalue <<= offset; shift = count; @@ -112,13 +101,13 @@ static void vp8_encode_bool(BOOL_CODER *br, int bit, int probability) { } lowvalue <<= shift; - br->count = count; - br->lowvalue = lowvalue; - br->range = range; + bc->count = count; + bc->lowvalue = lowvalue; + bc->range = range; } #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_BOOLHUFF_H_ +#endif // VPX_VP8_ENCODER_BOOLHUFF_H_ diff --git a/libs/libvpx/vp8/common/copy_c.c 
b/libs/libvpx/vp8/encoder/copy_c.c similarity index 100% rename from libs/libvpx/vp8/common/copy_c.c rename to libs/libvpx/vp8/encoder/copy_c.c diff --git a/libs/libvpx/vp8/encoder/dct_value_cost.h b/libs/libvpx/vp8/encoder/dct_value_cost.h index 278dce73f40..0cd6cb4e65b 100644 --- a/libs/libvpx/vp8/encoder/dct_value_cost.h +++ b/libs/libvpx/vp8/encoder/dct_value_cost.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DCT_VALUE_COST_H_ -#define VP8_ENCODER_DCT_VALUE_COST_H_ +#ifndef VPX_VP8_ENCODER_DCT_VALUE_COST_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_COST_H_ #ifdef __cplusplus extern "C" { @@ -341,4 +341,4 @@ static const short dct_value_cost[2048 * 2] = { } // extern "C" #endif -#endif // VP8_ENCODER_DCT_VALUE_COST_H_ +#endif // VPX_VP8_ENCODER_DCT_VALUE_COST_H_ diff --git a/libs/libvpx/vp8/encoder/dct_value_tokens.h b/libs/libvpx/vp8/encoder/dct_value_tokens.h index 0597deab2d4..5cc4505f09a 100644 --- a/libs/libvpx/vp8/encoder/dct_value_tokens.h +++ b/libs/libvpx/vp8/encoder/dct_value_tokens.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DCT_VALUE_TOKENS_H_ -#define VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#ifndef VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#define VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ #ifdef __cplusplus extern "C" { @@ -845,4 +845,4 @@ static const TOKENVALUE dct_value_tokens[2048 * 2] = { } // extern "C" #endif -#endif // VP8_ENCODER_DCT_VALUE_TOKENS_H_ +#endif // VPX_VP8_ENCODER_DCT_VALUE_TOKENS_H_ diff --git a/libs/libvpx/vp8/encoder/defaultcoefcounts.h b/libs/libvpx/vp8/encoder/defaultcoefcounts.h index 2976325dc57..a3ab34c8a04 100644 --- a/libs/libvpx/vp8/encoder/defaultcoefcounts.h +++ b/libs/libvpx/vp8/encoder/defaultcoefcounts.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ -#define VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#ifndef VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#define VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ #ifdef __cplusplus extern "C" { @@ -232,4 +232,4 @@ static const unsigned int default_coef_counts } // extern "C" #endif -#endif // VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ +#endif // VPX_VP8_ENCODER_DEFAULTCOEFCOUNTS_H_ diff --git a/libs/libvpx/vp8/encoder/denoising.c b/libs/libvpx/vp8/encoder/denoising.c index eb963b97e36..e54d1e9f4bd 100644 --- a/libs/libvpx/vp8/encoder/denoising.c +++ b/libs/libvpx/vp8/encoder/denoising.c @@ -213,13 +213,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, return FILTER_BLOCK; } -int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, - int mc_avg_uv_stride, - unsigned char *running_avg_uv, int avg_uv_stride, +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, + unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { - unsigned char *running_avg_uv_start = running_avg_uv; + unsigned char *running_avg_start = running_avg; unsigned char *sig_start = sig; int sum_diff_thresh; int r, c; @@ -259,13 +258,13 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, int adjustment = 0; int absdiff = 0; - diff = mc_running_avg_uv[c] - sig[c]; + diff = mc_running_avg[c] - sig[c]; absdiff = abs(diff); // When |diff| <= |3 + shift_inc1|, use pixel value from // last denoised raw. 
if (absdiff <= 3 + shift_inc1) { - running_avg_uv[c] = mc_running_avg_uv[c]; + running_avg[c] = mc_running_avg[c]; sum_diff += diff; } else { if (absdiff >= 4 && absdiff <= 7) { @@ -277,16 +276,16 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } if (diff > 0) { if ((sig[c] + adjustment) > 255) { - running_avg_uv[c] = 255; + running_avg[c] = 255; } else { - running_avg_uv[c] = sig[c] + adjustment; + running_avg[c] = sig[c] + adjustment; } sum_diff += adjustment; } else { if ((sig[c] - adjustment) < 0) { - running_avg_uv[c] = 0; + running_avg[c] = 0; } else { - running_avg_uv[c] = sig[c] - adjustment; + running_avg[c] = sig[c] - adjustment; } sum_diff -= adjustment; } @@ -294,8 +293,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } /* Update pointers for next iteration. */ sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; @@ -314,27 +313,27 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // Only apply the adjustment for max delta up to 3. if (delta < 4) { sig -= sig_stride * 8; - mc_running_avg_uv -= mc_avg_uv_stride * 8; - running_avg_uv -= avg_uv_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; for (r = 0; r < 8; ++r) { for (c = 0; c < 8; ++c) { - int diff = mc_running_avg_uv[c] - sig[c]; + int diff = mc_running_avg[c] - sig[c]; int adjustment = abs(diff); if (adjustment > delta) adjustment = delta; if (diff > 0) { // Bring denoised signal down. - if (running_avg_uv[c] - adjustment < 0) { - running_avg_uv[c] = 0; + if (running_avg[c] - adjustment < 0) { + running_avg[c] = 0; } else { - running_avg_uv[c] = running_avg_uv[c] - adjustment; + running_avg[c] = running_avg[c] - adjustment; } sum_diff -= adjustment; } else if (diff < 0) { // Bring denoised signal up. - if (running_avg_uv[c] + adjustment > 255) { - running_avg_uv[c] = 255; + if (running_avg[c] + adjustment > 255) { + running_avg[c] = 255; } else { - running_avg_uv[c] = running_avg_uv[c] + adjustment; + running_avg[c] = running_avg[c] + adjustment; } sum_diff += adjustment; } @@ -342,8 +341,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // TODO(marpan): Check here if abs(sum_diff) has gone below the // threshold sum_diff_thresh, and if so, we can exit the row loop. sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK; } else { @@ -351,7 +350,7 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } } - vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, sig_stride); + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); return FILTER_BLOCK; } diff --git a/libs/libvpx/vp8/encoder/denoising.h b/libs/libvpx/vp8/encoder/denoising.h index 91d87b3a1cd..51ae3b0ab33 100644 --- a/libs/libvpx/vp8/encoder/denoising.h +++ b/libs/libvpx/vp8/encoder/denoising.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_DENOISING_H_ -#define VP8_ENCODER_DENOISING_H_ +#ifndef VPX_VP8_ENCODER_DENOISING_H_ +#define VPX_VP8_ENCODER_DENOISING_H_ #include "block.h" #include "vp8/common/loopfilter.h" @@ -100,4 +100,4 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MACROBLOCK *x, } // extern "C" #endif -#endif // VP8_ENCODER_DENOISING_H_ +#endif // VPX_VP8_ENCODER_DENOISING_H_ diff --git a/libs/libvpx/vp8/encoder/encodeframe.c b/libs/libvpx/vp8/encoder/encodeframe.c index 9bb0df72d52..2b3d9564ce2 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.c +++ b/libs/libvpx/vp8/encoder/encodeframe.c @@ -64,9 +64,9 @@ unsigned int b_modes[14] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; * Eventually this should be replaced by custom no-reference routines, * which will be faster. */ -static const unsigned char VP8_VAR_OFFS[16] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; +static const unsigned char VP8_VAR_OFFS[16] = { 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128 }; /* Original activity measure from Tim T's code. */ static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) { diff --git a/libs/libvpx/vp8/encoder/encodeframe.h b/libs/libvpx/vp8/encoder/encodeframe.h index 5274aba4120..cc8cf4d713d 100644 --- a/libs/libvpx/vp8/encoder/encodeframe.h +++ b/libs/libvpx/vp8/encoder/encodeframe.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEFRAME_H_ -#define VP8_ENCODER_ENCODEFRAME_H_ +#ifndef VPX_VP8_ENCODER_ENCODEFRAME_H_ +#define VPX_VP8_ENCODER_ENCODEFRAME_H_ #include "vp8/encoder/tokenize.h" @@ -37,4 +37,4 @@ int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEFRAME_H_ +#endif // VPX_VP8_ENCODER_ENCODEFRAME_H_ diff --git a/libs/libvpx/vp8/encoder/encodeintra.h b/libs/libvpx/vp8/encoder/encodeintra.h index 3956cf5fb1e..021dc5ed76b 100644 --- a/libs/libvpx/vp8/encoder/encodeintra.h +++ b/libs/libvpx/vp8/encoder/encodeintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEINTRA_H_ -#define VP8_ENCODER_ENCODEINTRA_H_ +#ifndef VPX_VP8_ENCODER_ENCODEINTRA_H_ +#define VPX_VP8_ENCODER_ENCODEINTRA_H_ #include "onyx_int.h" #ifdef __cplusplus @@ -25,4 +25,4 @@ void vp8_encode_intra4x4block(MACROBLOCK *x, int ib); } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEINTRA_H_ +#endif // VPX_VP8_ENCODER_ENCODEINTRA_H_ diff --git a/libs/libvpx/vp8/encoder/encodemb.h b/libs/libvpx/vp8/encoder/encodemb.h index b55ba3ac3f1..db577ddc108 100644 --- a/libs/libvpx/vp8/encoder/encodemb.h +++ b/libs/libvpx/vp8/encoder/encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_ENCODEMB_H_ -#define VP8_ENCODER_ENCODEMB_H_ +#ifndef VPX_VP8_ENCODER_ENCODEMB_H_ +#define VPX_VP8_ENCODER_ENCODEMB_H_ #include "onyx_int.h" @@ -37,4 +37,4 @@ void vp8_encode_inter16x16y(MACROBLOCK *x); } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEMB_H_ +#endif // VPX_VP8_ENCODER_ENCODEMB_H_ diff --git a/libs/libvpx/vp8/encoder/encodemv.c b/libs/libvpx/vp8/encoder/encodemv.c index ea93ccd7105..04adf105b9b 100644 --- a/libs/libvpx/vp8/encoder/encodemv.c +++ b/libs/libvpx/vp8/encoder/encodemv.c @@ -16,10 +16,6 @@ #include -#ifdef VP8_ENTROPY_STATS -extern unsigned int active_section; -#endif - static void encode_mvcomponent(vp8_writer *const w, const int v, const struct mv_context *mvc) { const vp8_prob *p = mvc->prob; @@ -309,9 +305,6 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { vp8_writer *const w = cpi->bc; MV_CONTEXT *mvc = cpi->common.fc.mvc; int flags[2] = { 0, 0 }; -#ifdef VP8_ENTROPY_STATS - active_section = 4; -#endif write_component_probs(w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->mb.MVcount[0], 0, &flags[0]); @@ -323,8 +316,4 @@ void vp8_write_mvprobs(VP8_COMP *cpi) { vp8_build_component_cost_table( cpi->mb.mvcost, (const MV_CONTEXT *)cpi->common.fc.mvc, flags); } - -#ifdef VP8_ENTROPY_STATS - active_section = 5; -#endif } diff --git a/libs/libvpx/vp8/encoder/encodemv.h b/libs/libvpx/vp8/encoder/encodemv.h index 87db30f3105..347b9feffea 100644 --- a/libs/libvpx/vp8/encoder/encodemv.h +++ b/libs/libvpx/vp8/encoder/encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ENCODEMV_H_ -#define VP8_ENCODER_ENCODEMV_H_ +#ifndef VPX_VP8_ENCODER_ENCODEMV_H_ +#define VPX_VP8_ENCODER_ENCODEMV_H_ #include "onyx_int.h" @@ -26,4 +26,4 @@ void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, } // extern "C" #endif -#endif // VP8_ENCODER_ENCODEMV_H_ +#endif // VPX_VP8_ENCODER_ENCODEMV_H_ diff --git a/libs/libvpx/vp8/encoder/ethreading.h b/libs/libvpx/vp8/encoder/ethreading.h index 95bf73d182c..598fe60559b 100644 --- a/libs/libvpx/vp8/encoder/ethreading.h +++ b/libs/libvpx/vp8/encoder/ethreading.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_ETHREADING_H_ -#define VP8_ENCODER_ETHREADING_H_ +#ifndef VPX_VP8_ENCODER_ETHREADING_H_ +#define VPX_VP8_ENCODER_ETHREADING_H_ #include "vp8/encoder/onyx_int.h" @@ -29,4 +29,4 @@ void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); } #endif -#endif // VP8_ENCODER_ETHREADING_H_ +#endif // VPX_VP8_ENCODER_ETHREADING_H_ diff --git a/libs/libvpx/vp8/encoder/firstpass.c b/libs/libvpx/vp8/encoder/firstpass.c index 70f92434103..4ea991e5240 100644 --- a/libs/libvpx/vp8/encoder/firstpass.c +++ b/libs/libvpx/vp8/encoder/firstpass.c @@ -989,11 +989,11 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, bits_per_mb_at_this_q = vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; - bits_per_mb_at_this_q = (int)(.5 + - err_correction_factor * speed_correction * - cpi->twopass.est_max_qcorrection_factor * - cpi->twopass.section_max_qfactor * - (double)bits_per_mb_at_this_q); + bits_per_mb_at_this_q = + (int)(.5 + err_correction_factor * speed_correction * + cpi->twopass.est_max_qcorrection_factor * + cpi->twopass.section_max_qfactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1086,9 +1086,8 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, vp8_bits_per_mb[INTER_FRAME][Q] + overhead_bits_per_mb; bits_per_mb_at_this_q = - (int)(.5 + - err_correction_factor * speed_correction * clip_iifactor * - (double)bits_per_mb_at_this_q); + (int)(.5 + err_correction_factor * speed_correction * clip_iifactor * + (double)bits_per_mb_at_this_q); /* Mode and motion overhead */ /* As Q rises in real encode loop rd code will force overhead down @@ -1273,9 +1272,8 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. 
*/ - vp8_new_framerate(cpi, - 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * @@ -1739,10 +1737,11 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Dont break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && - (!flash_detected) && ((mv_ratio_accumulator > 100.0) || - (abs_mv_in_out_accumulator > 3.0) || - (mv_in_out_accumulator < -2.0) || - ((boost_score - old_boost_score) < 2.0)))) { + (!flash_detected) && + ((mv_ratio_accumulator > 100.0) || + (abs_mv_in_out_accumulator > 3.0) || + (mv_in_out_accumulator < -2.0) || + ((boost_score - old_boost_score) < 2.0)))) { boost_score = old_boost_score; break; } @@ -1815,8 +1814,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (next_frame.pcnt_inter > 0.75) && ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) && - (cpi->gfu_boost > 100) && (cpi->twopass.gf_decay_rate <= - (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) + (cpi->gfu_boost > 100) && + (cpi->twopass.gf_decay_rate <= + (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))) #endif { int Boost; diff --git a/libs/libvpx/vp8/encoder/firstpass.h b/libs/libvpx/vp8/encoder/firstpass.h index ac8a7b1bfb6..f5490f1efff 100644 --- a/libs/libvpx/vp8/encoder/firstpass.h +++ b/libs/libvpx/vp8/encoder/firstpass.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_FIRSTPASS_H_ -#define VP8_ENCODER_FIRSTPASS_H_ +#ifndef VPX_VP8_ENCODER_FIRSTPASS_H_ +#define VPX_VP8_ENCODER_FIRSTPASS_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count); } // extern "C" #endif -#endif // VP8_ENCODER_FIRSTPASS_H_ +#endif // VPX_VP8_ENCODER_FIRSTPASS_H_ diff --git a/libs/libvpx/vp8/encoder/lookahead.h b/libs/libvpx/vp8/encoder/lookahead.h index a67f226946a..bf0401190b1 100644 --- a/libs/libvpx/vp8/encoder/lookahead.h +++ b/libs/libvpx/vp8/encoder/lookahead.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_LOOKAHEAD_H_ -#define VP8_ENCODER_LOOKAHEAD_H_ +#ifndef VPX_VP8_ENCODER_LOOKAHEAD_H_ +#define VPX_VP8_ENCODER_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -74,7 +74,7 @@ int vp8_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src, struct lookahead_entry *vp8_lookahead_pop(struct lookahead_ctx *ctx, int drain); #define PEEK_FORWARD 1 -#define PEEK_BACKWARD -1 +#define PEEK_BACKWARD (-1) /**\brief Get a future source buffer to encode * * \param[in] ctx Pointer to the lookahead context @@ -96,4 +96,4 @@ unsigned int vp8_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP8_ENCODER_LOOKAHEAD_H_ +#endif // VPX_VP8_ENCODER_LOOKAHEAD_H_ diff --git a/libs/libvpx/vp8/encoder/mcomp.c b/libs/libvpx/vp8/encoder/mcomp.c index 970120f3b26..999d6e851a6 100644 --- a/libs/libvpx/vp8/encoder/mcomp.c +++ b/libs/libvpx/vp8/encoder/mcomp.c @@ -21,11 +21,6 @@ #include "vp8/common/common.h" #include "vpx_dsp/vpx_dsp_common.h" -#ifdef VP8_ENTROPY_STATS -static int mv_ref_ct[31][4][2]; -static int mv_mode_cts[4][2]; -#endif - int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { /* MV costing is based on the distribution of vectors in the previous * frame and as such will tend to over state the cost of vectors. In @@ -34,19 +29,22 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { * NEAREST for subsequent blocks. The "Weight" parameter allows, to a * limited extent, for some account to be taken of these factors. */ - return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + - mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * - Weight) >> - 7; + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit) { /* Ignore mv costing if mvcost is NULL */ if (mvcost) { - return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + - mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * - error_per_bit + + const int mv_idx_row = + clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); + const int mv_idx_col = + clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); + return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit + 128) >> 8; } @@ -1131,6 +1129,7 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSE2 || HAVE_MSA int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int_mv *best_mv, int search_param, int sad_per_bit, int *num00, vp8_variance_fn_ptr_t *fn_ptr, @@ -1279,6 +1278,7 @@ int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSE2 || HAVE_MSA int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, @@ -1366,6 +1366,7 @@ int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSSE3 int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], 
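/*
 * vp8_mv_bit_cost() and mv_err_cost() above now run the halved row/column
 * deltas through clamp() before indexing the mv cost tables, bounding the
 * lookup even if a search hands back an out-of-range vector. A standalone
 * sketch of the pattern (clamp_int and safe_table_cost are hypothetical
 * stand-ins for libvpx's clamp() and the table lookups above):
 */
static int clamp_int(int v, int low, int high) {
  return v < low ? low : (v > high ? high : v);
}

/* table is assumed valid for indices 0..max_idx inclusive */
static int safe_table_cost(const int *table, int delta, int max_idx) {
  return table[clamp_int(delta >> 1, 0, max_idx)];
}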
@@ -1484,7 +1485,9 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSSE3 +#if HAVE_SSE4_1 int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], @@ -1630,6 +1633,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#endif // HAVE_SSE4_1 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, @@ -1709,6 +1713,7 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } +#if HAVE_SSE2 || HAVE_MSA int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int error_per_bit, int search_range, vp8_variance_fn_ptr_t *fn_ptr, @@ -1818,96 +1823,4 @@ int vp8_refining_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); } - -#ifdef VP8_ENTROPY_STATS -void print_mode_context(void) { - FILE *f = fopen("modecont.c", "w"); - int i, j; - - fprintf(f, "#include \"entropy.h\"\n"); - fprintf(f, "const int vp8_mode_contexts[6][4] =\n"); - fprintf(f, "{\n"); - - for (j = 0; j < 6; ++j) { - fprintf(f, " { /* %d */\n", j); - fprintf(f, " "); - - for (i = 0; i < 4; ++i) { - int overal_prob; - int this_prob; - int count; - - /* Overall probs */ - count = mv_mode_cts[i][0] + mv_mode_cts[i][1]; - - if (count) - overal_prob = 256 * mv_mode_cts[i][0] / count; - else - overal_prob = 128; - - if (overal_prob == 0) overal_prob = 1; - - /* context probs */ - count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1]; - - if (count) - this_prob = 256 * mv_ref_ct[j][i][0] / count; - else - this_prob = 128; - - if (this_prob == 0) this_prob = 1; - - fprintf(f, "%5d, ", this_prob); - } - - fprintf(f, " },\n"); - } - - fprintf(f, "};\n"); - fclose(f); -} - -/* MV ref count VP8_ENTROPY_STATS stats code */ -#ifdef VP8_ENTROPY_STATS -void init_mv_ref_counts() { - memset(mv_ref_ct, 0, sizeof(mv_ref_ct)); - memset(mv_mode_cts, 0, sizeof(mv_mode_cts)); -} - -void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4]) { - if (m == ZEROMV) { - ++mv_ref_ct[ct[0]][0][0]; - ++mv_mode_cts[0][0]; - } else { - ++mv_ref_ct[ct[0]][0][1]; - ++mv_mode_cts[0][1]; - - if (m == NEARESTMV) { - ++mv_ref_ct[ct[1]][1][0]; - ++mv_mode_cts[1][0]; - } else { - ++mv_ref_ct[ct[1]][1][1]; - ++mv_mode_cts[1][1]; - - if (m == NEARMV) { - ++mv_ref_ct[ct[2]][2][0]; - ++mv_mode_cts[2][0]; - } else { - ++mv_ref_ct[ct[2]][2][1]; - ++mv_mode_cts[2][1]; - - if (m == NEWMV) { - ++mv_ref_ct[ct[3]][3][0]; - ++mv_mode_cts[3][0]; - } else { - ++mv_ref_ct[ct[3]][3][1]; - ++mv_mode_cts[3][1]; - } - } - } - } -} - -#endif /* END MV ref count VP8_ENTROPY_STATS stats code */ - -#endif +#endif // HAVE_SSE2 || HAVE_MSA diff --git a/libs/libvpx/vp8/encoder/mcomp.h b/libs/libvpx/vp8/encoder/mcomp.h index b6228798ff3..6c77995da4d 100644 --- a/libs/libvpx/vp8/encoder/mcomp.h +++ b/libs/libvpx/vp8/encoder/mcomp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
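/*
 * The HAVE_SSE2/HAVE_SSSE3/HAVE_SSE4_1/HAVE_MSA guards added around the
 * specialized search functions compile each one only on targets whose
 * run-time dispatch tables can actually select it, rather than building
 * dead code everywhere. Sketch of the idiom (specialized_search is
 * hypothetical; the HAVE_* macros come from the generated vpx_config.h):
 */
#if HAVE_SSE2 || HAVE_MSA
/* referenced only by the rtcd dispatch tables on these targets */
int specialized_search(void) { return 0; }
#endif /* HAVE_SSE2 || HAVE_MSA */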
*/ -#ifndef VP8_ENCODER_MCOMP_H_ -#define VP8_ENCODER_MCOMP_H_ +#ifndef VPX_VP8_ENCODER_MCOMP_H_ +#define VPX_VP8_ENCODER_MCOMP_H_ #include "block.h" #include "vpx_dsp/variance.h" @@ -18,11 +18,6 @@ extern "C" { #endif -#ifdef VP8_ENTROPY_STATS -extern void init_mv_ref_counts(); -extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); -#endif - /* The maximum number of steps in a step search given the largest allowed * initial step */ @@ -34,15 +29,14 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); /* Maximum size of the first step in full pel units */ #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) -extern void print_mode_context(void); -extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); -extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); -extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); +int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); -extern int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int_mv *best_mv, int search_param, int error_per_bit, - const vp8_variance_fn_ptr_t *vf, int *mvsadcost[2], - int *mvcost[2], int_mv *center_mv); +int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], + int *mvcost[2], int_mv *center_mv); typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -51,10 +45,10 @@ typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int *mvcost[2], int *distortion, unsigned int *sse); -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; -extern fractional_mv_step_fp vp8_find_best_half_pixel_step; -extern fractional_mv_step_fp vp8_skip_fractional_mv_step; +fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; +fractional_mv_step_fp vp8_find_best_sub_pixel_step; +fractional_mv_step_fp vp8_find_best_half_pixel_step; +fractional_mv_step_fp vp8_skip_fractional_mv_step; typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, @@ -78,4 +72,4 @@ typedef int (*vp8_diamond_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } // extern "C" #endif -#endif // VP8_ENCODER_MCOMP_H_ +#endif // VPX_VP8_ENCODER_MCOMP_H_ diff --git a/libs/libvpx/vp8/encoder/modecosts.h b/libs/libvpx/vp8/encoder/modecosts.h index dfb8989f7f9..09ee2b5520b 100644 --- a/libs/libvpx/vp8/encoder/modecosts.h +++ b/libs/libvpx/vp8/encoder/modecosts.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
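/*
 * Dropping "extern" from the prototypes above is cosmetic: file-scope
 * function declarations in C have external linkage by default, so the two
 * forms below are equivalent (example_cost is a hypothetical name):
 */
extern int example_cost(int weight);
int example_cost(int weight); /* same linkage, same meaning */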
*/ -#ifndef VP8_ENCODER_MODECOSTS_H_ -#define VP8_ENCODER_MODECOSTS_H_ +#ifndef VPX_VP8_ENCODER_MODECOSTS_H_ +#define VPX_VP8_ENCODER_MODECOSTS_H_ #ifdef __cplusplus extern "C" { @@ -17,10 +17,10 @@ extern "C" { struct VP8_COMP; -void vp8_init_mode_costs(struct VP8_COMP *x); +void vp8_init_mode_costs(struct VP8_COMP *c); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_MODECOSTS_H_ +#endif // VPX_VP8_ENCODER_MODECOSTS_H_ diff --git a/libs/libvpx/vp8/encoder/mr_dissim.h b/libs/libvpx/vp8/encoder/mr_dissim.h index da36628afad..58f5a976230 100644 --- a/libs/libvpx/vp8/encoder/mr_dissim.h +++ b/libs/libvpx/vp8/encoder/mr_dissim.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_MR_DISSIM_H_ -#define VP8_ENCODER_MR_DISSIM_H_ +#ifndef VPX_VP8_ENCODER_MR_DISSIM_H_ +#define VPX_VP8_ENCODER_MR_DISSIM_H_ #include "vpx_config.h" #ifdef __cplusplus @@ -24,4 +24,4 @@ extern void vp8_store_drop_frame_info(VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_MR_DISSIM_H_ +#endif // VPX_VP8_ENCODER_MR_DISSIM_H_ diff --git a/libs/libvpx/vp8/encoder/onyx_if.c b/libs/libvpx/vp8/encoder/onyx_if.c index 22431824256..c0915941211 100644 --- a/libs/libvpx/vp8/encoder/onyx_if.c +++ b/libs/libvpx/vp8/encoder/onyx_if.c @@ -65,9 +65,7 @@ extern int vp8_update_coef_context(VP8_COMP *cpi); extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); -extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); -extern void print_tree_update_probs(); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); @@ -101,10 +99,6 @@ extern int skip_true_count; extern int skip_false_count; #endif -#ifdef VP8_ENTROPY_STATS -extern int intra_mode_stats[10][10][10]; -#endif - #ifdef SPEEDSTATS unsigned int frames_at_speed[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -224,6 +218,8 @@ static void save_layer_context(VP8_COMP *cpi) { lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; + lc->last_q[0] = cpi->last_q[0]; + lc->last_q[1] = cpi->last_q[1]; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, sizeof(cpi->mb.count_mb_ref_frame_usage)); @@ -261,6 +257,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; + cpi->last_q[0] = lc->last_q[0]; + cpi->last_q[1] = lc->last_q[1]; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, sizeof(cpi->mb.count_mb_ref_frame_usage)); @@ -689,8 +687,8 @@ static void set_default_lf_deltas(VP8_COMP *cpi) { /* Convenience macros for mapping speed and mode into a continuous * range */ -#define GOOD(x) (x + 1) -#define RT(x) (x + 7) +#define GOOD(x) ((x) + 1) +#define RT(x) ((x) + 7) static int speed_map(int speed, const int *map) { int res; @@ -743,9 +741,9 @@ static const int mode_check_freq_map_zn2[] = { 0, RT(10), 1 << 1, RT(11), 1 << 2, RT(12), 1 << 3, INT_MAX }; -static const int mode_check_freq_map_vhbpred[] = { - 0, GOOD(5), 2, RT(0), 0, RT(3), 2, RT(5), 4, INT_MAX -}; +static const int mode_check_freq_map_vhbpred[] = { 0, GOOD(5), 2, RT(0), + 0, RT(3), 2, RT(5), + 4, INT_MAX }; static const int mode_check_freq_map_near2[] = 
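/*
 * GOOD() and RT() now parenthesize their argument. Without the parentheses,
 * an argument containing a lower-precedence operator silently changes
 * meaning; '+' binds tighter than '<<', for example:
 */
#define GOOD_OLD(x) (x + 1)   /* GOOD_OLD(1 << 2) -> (1 << 2 + 1) == 8   */
#define GOOD_NEW(x) ((x) + 1) /* GOOD_NEW(1 << 2) -> ((1 << 2) + 1) == 5 */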
{ 0, GOOD(5), 2, RT(0), 0, RT(3), 2, @@ -761,13 +759,13 @@ static const int mode_check_freq_map_new2[] = { 0, GOOD(5), 4, RT(0), 1 << 3, RT(11), 1 << 4, RT(12), 1 << 5, INT_MAX }; -static const int mode_check_freq_map_split1[] = { - 0, GOOD(2), 2, GOOD(3), 7, RT(1), 2, RT(2), 7, INT_MAX -}; +static const int mode_check_freq_map_split1[] = { 0, GOOD(2), 2, GOOD(3), + 7, RT(1), 2, RT(2), + 7, INT_MAX }; -static const int mode_check_freq_map_split2[] = { - 0, GOOD(1), 2, GOOD(2), 4, GOOD(3), 15, RT(1), 4, RT(2), 15, INT_MAX -}; +static const int mode_check_freq_map_split2[] = { 0, GOOD(1), 2, GOOD(2), + 4, GOOD(3), 15, RT(1), + 4, RT(2), 15, INT_MAX }; void vp8_set_speed_features(VP8_COMP *cpi) { SPEED_FEATURES *sf = &cpi->sf; @@ -1534,6 +1532,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { } } + cpi->ext_refresh_frame_flags_pending = 0; + cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; @@ -1893,10 +1893,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); -#ifdef VP8_ENTROPY_STATS - init_context_counters(); -#endif - /*Initialize the feed-forward activity masking.*/ cpi->activity_avg = 90 << 12; @@ -2005,10 +2001,6 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->mb.rd_thresh_mult[i] = 128; } -#ifdef VP8_ENTROPY_STATS - init_mv_ref_counts(); -#endif - #if CONFIG_MULTITHREAD if (vp8cx_create_encoder_threads(cpi)) { vp8_remove_compressor(&cpi); @@ -2106,8 +2098,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { return cpi; } -void vp8_remove_compressor(VP8_COMP **ptr) { - VP8_COMP *cpi = *ptr; +void vp8_remove_compressor(VP8_COMP **comp) { + VP8_COMP *cpi = *comp; if (!cpi) return; @@ -2120,12 +2112,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #endif -#ifdef VP8_ENTROPY_STATS - print_context_counters(); - print_tree_update_probs(); - print_mode_context(); -#endif - #if CONFIG_INTERNAL_STATS if (cpi->pass != 1) { @@ -2252,40 +2238,6 @@ void vp8_remove_compressor(VP8_COMP **ptr) { } #endif -#ifdef VP8_ENTROPY_STATS - { - int i, j, k; - FILE *fmode = fopen("modecontext.c", "w"); - - fprintf(fmode, "\n#include \"entropymode.h\"\n\n"); - fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts "); - fprintf(fmode, - "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n"); - - for (i = 0; i < 10; ++i) { - fprintf(fmode, " { /* Above Mode : %d */\n", i); - - for (j = 0; j < 10; ++j) { - fprintf(fmode, " {"); - - for (k = 0; k < 10; ++k) { - if (!intra_mode_stats[i][j][k]) - fprintf(fmode, " %5d, ", 1); - else - fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]); - } - - fprintf(fmode, "}, /* left_mode %d */\n", j); - } - - fprintf(fmode, " },\n"); - } - - fprintf(fmode, "};\n"); - fclose(fmode); - } -#endif - #if defined(SECTIONBITS_OUTPUT) if (0) { @@ -2326,7 +2278,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { vp8_remove_common(&cpi->common); vpx_free(cpi); - *ptr = 0; + *comp = 0; #ifdef OUTPUT_YUV_SRC fclose(yuv_file); @@ -2464,6 +2416,7 @@ int vp8_update_reference(VP8_COMP *cpi, int ref_frame_flags) { if (ref_frame_flags & VP8_ALTR_FRAME) cpi->common.refresh_alt_ref_frame = 1; + cpi->ext_refresh_frame_flags_pending = 1; return 0; } @@ -2862,7 +2815,6 @@ void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) fclose(yframe); } #endif -/* return of 0 means drop frame */ #if !CONFIG_REALTIME_ONLY /* Function to test for conditions that indeicate we should 
loop @@ -3065,7 +3017,7 @@ static int measure_square_diff_partial(YV12_BUFFER_CONFIG *source, // Only return non-zero if we have at least ~1/16 samples for estimate. if (num_blocks > (tot_num_blocks >> 4)) { assert(num_blocks != 0); - return (Total / num_blocks); + return num_blocks ? (Total / num_blocks) : 0; } else { return 0; } @@ -3364,11 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; if (cpi->oxcf.mr_encoder_id) { - // TODO(marpan): This constraint shouldn't be needed, as we would like - // to allow for key frame setting (forced or periodic) defined per - // spatial layer. For now, keep this in. - cm->frame_type = low_res_frame_info->frame_type; - // Check if lower resolution is available for motion vector reuse. if (cm->frame_type != KEY_FRAME) { cpi->mr_low_res_mv_avail = 1; @@ -3393,7 +3340,16 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, == low_res_frame_info->low_res_ref_frames[ALTREF_FRAME]); */ } + // Disable motion vector reuse (i.e., disable any usage of the low_res) + // if the previous lower stream is skipped/disabled. + if (low_res_frame_info->skip_encoding_prev_stream) { + cpi->mr_low_res_mv_avail = 0; + } } + // This stream is not skipped (i.e., it's being encoded), so set this skip + // flag to 0. This is needed for the next stream (i.e., which is the next + // frame to be encoded). + low_res_frame_info->skip_encoding_prev_stream = 0; // On a key frame: For the lowest resolution, keep track of the key frame // counter value. For the higher resolutions, reset the current video @@ -3559,6 +3515,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cm->current_video_frame++; cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; @@ -3600,6 +3557,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif cm->current_video_frame++; cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; return; @@ -3799,7 +3757,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. - */ + */ if (cpi->cyclic_refresh_mode_enabled) { // Special case for screen_content_mode with golden frame updates. int disable_cr_gf = @@ -4001,6 +3959,9 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; + if (cm->frame_type != KEY_FRAME) + cpi->last_pred_err_mb = + (int)(cpi->mb.prediction_error / cpi->common.MBs); } cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi); @@ -4283,6 +4244,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->common.current_video_frame++; cpi->frames_since_key++; cpi->drop_frame_count++; + cpi->ext_refresh_frame_flags_pending = 0; // We advance the temporal pattern for dropped frames. cpi->temporal_pattern_counter++; return; @@ -4391,8 +4353,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* For inter frames the current default behavior is that when * cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer * This is purely an encoder decision at present. 
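/*
 * In measure_square_diff_partial() the assert() documents the invariant in
 * debug builds, while the new ternary keeps the division safe in release
 * builds, where NDEBUG compiles the assert away. The generic pattern:
 */
#include <assert.h>

static int safe_average(int total, int count) {
  assert(count != 0);               /* catches logic errors in debug */
  return count ? total / count : 0; /* never divides by zero in release */
}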
+ * Avoid this behavior when refresh flags are set by the user. */ - if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame) { + if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame && + !cpi->ext_refresh_frame_flags_pending) { cm->copy_buffer_to_arf = 2; } else { cm->copy_buffer_to_arf = 0; @@ -4699,6 +4663,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif + cpi->ext_refresh_frame_flags_pending = 0; + if (cm->refresh_golden_frame == 1) { cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN; } else { @@ -4782,8 +4748,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->temporal_pattern_counter++; } -/* reset to normal state now that we are done. */ - #if 0 { char filename[512]; @@ -4866,14 +4830,6 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, cm = &cpi->common; - if (setjmp(cpi->common.error.jmp)) { - cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - - cpi->common.error.setjmp = 1; - vpx_usec_timer_start(&cmptimer); cpi->source = NULL; @@ -4999,10 +4955,13 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, // be received for that high layer, which will yield an incorrect // frame rate (from time-stamp adjustment in above calculation). if (cpi->oxcf.mr_encoder_id) { - cpi->ref_framerate = low_res_frame_info->low_res_framerate; + if (!low_res_frame_info->skip_encoding_base_stream) + cpi->ref_framerate = low_res_frame_info->low_res_framerate; } else { // Keep track of frame rate for lowest resolution. low_res_frame_info->low_res_framerate = cpi->ref_framerate; + // The base stream is being encoded so set skip flag to 0. + low_res_frame_info->skip_encoding_base_stream = 0; } } #endif diff --git a/libs/libvpx/vp8/encoder/onyx_int.h b/libs/libvpx/vp8/encoder/onyx_int.h index c489b46c2d3..50a750da310 100644 --- a/libs/libvpx/vp8/encoder/onyx_int.h +++ b/libs/libvpx/vp8/encoder/onyx_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_ONYX_INT_H_ -#define VP8_ENCODER_ONYX_INT_H_ +#ifndef VPX_VP8_ENCODER_ONYX_INT_H_ +#define VPX_VP8_ENCODER_ONYX_INT_H_ #include #include "vpx_config.h" @@ -57,6 +57,9 @@ extern "C" { #define VP8_TEMPORAL_ALT_REF !CONFIG_REALTIME_ONLY +/* vp8 uses 10,000,000 ticks/second as time stamp */ +#define TICKS_PER_SEC 10000000 + typedef struct { int kf_indicated; unsigned int frames_since_key; @@ -257,6 +260,7 @@ typedef struct { int count_mb_ref_frame_usage[MAX_REF_FRAMES]; + int last_q[2]; } LAYER_CONTEXT; typedef struct VP8_COMP { @@ -510,6 +514,7 @@ typedef struct VP8_COMP { int force_maxqp; int frames_since_last_drop_overshoot; + int last_pred_err_mb; // GF update for 1 pass cbr. int gf_update_onepass_cbr; @@ -695,6 +700,8 @@ typedef struct VP8_COMP { // Use the static threshold from ROI settings. 
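/*
 * With ext_refresh_frame_flags_pending set (the application requested
 * reference refreshes explicitly), the encoder skips its default copy of
 * the old golden frame into the alt-ref buffer, so a user-forced golden
 * refresh no longer silently rewrites the ARF. A sketch of how an
 * application requests such a refresh through the public API (needs
 * vpx/vpx_encoder.h and vpx/vp8cx.h; codec, img, and pts are assumed to be
 * set up elsewhere):
 */
vpx_codec_err_t err = vpx_codec_encode(&codec, img, pts, 1 /* duration */,
                                       VP8_EFLAG_FORCE_GF, VPX_DL_REALTIME);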
int use_roi_static_threshold; + + int ext_refresh_frame_flags_pending; } VP8_COMP; void vp8_initialize_enc(void); @@ -714,8 +721,8 @@ void vp8_set_speed_features(VP8_COMP *cpi); #if CONFIG_DEBUG #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ @@ -723,8 +730,8 @@ void vp8_set_speed_features(VP8_COMP *cpi); #else #define CHECK_MEM_ERROR(lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) @@ -733,4 +740,4 @@ void vp8_set_speed_features(VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_ONYX_INT_H_ +#endif // VPX_VP8_ENCODER_ONYX_INT_H_ diff --git a/libs/libvpx/vp8/encoder/pickinter.c b/libs/libvpx/vp8/encoder/pickinter.c index a9943eb6ab9..dc72eed88ce 100644 --- a/libs/libvpx/vp8/encoder/pickinter.c +++ b/libs/libvpx/vp8/encoder/pickinter.c @@ -173,9 +173,8 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) { static int pick_intra4x4block(MACROBLOCK *x, int ib, B_PREDICTION_MODE *best_mode, - const int *mode_costs, - - int *bestrate, int *bestdistortion) { + const int *mode_costs, int *bestrate, + int *bestdistortion) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be = &x->block[ib]; int dst_stride = x->e_mbd.dst.y_stride; @@ -564,7 +563,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO best_mbmode; - int_mv best_ref_mv_sb[2]; + int_mv best_ref_mv_sb[2] = { { 0 }, { 0 } }; int_mv mode_mv_sb[2][MB_MODE_COUNT]; int_mv best_ref_mv; int_mv *mode_mv; @@ -602,7 +601,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* search range got from mv_pred(). It uses step_param levels. (0-7) */ int sr = 0; - unsigned char *plane[4][3]; + unsigned char *plane[4][3] = { { 0, 0 } }; int ref_frame_map[4]; int sign_bias = 0; int dot_artifact_candidate = 0; @@ -631,13 +630,16 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, } } #endif + assert(plane[LAST_FRAME][0] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_y, stride, plane[LAST_FRAME][0], mb_row, mb_col, 0); // If not found in Y channel, check UV channel. if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][1] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_u, stride_uv, plane[LAST_FRAME][1], mb_row, mb_col, 1); if (!dot_artifact_candidate) { + assert(plane[LAST_FRAME][2] != NULL); dot_artifact_candidate = check_dot_artifact_candidate( cpi, x, target_v, stride_uv, plane[LAST_FRAME][2], mb_row, mb_col, 2); @@ -741,10 +743,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; /* If the frame has big static background and current MB is in low - * motion area, its mode decision is biased to ZEROMV mode. - * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). - * At such speed settings, ZEROMV is already heavily favored. - */ + * motion area, its mode decision is biased to ZEROMV mode. + * No adjustment if cpu_used is <= -12 (i.e., cpi->Speed >= 12). + * At such speed settings, ZEROMV is already heavily favored. 
+ */ if (cpi->Speed < 12) { calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); } @@ -1068,10 +1070,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, cpi->mb.mvcost, 128); } + // fall through case NEARESTMV: case NEARMV: if (mode_mv[this_mode].as_int == 0) continue; + // fall through case ZEROMV: @@ -1301,9 +1305,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, update_mvcount(x, &best_ref_mv); } -void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { +void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16 = INT_MAX; - int rate, best_rate = 0, distortion, best_sse; + int rate_, best_rate = 0, distortion, best_sse; MB_PREDICTION_MODE mode, best_mode = DC_PRED; int this_rd; unsigned int sse; @@ -1321,23 +1325,23 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { xd->predictor, 16); distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor, 16, &sse); - rate = x->mbmode_cost[xd->frame_type][mode]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + rate_ = x->mbmode_cost[xd->frame_type][mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion); if (error16x16 > this_rd) { error16x16 = this_rd; best_mode = mode; best_sse = sse; - best_rate = rate; + best_rate = rate_; } } xd->mode_info_context->mbmi.mode = best_mode; - error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse); + error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse); if (error4x4 < error16x16) { xd->mode_info_context->mbmi.mode = B_PRED; - best_rate = rate; + best_rate = rate_; } - *rate_ = best_rate; + *rate = best_rate; } diff --git a/libs/libvpx/vp8/encoder/pickinter.h b/libs/libvpx/vp8/encoder/pickinter.h index bf1d0c97499..392fb41593f 100644 --- a/libs/libvpx/vp8/encoder/pickinter.h +++ b/libs/libvpx/vp8/encoder/pickinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_PICKINTER_H_ -#define VP8_ENCODER_PICKINTER_H_ +#ifndef VPX_VP8_ENCODER_PICKINTER_H_ +#define VPX_VP8_ENCODER_PICKINTER_H_ #include "vpx_config.h" #include "vp8/common/onyxc_int.h" @@ -30,4 +30,4 @@ extern int vp8_get_inter_mbpred_error(MACROBLOCK *mb, } // extern "C" #endif -#endif // VP8_ENCODER_PICKINTER_H_ +#endif // VPX_VP8_ENCODER_PICKINTER_H_ diff --git a/libs/libvpx/vp8/encoder/picklpf.h b/libs/libvpx/vp8/encoder/picklpf.h index e6ad0dbf265..03597e5427a 100644 --- a/libs/libvpx/vp8/encoder/picklpf.h +++ b/libs/libvpx/vp8/encoder/picklpf.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_PICKLPF_H_ -#define VP8_ENCODER_PICKLPF_H_ +#ifndef VPX_VP8_ENCODER_PICKLPF_H_ +#define VPX_VP8_ENCODER_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -27,4 +27,4 @@ void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, VP8_COMP *cpi); } #endif -#endif // VP8_ENCODER_PICKLPF_H_ +#endif // VPX_VP8_ENCODER_PICKLPF_H_ diff --git a/libs/libvpx/vp8/encoder/quantize.h b/libs/libvpx/vp8/encoder/quantize.h index 267150f99fb..78746c0c20b 100644 --- a/libs/libvpx/vp8/encoder/quantize.h +++ b/libs/libvpx/vp8/encoder/quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
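/*
 * The "// fall through" comments added to the mode switches mark the
 * intentional case fall-throughs, which GCC's -Wimplicit-fallthrough
 * recognizes (Clang needs an attribute instead). Minimal sketch with
 * hypothetical mode values:
 */
static int rate_for_mode(int mode, int mv_cost, int mode_cost) {
  int rate = 0;
  switch (mode) {
    case 0: /* NEWMV-like: pays the motion vector cost first */
      rate += mv_cost;
      // fall through
    case 1: /* NEARESTMV-like: shares the remaining cost logic */
      rate += mode_cost;
      break;
    default: break;
  }
  return rate;
}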
*/ -#ifndef VP8_ENCODER_QUANTIZE_H_ -#define VP8_ENCODER_QUANTIZE_H_ +#ifndef VPX_VP8_ENCODER_QUANTIZE_H_ +#define VPX_VP8_ENCODER_QUANTIZE_H_ #ifdef __cplusplus extern "C" { @@ -31,4 +31,4 @@ extern void vp8cx_init_quantizer(struct VP8_COMP *cpi); } // extern "C" #endif -#endif // VP8_ENCODER_QUANTIZE_H_ +#endif // VPX_VP8_ENCODER_QUANTIZE_H_ diff --git a/libs/libvpx/vp8/encoder/ratectrl.c b/libs/libvpx/vp8/encoder/ratectrl.c index e58c3109802..dbd76edad05 100644 --- a/libs/libvpx/vp8/encoder/ratectrl.c +++ b/libs/libvpx/vp8/encoder/ratectrl.c @@ -996,7 +996,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { * bits on this frame even if it is a contructed arf. * The active maximum quantizer insures that an appropriate * number of bits will be spent if needed for contstructed ARFs. - */ + */ cpi->this_frame_target = 0; } @@ -1052,9 +1052,8 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { * overflow when values are large */ projected_size_based_on_q = - (int)(((.5 + - rate_correction_factor * - vp8_bits_per_mb[cpi->common.frame_type][Q]) * + (int)(((.5 + rate_correction_factor * + vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS)); @@ -1126,6 +1125,14 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { } } +static int limit_q_cbr_inter(int last_q, int current_q) { + int limit_down = 12; + if (last_q - current_q > limit_down) + return (last_q - limit_down); + else + return current_q; +} + int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { int Q = cpi->active_worst_quality; @@ -1265,6 +1272,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { } } + // Limit decrease in Q for 1 pass CBR screen content mode. + if (cpi->common.frame_type != KEY_FRAME && cpi->pass == 0 && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.screen_content_mode) + Q = limit_q_cbr_inter(cpi->last_q[1], Q); + return Q; } @@ -1465,7 +1478,7 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { (cpi->oxcf.screen_content_mode == 2 || (cpi->drop_frames_allowed && (force_drop_overshoot || - (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) && + (cpi->rate_correction_factor < (8.0f * MIN_BPB_FACTOR) && cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require @@ -1485,7 +1498,8 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) thresh_rate = thresh_rate >> 3; if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) || + pred_err_mb > thresh_pred_err_mb && + pred_err_mb > 2 * cpi->last_pred_err_mb) || force_drop_overshoot) { unsigned int i; double new_correction_factor; diff --git a/libs/libvpx/vp8/encoder/ratectrl.h b/libs/libvpx/vp8/encoder/ratectrl.h index 249de4e706c..844c72cb861 100644 --- a/libs/libvpx/vp8/encoder/ratectrl.h +++ b/libs/libvpx/vp8/encoder/ratectrl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
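/*
 * For 1-pass CBR screen content, limit_q_cbr_inter() above caps how far Q
 * may drop between consecutive inter frames (at most 12 steps below the
 * previous frame's Q), damping quality/rate oscillation. Worked usage:
 */
static void limit_q_example(void) {
  int q1 = limit_q_cbr_inter(40, 20); /* 40 - 20 > 12   -> returns 28 */
  int q2 = limit_q_cbr_inter(40, 35); /* drop of 5 <= 12 -> returns 35 */
  (void)q1;
  (void)q2;
}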
*/ -#ifndef VP8_ENCODER_RATECTRL_H_ -#define VP8_ENCODER_RATECTRL_H_ +#ifndef VPX_VP8_ENCODER_RATECTRL_H_ +#define VPX_VP8_ENCODER_RATECTRL_H_ #include "onyx_int.h" @@ -37,4 +37,4 @@ extern int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q); } // extern "C" #endif -#endif // VP8_ENCODER_RATECTRL_H_ +#endif // VPX_VP8_ENCODER_RATECTRL_H_ diff --git a/libs/libvpx/vp8/encoder/rdopt.c b/libs/libvpx/vp8/encoder/rdopt.c index e210b441055..79a858e4370 100644 --- a/libs/libvpx/vp8/encoder/rdopt.c +++ b/libs/libvpx/vp8/encoder/rdopt.c @@ -770,9 +770,9 @@ static void rd_pick_intra_mbuv_mode(MACROBLOCK *x, int *rate, vp8_quantize_mbuv(x); rate_to = rd_cost_mbuv(x); - this_rate = rate_to + - x->intra_uv_mode_cost[xd->frame_type] - [xd->mode_info_context->mbmi.uv_mode]; + this_rate = + rate_to + x->intra_uv_mode_cost[xd->frame_type] + [xd->mode_info_context->mbmi.uv_mode]; this_distortion = vp8_mbuverror(x) / 4; @@ -989,7 +989,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, br += rate; for (i = 0; i < label_count; ++i) { - int_mv mode_mv[B_MODE_COUNT]; + int_mv mode_mv[B_MODE_COUNT] = { { 0 }, { 0 } }; int best_label_rd = INT_MAX; B_PREDICTION_MODE mode_selected = ZERO4X4; int bestlabelyrate = 0; @@ -1767,7 +1767,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* search range got from mv_pred(). It uses step_param levels. (0-7) */ int sr = 0; - unsigned char *plane[4][3]; + unsigned char *plane[4][3] = { { 0, 0 } }; int ref_frame_map[4]; int sign_bias = 0; @@ -1779,6 +1779,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, best_rd_sse = UINT_MAX; #endif + // _uv variables are not set consistantly before calling update_best_mode. + rd.rate_uv = 0; + rd.distortion_uv = 0; + mode_mv = mode_mv_sb[sign_bias]; best_ref_mv.as_int = 0; best_mode.rd = INT_MAX; @@ -1846,6 +1850,9 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, /* everything but intra */ if (x->e_mbd.mode_info_context->mbmi.ref_frame) { + assert(plane[this_ref_frame][0] != NULL && + plane[this_ref_frame][1] != NULL && + plane[this_ref_frame][2] != NULL); x->e_mbd.pre.y_buffer = plane[this_ref_frame][0]; x->e_mbd.pre.u_buffer = plane[this_ref_frame][1]; x->e_mbd.pre.v_buffer = plane[this_ref_frame][2]; @@ -1940,6 +1947,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; if (tmp_rd < best_mode.yrd) { + assert(uv_intra_done); rd.rate2 += uv_intra_rate; rd.rate_uv = uv_intra_rate_tokenonly; rd.distortion2 += uv_intra_distortion; @@ -2000,6 +2008,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; rd.rate2 += x->mbmode_cost[x->e_mbd.frame_type] [x->e_mbd.mode_info_context->mbmi.mode]; + assert(uv_intra_done); rd.rate2 += uv_intra_rate; rd.rate_uv = uv_intra_rate_tokenonly; rd.distortion2 += uv_intra_distortion; @@ -2131,6 +2140,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96); } + // fall through case NEARESTMV: case NEARMV: @@ -2147,6 +2157,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, (mode_mv[this_mode].as_int == 0)) { continue; } + // fall through case ZEROMV: @@ -2352,11 +2363,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd_update_mvcount(x, &best_ref_mv); } -void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) 
{ +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16; int rate4x4, rate16x16 = 0, rateuv; int dist4x4, dist16x16, distuv; - int rate; + int rate_; int rate4x4_tokenonly = 0; int rate16x16_tokenonly = 0; int rateuv_tokenonly = 0; @@ -2364,7 +2375,7 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv); - rate = rateuv; + rate_ = rateuv; error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly, &dist16x16); @@ -2374,10 +2385,10 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { if (error4x4 < error16x16) { x->e_mbd.mode_info_context->mbmi.mode = B_PRED; - rate += rate4x4; + rate_ += rate4x4; } else { - rate += rate16x16; + rate_ += rate16x16; } - *rate_ = rate; + *rate = rate_; } diff --git a/libs/libvpx/vp8/encoder/rdopt.h b/libs/libvpx/vp8/encoder/rdopt.h index 960bd8f1cdf..cc3db8197c7 100644 --- a/libs/libvpx/vp8/encoder/rdopt.h +++ b/libs/libvpx/vp8/encoder/rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_RDOPT_H_ -#define VP8_ENCODER_RDOPT_H_ +#ifndef VPX_VP8_ENCODER_RDOPT_H_ +#define VPX_VP8_ENCODER_RDOPT_H_ #include "./vpx_config.h" @@ -63,12 +63,12 @@ static INLINE void insertsortsad(int arr[], int idx[], int len) { } } -extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); -extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, - int *returnrate, int *returndistortion, - int *returnintra, int mb_row, int mb_col); -extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); +void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col); +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb, unsigned char *plane[3], @@ -110,9 +110,9 @@ static INLINE void get_reference_search_order(const VP8_COMP *cpi, for (; i < 4; ++i) ref_frame_map[i] = -1; } -extern void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, - int_mv *mvp, int refframe, int *ref_frame_sign_bias, - int *sr, int near_sadidx[]); +void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, + int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr, + int near_sadidx[]); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); int VP8_UVSSE(MACROBLOCK *x); @@ -123,4 +123,4 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv); } // extern "C" #endif -#endif // VP8_ENCODER_RDOPT_H_ +#endif // VPX_VP8_ENCODER_RDOPT_H_ diff --git a/libs/libvpx/vp8/encoder/segmentation.h b/libs/libvpx/vp8/encoder/segmentation.h index 1395a341185..4ddbdbbd265 100644 --- a/libs/libvpx/vp8/encoder/segmentation.h +++ b/libs/libvpx/vp8/encoder/segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP8_ENCODER_SEGMENTATION_H_ -#define VP8_ENCODER_SEGMENTATION_H_ +#ifndef VPX_VP8_ENCODER_SEGMENTATION_H_ +#define VPX_VP8_ENCODER_SEGMENTATION_H_ #include "string.h" #include "vp8/common/blockd.h" @@ -26,4 +26,4 @@ extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, } // extern "C" #endif -#endif // VP8_ENCODER_SEGMENTATION_H_ +#endif // VPX_VP8_ENCODER_SEGMENTATION_H_ diff --git a/libs/libvpx/vp8/encoder/temporal_filter.c b/libs/libvpx/vp8/encoder/temporal_filter.c index 0a7d25fb0a7..76f99a17d7f 100644 --- a/libs/libvpx/vp8/encoder/temporal_filter.c +++ b/libs/libvpx/vp8/encoder/temporal_filter.c @@ -159,6 +159,7 @@ static int vp8_temporal_filter_find_matching_mb_c(VP8_COMP *cpi, bestsme = vp8_hex_search(x, b, d, &best_ref_mv1_full, &d->bmi.mv, step_param, sadpb, &cpi->fn_ptr[BLOCK_16X16], NULL, NULL, &best_ref_mv1); + (void)bestsme; // Ignore unused return value. #if ALT_REF_SUBPEL_ENABLED /* Try sub-pixel MC? */ diff --git a/libs/libvpx/vp8/encoder/temporal_filter.h b/libs/libvpx/vp8/encoder/temporal_filter.h index 865d909fb65..fd39f5cb873 100644 --- a/libs/libvpx/vp8/encoder/temporal_filter.h +++ b/libs/libvpx/vp8/encoder/temporal_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_ -#define VP8_ENCODER_TEMPORAL_FILTER_H_ +#ifndef VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { @@ -23,4 +23,4 @@ void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); } #endif -#endif // VP8_ENCODER_TEMPORAL_FILTER_H_ +#endif // VPX_VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/libs/libvpx/vp8/encoder/tokenize.c b/libs/libvpx/vp8/encoder/tokenize.c index ca5f0e3d892..c3d70266074 100644 --- a/libs/libvpx/vp8/encoder/tokenize.c +++ b/libs/libvpx/vp8/encoder/tokenize.c @@ -19,10 +19,6 @@ /* Global event counters used for accumulating statistics across several compressions, then generating context.c = initial stats. */ -#ifdef VP8_ENTROPY_STATS -_int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -#endif void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); void vp8_fix_contexts(MACROBLOCKD *x); @@ -383,72 +379,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) { tokenize1st_order_b(x, t, plane_type, cpi); } -#ifdef VP8_ENTROPY_STATS - -void init_context_counters(void) { - memset(context_counters, 0, sizeof(context_counters)); -} - -void print_context_counters() { - int type, band, pt, t; - - FILE *const f = fopen("context.c", "w"); - - fprintf(f, "#include \"entropy.h\"\n"); - - fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n"); - - fprintf(f, - "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] " - "[MAX_ENTROPY_TOKENS];\n\n"); - - fprintf(f, - "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] " - "[PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = {"); - -#define Comma(X) (X ? 
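/*
 * The "(void)bestsme" cast above keeps the full-pel search result available
 * for the ALT_REF_SUBPEL_ENABLED refinement while silencing unused-variable
 * warnings when that path is compiled out. The idiom, with a hypothetical
 * helper:
 */
static int search_stub(void) { return 0; }

static void idiom_example(void) {
  int bestsme = search_stub();
  (void)bestsme; /* consumed only under optional #if branches */
}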
"," : "") - - type = 0; - - do { - fprintf(f, "%s\n { /* block Type %d */", Comma(type), type); - - band = 0; - - do { - fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band); - - pt = 0; - - do { - fprintf(f, "%s\n {", Comma(pt)); - - t = 0; - - do { - const _int64 x = context_counters[type][band][pt][t]; - const int y = (int)x; - - assert(x == (_int64)y); /* no overflow handling yet */ - fprintf(f, "%s %d", Comma(t), y); - - } while (++t < MAX_ENTROPY_TOKENS); - - fprintf(f, "}"); - } while (++pt < PREV_COEF_CONTEXTS); - - fprintf(f, "\n }"); - - } while (++band < COEF_BANDS); - - fprintf(f, "\n }"); - } while (++type < BLOCK_TYPES); - - fprintf(f, "\n};\n"); - fclose(f); -} -#endif - static void stuff2nd_order_b(TOKENEXTRA **tp, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, VP8_COMP *cpi, MACROBLOCK *x) { int pt; /* near block/prev token context index */ diff --git a/libs/libvpx/vp8/encoder/tokenize.h b/libs/libvpx/vp8/encoder/tokenize.h index e5dbdfc5af4..47b5be17f1f 100644 --- a/libs/libvpx/vp8/encoder/tokenize.h +++ b/libs/libvpx/vp8/encoder/tokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TOKENIZE_H_ -#define VP8_ENCODER_TOKENIZE_H_ +#ifndef VPX_VP8_ENCODER_TOKENIZE_H_ +#define VPX_VP8_ENCODER_TOKENIZE_H_ #include "vp8/common/entropy.h" #include "block.h" @@ -34,14 +34,6 @@ typedef struct { int rd_cost_mby(MACROBLOCKD *); -#ifdef VP8_ENTROPY_STATS -void init_context_counters(); -void print_context_counters(); - -extern _int64 context_counters[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] - [MAX_ENTROPY_TOKENS]; -#endif - extern const short *const vp8_dct_value_cost_ptr; /* TODO: The Token field should be broken out into a separate char array to * improve cache locality, since it's needed for costing when the rest of the @@ -53,4 +45,4 @@ extern const TOKENVALUE *const vp8_dct_value_tokens_ptr; } // extern "C" #endif -#endif // VP8_ENCODER_TOKENIZE_H_ +#endif // VPX_VP8_ENCODER_TOKENIZE_H_ diff --git a/libs/libvpx/vp8/encoder/treewriter.h b/libs/libvpx/vp8/encoder/treewriter.h index dadbbe3f80d..c02683a58b7 100644 --- a/libs/libvpx/vp8/encoder/treewriter.h +++ b/libs/libvpx/vp8/encoder/treewriter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP8_ENCODER_TREEWRITER_H_ -#define VP8_ENCODER_TREEWRITER_H_ +#ifndef VPX_VP8_ENCODER_TREEWRITER_H_ +#define VPX_VP8_ENCODER_TREEWRITER_H_ /* Trees map alphabets into huffman-like codes suitable for an arithmetic bit coder. Timothy S Murphy 11 October 2004 */ @@ -56,8 +56,7 @@ static INLINE unsigned int vp8_cost_branch(const unsigned int ct[2], static void vp8_treed_write(vp8_writer *const w, vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ vp8_tree_index i = 0; do { @@ -73,8 +72,7 @@ static INLINE void vp8_write_token(vp8_writer *const w, vp8_tree t, } static int vp8_treed_cost(vp8_tree t, const vp8_prob *const p, int v, - int n /* number of bits in v, assumed nonzero */ - ) { + int n) { /* number of bits in v, assumed nonzero */ int c = 0; vp8_tree_index i = 0; @@ -93,12 +91,12 @@ static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p, /* Fill array of costs for all possible token values. 
*/ -void vp8_cost_tokens(int *Costs, const vp8_prob *, vp8_tree); +void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree); -void vp8_cost_tokens2(int *Costs, const vp8_prob *, vp8_tree, int); +void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int); #ifdef __cplusplus } // extern "C" #endif -#endif // VP8_ENCODER_TREEWRITER_H_ +#endif // VPX_VP8_ENCODER_TREEWRITER_H_ diff --git a/libs/libvpx/vp8/encoder/x86/encodeopt.asm b/libs/libvpx/vp8/encoder/x86/block_error_sse2.asm similarity index 100% rename from libs/libvpx/vp8/encoder/x86/encodeopt.asm rename to libs/libvpx/vp8/encoder/x86/block_error_sse2.asm diff --git a/libs/libvpx/vp8/common/x86/copy_sse2.asm b/libs/libvpx/vp8/encoder/x86/copy_sse2.asm similarity index 100% rename from libs/libvpx/vp8/common/x86/copy_sse2.asm rename to libs/libvpx/vp8/encoder/x86/copy_sse2.asm diff --git a/libs/libvpx/vp8/common/x86/copy_sse3.asm b/libs/libvpx/vp8/encoder/x86/copy_sse3.asm similarity index 100% rename from libs/libvpx/vp8/common/x86/copy_sse3.asm rename to libs/libvpx/vp8/encoder/x86/copy_sse3.asm diff --git a/libs/libvpx/vp8/encoder/x86/quantize_sse4.c b/libs/libvpx/vp8/encoder/x86/quantize_sse4.c index 6f2c1634926..389c16705d4 100644 --- a/libs/libvpx/vp8/encoder/x86/quantize_sse4.c +++ b/libs/libvpx/vp8/encoder/x86/quantize_sse4.c @@ -11,28 +11,29 @@ #include /* SSE4.1 */ #include "./vp8_rtcd.h" -#include "vp8/encoder/block.h" #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ +#include "vp8/encoder/block.h" -#define SELECT_EOB(i, z, x, y, q) \ - do { \ - short boost = *zbin_boost_ptr; \ - short x_z = _mm_extract_epi16(x, z); \ - short y_z = _mm_extract_epi16(y, z); \ - int cmp = (x_z < boost) | (y_z == 0); \ - zbin_boost_ptr++; \ - if (cmp) break; \ - q = _mm_insert_epi16(q, y_z, z); \ - eob = i; \ - zbin_boost_ptr = b->zrun_zbin_boost; \ +#define SELECT_EOB(i, z, x, y, q) \ + do { \ + short boost = *zbin_boost_ptr; \ + /* Technically _mm_extract_epi16() returns an int: */ \ + /* https://bugs.llvm.org/show_bug.cgi?id=41657 */ \ + short x_z = (short)_mm_extract_epi16(x, z); \ + short y_z = (short)_mm_extract_epi16(y, z); \ + int cmp = (x_z < boost) | (y_z == 0); \ + zbin_boost_ptr++; \ + if (cmp) break; \ + q = _mm_insert_epi16(q, y_z, z); \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ } while (0) void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { char eob = 0; short *zbin_boost_ptr = b->zrun_zbin_boost; - __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, - dqcoeff1; + __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1; __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); @@ -53,15 +54,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); - /* Sign of z: z >> 15 */ - sz0 = _mm_srai_epi16(z0, 15); - sz1 = _mm_srai_epi16(z1, 15); - - /* x = abs(z): (z ^ sz) - sz */ - x0 = _mm_xor_si128(z0, sz0); - x1 = _mm_xor_si128(z1, sz1); - x0 = _mm_sub_epi16(x0, sz0); - x1 = _mm_sub_epi16(x1, sz1); + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); /* zbin[] + zbin_extra */ zbin0 = _mm_add_epi16(zbin0, zbin_extra); @@ -89,11 +84,9 @@ void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) { y0 = _mm_mulhi_epi16(y0, quant_shift0); y1 = _mm_mulhi_epi16(y1, quant_shift1); - /* Return the sign: (y ^ sz) - sz 
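/*
 * The SSE4.1 quantizer above drops the classic xor/subtract absolute value
 * (x = (z ^ sz) - sz with sz = z >> 15) in favor of _mm_abs_epi16(), and
 * the sign restoration just below becomes _mm_sign_epi16(); both intrinsics
 * are usable here because SSE4.1 implies SSSE3. A scalar model of the old
 * form (sketch):
 */
#include <stdint.h>

static int16_t abs16_xor_sub(int16_t z) {
  const int sz = z >> 15;          /* 0 for z >= 0, -1 for z < 0 */
  return (int16_t)((z ^ sz) - sz); /* |z|; wraps to -32768 at INT16_MIN,
                                      matching _mm_abs_epi16() */
}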
*/ - y0 = _mm_xor_si128(y0, sz0); - y1 = _mm_xor_si128(y1, sz1); - y0 = _mm_sub_epi16(y0, sz0); - y1 = _mm_sub_epi16(y1, sz1); + /* Restore the sign. */ + y0 = _mm_sign_epi16(y0, z0); + y1 = _mm_sign_epi16(y1, z1); /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0); diff --git a/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index d5474501544..147c30cc353 100644 --- a/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c +++ b/libs/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -52,9 +52,9 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; - DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = { - 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 - }; + DECLARE_ALIGNED(16, const uint8_t, + pshufb_zig_zag_mask[16]) = { 0, 1, 4, 8, 5, 2, 3, 6, + 9, 12, 13, 10, 7, 11, 14, 15 }; __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); /* sign of z: z >> 15 */ diff --git a/libs/libvpx/vp8/vp8_common.mk b/libs/libvpx/vp8/vp8_common.mk index 246fe6a6772..3b442b1e4a5 100644 --- a/libs/libvpx/vp8/vp8_common.mk +++ b/libs/libvpx/vp8/vp8_common.mk @@ -15,7 +15,6 @@ VP8_COMMON_SRCS-yes += common/onyxd.h VP8_COMMON_SRCS-yes += common/alloccommon.c VP8_COMMON_SRCS-yes += common/blockd.c VP8_COMMON_SRCS-yes += common/coefupdateprobs.h -VP8_COMMON_SRCS-yes += common/copy_c.c # VP8_COMMON_SRCS-yes += common/debugmodes.c VP8_COMMON_SRCS-yes += common/default_coef_probs.h VP8_COMMON_SRCS-yes += common/dequantize.c @@ -70,8 +69,6 @@ VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h VP8_COMMON_SRCS-yes += common/treecoder.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c -VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c @@ -82,14 +79,13 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idct_blk_mmx.c VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/idctllm_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm -VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/bilinear_filter_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm -VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm ifeq ($(CONFIG_POSTPROC),yes) @@ -130,14 +126,13 @@ endif # common (neon intrinsics) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.h VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c 
-VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_0_2x_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_loopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c diff --git a/libs/libvpx/vp8/vp8_cx_iface.c b/libs/libvpx/vp8/vp8_cx_iface.c index af6689fd977..c92c547365b 100644 --- a/libs/libvpx/vp8/vp8_cx_iface.c +++ b/libs/libvpx/vp8/vp8_cx_iface.c @@ -16,7 +16,9 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx_version.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" +#include "vpx_util/vpx_timestamp.h" #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -49,7 +51,7 @@ static struct vp8_extracfg default_extracfg = { #if !(CONFIG_REALTIME_ONLY) 0, /* cpu_used */ #else - 4, /* cpu_used */ + 4, /* cpu_used */ #endif 0, /* enable_auto_alt_ref */ 0, /* noise_sensitivity */ @@ -74,6 +76,9 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp8_extracfg vp8_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; VP8_CONFIG oxcf; struct VP8_COMP *cpi; unsigned char *cx_data; @@ -105,10 +110,10 @@ static vpx_codec_err_t update_error_state( return VPX_CODEC_INVALID_PARAM; \ } while (0) -#define RANGE_CHECK(p, memb, lo, hi) \ - do { \ - if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \ - ERROR(#memb " out of range [" #lo ".." #hi "]"); \ +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ @@ -126,6 +131,22 @@ static vpx_codec_err_t update_error_state( if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ } while (0) +#if defined(_MSC_VER) +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + char compile_time_assert[(boolexp) ? 1 : -1]; \ + (void)compile_time_assert; \ + } while (0) +#else /* !_MSC_VER */ +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + struct { \ + unsigned int compile_time_assert : (boolexp) ? 1 : -1; \ + } compile_time_assert; \ + (void)compile_time_assert; \ + } while (0) +#endif /* _MSC_VER */ + static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp8_extracfg *vp8_cfg, @@ -484,6 +505,9 @@ static vpx_codec_err_t update_extracfg(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t set_cpu_used(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp8_extracfg extra_cfg = ctx->vp8_cfg; extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + // Use fastest speed setting (speed 16 or -16) if it's set beyond the range. 
+ extra_cfg.cpu_used = VPXMIN(16, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-16, extra_cfg.cpu_used); return update_extracfg(ctx, &extra_cfg); } @@ -577,7 +601,7 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, void **mem_loc) { - vpx_codec_err_t res = 0; + vpx_codec_err_t res = VPX_CODEC_OK; #if CONFIG_MULTI_RES_ENCODING LOWER_RES_FRAME_INFO *shared_mem_loc; @@ -586,12 +610,13 @@ static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg, shared_mem_loc = calloc(1, sizeof(LOWER_RES_FRAME_INFO)); if (!shared_mem_loc) { - res = VPX_CODEC_MEM_ERROR; + return VPX_CODEC_MEM_ERROR; } shared_mem_loc->mb_info = calloc(mb_rows * mb_cols, sizeof(LOWER_RES_MB_INFO)); if (!(shared_mem_loc->mb_info)) { + free(shared_mem_loc); res = VPX_CODEC_MEM_ERROR; } else { *mem_loc = (void *)shared_mem_loc; @@ -655,6 +680,12 @@ static vpx_codec_err_t vp8e_init(vpx_codec_ctx_t *ctx, res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0); if (!res) { + priv->pts_offset_initialized = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num; + priv->timestamp_ratio.num *= TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + set_vp8e_config(&priv->oxcf, priv->cfg, priv->vp8_cfg, mr_cfg); priv->cpi = vp8_create_compressor(&priv->oxcf); if (!priv->cpi) res = VPX_CODEC_MEM_ERROR; @@ -719,12 +750,14 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, new_qc = MODE_BESTQUALITY; if (deadline) { + /* Convert duration parameter from stream timebase to microseconds */ uint64_t duration_us; - /* Convert duration parameter from stream timebase to microseconds */ - duration_us = (uint64_t)duration * 1000000 * - (uint64_t)ctx->cfg.g_timebase.num / - (uint64_t)ctx->cfg.g_timebase.den; + COMPILE_TIME_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / + (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); /* If the deadline is more that the duration this frame is to be shown, * use good quality mode. Otherwise use realtime mode. @@ -798,16 +831,38 @@ static vpx_codec_err_t set_reference_and_update(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, vpx_codec_pts_t pts, unsigned long duration, - vpx_enc_frame_flags_t flags, + vpx_enc_frame_flags_t enc_flags, unsigned long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res = VPX_CODEC_OK; + // Make a copy as volatile to avoid -Wclobbered with longjmp. 
+ volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts_val = pts; - if (!ctx->cfg.rc_target_bitrate) return res; + if (!ctx->cfg.rc_target_bitrate) { +#if CONFIG_MULTI_RES_ENCODING + if (!ctx->cpi) return VPX_CODEC_ERROR; + if (ctx->cpi->oxcf.mr_total_resolutions > 1) { + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)ctx->cpi->oxcf.mr_low_res_mode_info; + if (!low_res_frame_info) return VPX_CODEC_ERROR; + low_res_frame_info->skip_encoding_prev_stream = 1; + if (ctx->cpi->oxcf.mr_encoder_id == 0) + low_res_frame_info->skip_encoding_base_stream = 1; + } +#endif + return res; + } if (img) res = validate_img(ctx, img); if (!res) res = validate_config(ctx, &ctx->cfg, &ctx->vp8_cfg, 1); + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts_val; + ctx->pts_offset_initialized = 1; + } + pts_val -= ctx->pts_offset; + pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -829,6 +884,12 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + return VPX_CODEC_CORRUPT_FRAME; + } + /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; @@ -851,11 +912,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, /* Convert API flags to internal codec lib flags */ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - /* vp8 use 10,000,000 ticks/second as time stamp */ dst_time_stamp = - pts * 10000000 * ctx->cfg.g_timebase.num / ctx->cfg.g_timebase.den; - dst_end_time_stamp = (pts + duration) * 10000000 * ctx->cfg.g_timebase.num / - ctx->cfg.g_timebase.den; + pts_val * ctx->timestamp_ratio.num / ctx->timestamp_ratio.den; + dst_end_time_stamp = (pts_val + duration) * ctx->timestamp_ratio.num / + ctx->timestamp_ratio.den; if (img != NULL) { res = image2yuvconfig(img, &sd); @@ -875,6 +935,8 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, cx_data_end = ctx->cx_data + cx_data_sz; lib_flags = 0; + ctx->cpi->common.error.setjmp = 1; + while (cx_data_sz >= ctx->cx_data_sz / 2) { comp_data_state = vp8_get_compressed_data( ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, @@ -892,16 +954,21 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; /* Add the frame packet to the list of returned packets. */ - round = (vpx_codec_pts_t)10000000 * ctx->cfg.g_timebase.num / 2 - 1; + round = (vpx_codec_pts_t)ctx->timestamp_ratio.num / 2; + if (round > 0) --round; delta = (dst_end_time_stamp - dst_time_stamp); pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - (dst_time_stamp * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000; + (dst_time_stamp * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num + + ctx->pts_offset; pkt.data.frame.duration = - (unsigned long)((delta * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000); + (unsigned long)((delta * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num); pkt.data.frame.flags = lib_flags << 16; + pkt.data.frame.width[0] = cpi->common.Width; + pkt.data.frame.height[0] = cpi->common.Height; + pkt.data.frame.spatial_layer_encoded[0] = 1; if (lib_flags & FRAMEFLAGS_KEY) { pkt.data.frame.flags |= VPX_FRAME_IS_KEY; @@ -916,9 +983,9 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, * Invisible frames have no duration. 
*/ pkt.data.frame.pts = - ((cpi->last_time_stamp_seen * ctx->cfg.g_timebase.den + round) / - ctx->cfg.g_timebase.num / 10000000) + - 1; + ((cpi->last_time_stamp_seen * ctx->timestamp_ratio.den + round) / + ctx->timestamp_ratio.num) + + ctx->pts_offset + 1; pkt.data.frame.duration = 0; } @@ -1176,7 +1243,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { { 0, { - 0, /* g_usage */ + 0, /* g_usage (unused) */ 0, /* g_threads */ 0, /* g_profile */ @@ -1259,6 +1326,9 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = { vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t cfg_maps; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t get_cx_data; */ - vp8e_set_config, NULL, vp8e_get_preview, vp8e_mr_alloc_mem, + vp8e_set_config, + NULL, + vp8e_get_preview, + vp8e_mr_alloc_mem, } /* encoder functions */ }; diff --git a/libs/libvpx/vp8/vp8_dx_iface.c b/libs/libvpx/vp8/vp8_dx_iface.c index f20283c1e1b..4207775ca7c 100644 --- a/libs/libvpx/vp8/vp8_dx_iface.c +++ b/libs/libvpx/vp8/vp8_dx_iface.c @@ -38,13 +38,19 @@ typedef vpx_codec_stream_info_t vp8_stream_info_t; /* Structures for handling memory allocations */ typedef enum { VP8_SEG_ALG_PRIV = 256, VP8_SEG_MAX } mem_seg_id_t; -#define NELEMENTS(x) ((int)(sizeof(x) / sizeof(x[0]))) +#define NELEMENTS(x) ((int)(sizeof(x) / sizeof((x)[0]))) struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp8_stream_info_t si; int decoder_init; +#if CONFIG_MULTITHREAD + // Restart threads on next frame if set to 1. + // This is set when error happens in multithreaded decoding and all threads + // are shut down. + int restart_threads; +#endif int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -117,11 +123,17 @@ static vpx_codec_err_t vp8_destroy(vpx_codec_alg_priv_t *ctx) { return VPX_CODEC_OK; } +#ifdef __clang_analyzer__ +#define FUNC_ATTR_NONNULL(...) __attribute__((nonnull(__VA_ARGS__))) +#else +#define FUNC_ATTR_NONNULL(...) 
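/* (Hedged note: __attribute__((nonnull(...))) tells the clang static
 * analyzer that the numbered pointer parameters are never NULL; guarding it
 * with __clang_analyzer__ keeps regular builds unaffected, since the macro
 * then expands to nothing.) */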
+#endif + static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si, vpx_decrypt_cb decrypt_cb, - void *decrypt_state) { + void *decrypt_state) FUNC_ATTR_NONNULL(1) { vpx_codec_err_t res = VPX_CODEC_OK; assert(data != NULL); @@ -200,9 +212,9 @@ static vpx_codec_err_t update_error_state( static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ img->fmt = VPX_IMG_FMT_I420; img->w = yv12->y_stride; img->h = (yv12->y_height + 2 * VP8BORDERINPIXELS + 15) & ~15; @@ -267,8 +279,8 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { - volatile vpx_codec_err_t res; - unsigned int resolution_change = 0; + volatile vpx_codec_err_t res = VPX_CODEC_INVALID_PARAM; + volatile unsigned int resolution_change = 0; unsigned int w, h; if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) { @@ -285,8 +297,10 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, w = ctx->si.w; h = ctx->si.h; - res = vp8_peek_si_internal(ctx->fragments.ptrs[0], ctx->fragments.sizes[0], + if (ctx->fragments.ptrs[0]) { + res = vp8_peek_si_internal(ctx->fragments.ptrs[0], ctx->fragments.sizes[0], &ctx->si, ctx->decrypt_cb, ctx->decrypt_state); + } if ((res == VPX_CODEC_UNSUP_BITSTREAM) && !ctx->si.is_kf) { /* the peek function returns an error for non keyframes, however for @@ -298,6 +312,27 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; +#if CONFIG_MULTITHREAD + if (!res && ctx->restart_threads) { + struct frame_buffers *fb = &ctx->yv12_frame_buffers; + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; + if (setjmp(pbi->common.error.jmp)) { + vp8_remove_decoder_instances(fb); + vp8_zero(fb->pbi); + vpx_clear_system_state(); + return VPX_CODEC_ERROR; + } + pbi->common.error.setjmp = 1; + pbi->max_threads = ctx->cfg.threads; + vp8_decoder_create_threads(pbi); + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { + vp8mt_alloc_temp_buffers(pbi, pc->Width, pc->mb_rows); + } + ctx->restart_threads = 0; + pbi->common.error.setjmp = 0; + } +#endif /* Initialize the decoder instance on the first frame*/ if (!res && !ctx->decoder_init) { VP8D_CONFIG oxcf; @@ -335,8 +370,8 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, if (!res) { VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + VP8_COMMON *const pc = &pbi->common; if (resolution_change) { - VP8_COMMON *const pc = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; #if CONFIG_MULTITHREAD int i; @@ -428,9 +463,35 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, pbi->common.fb_idx_ref_cnt[0] = 0; } + if (setjmp(pbi->common.error.jmp)) { + /* We do not know if the missing frame(s) was supposed to update + * any of the reference buffers, but we act conservative and + * mark only the last buffer as corrupted. 
+       */
+      pc->yv12_fb[pc->lst_fb_idx].corrupted = 1;
+
+      if (pc->fb_idx_ref_cnt[pc->new_fb_idx] > 0) {
+        pc->fb_idx_ref_cnt[pc->new_fb_idx]--;
+      }
+      pc->error.setjmp = 0;
+#if CONFIG_MULTITHREAD
+      if (pbi->restart_threads) {
+        ctx->si.w = 0;
+        ctx->si.h = 0;
+        ctx->restart_threads = 1;
+      }
+#endif
+      res = update_error_state(ctx, &pbi->common.error);
+      return res;
+    }
+
+    pbi->common.error.setjmp = 1;
+
    /* update the pbi fragment data */
    pbi->fragments = ctx->fragments;
-
+#if CONFIG_MULTITHREAD
+    pbi->restart_threads = 0;
+#endif
    ctx->user_priv = user_priv;
    if (vp8dx_receive_compressed_data(pbi, data_sz, data, deadline)) {
      res = update_error_state(ctx, &pbi->common.error);
diff --git a/libs/libvpx/vp8/vp8cx.mk b/libs/libvpx/vp8/vp8cx.mk
index 0dac0169d5f..3a8f8ea45a9 100644
--- a/libs/libvpx/vp8/vp8cx.mk
+++ b/libs/libvpx/vp8/vp8cx.mk
@@ -23,6 +23,7 @@ VP8_CX_SRCS-yes += vp8_cx_iface.c
VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
VP8_CX_SRCS-yes += encoder/bitstream.c
VP8_CX_SRCS-yes += encoder/boolhuff.c
+VP8_CX_SRCS-yes += encoder/copy_c.c
VP8_CX_SRCS-yes += encoder/dct.c
VP8_CX_SRCS-yes += encoder/encodeframe.c
VP8_CX_SRCS-yes += encoder/encodeframe.h
@@ -82,6 +83,8 @@ VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h
endif

+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/copy_sse3.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
@@ -92,9 +95,9 @@ ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
endif

+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/block_error_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm

ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
new file mode 100644
index 00000000000..219ff63cb8f
--- /dev/null
+++ b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c
@@ -0,0 +1,446 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Use macros to make sure argument lane is passed in as a constant integer.
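// (Why macros rather than functions, a hedged sketch: the lane argument of
// NEON lane intrinsics such as vmull_lane_s32() must be an integer constant
// expression, so an equivalent helper function would not compile:
//
//   static INLINE int64x2_t bad(int32x2_t a, int32x2_t c, int lane) {
//     return vmull_lane_s32(a, c, lane);  // error: lane is not a constant
//   }
//
// A macro keeps the lane a literal at each expansion site while handling all
// four .val[] halves of the int32x4x2_t operands at once.)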
+ +#define vmull_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlal_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlsl_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +static INLINE int32x4x2_t +highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS)); + return out; +} + +#define highbd_iadst_half_butterfly(in, c, lane, out) \ + do { \ + int64x2x2_t t[2]; \ + vmull_lane_s32_dual(in, c, lane, t); \ + out = highbd_dct_const_round_shift_low_8(t); \ + } while (0) + +#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ + do { \ + vmull_lane_s32_dual(in0, c, lane0, s0); \ + vmull_lane_s32_dual(in0, c, lane1, s1); \ + vmlal_lane_s32_dual(in1, c, lane1, s0); \ + vmlsl_lane_s32_dual(in1, c, lane0, s1); \ + } while (0) + +static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vaddq_s32(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vaddq_s64(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vsubq_s32(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vsubq_s64(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0, + const int32x2x2_t in1) { + int32x4x2_t out; + out.val[0] = vcombine_s32(in0.val[0], in1.val[0]); + out.val[1] = vcombine_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], 
DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) { + int32x4x2_t out; + out.val[0] = vnegq_s32(in.val[0]); + out.val[1] = vnegq_s32(in.val[1]); + return out; +} + +static void highbd_iadst16_neon(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { + const int32x4_t c_1_31_5_27 = + create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int32x4_t c_9_23_13_19 = + create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int32x4_t c_17_15_21_11 = + create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int32x4_t c_25_7_29_3 = + create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int32x4_t c_4_28_20_12 = + create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int32x4_t c_16_n16_8_24 = + create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int32x4x2_t in[16], out[16]; + int32x4x2_t x[16], t[12]; + int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], 
&in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1); + highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3); + highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5); + highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7); + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[12], x[13], vget_low_s32(c_25_7_29_3), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14, + s15); + + x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8); + x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9); + x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10); + x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13); + x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14); + x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15); + x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8); + x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9); + x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10); + x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11); + x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13, + s12); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[4]); + x[1] = vaddq_s32_dual(t[1], t[5]); + x[2] = vaddq_s32_dual(t[2], t[6]); + x[3] = vaddq_s32_dual(t[3], t[7]); + x[4] = vsubq_s32_dual(t[0], t[4]); + x[5] = vsubq_s32_dual(t[1], t[5]); + x[6] = vsubq_s32_dual(t[2], t[6]); + x[7] = vsubq_s32_dual(t[3], t[7]); + x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12); + x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13); + x[10] = highbd_add_dct_const_round_shift_low_8(s10, s14); + x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15); + x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s10, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4, + s5); + highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7, + s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + highbd_iadst_butterfly(x[12], x[13], 
vget_high_s32(c_16_n16_8_24), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[2]); + x[1] = vaddq_s32_dual(t[1], t[3]); + x[2] = vsubq_s32_dual(t[0], t[2]); + x[3] = vsubq_s32_dual(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7); + x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6); + x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s32_dual(t[8], t[10]); + x[9] = vaddq_s32_dual(t[9], t[11]); + x[10] = vsubq_s32_dual(t[8], t[10]); + x[11] = vsubq_s32_dual(t[9], t[11]); + x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14); + x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15); + x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + { + const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]); + const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]); + const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]); + const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]); + const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]); + } + + out[0] = x[0]; + out[1] = vnegq_s32_dual(x[8]); + out[2] = x[12]; + out[3] = vnegq_s32_dual(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s32_dual(x[13]); + out[14] = x[9]; + out[15] = vnegq_s32_dual(x[1]); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, const int bd); + +typedef struct { + highbd_iht_1d cols, rows; // vertical and horizontal +} highbd_iht_2d; + +void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + if (bd == 8) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 1); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 1); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, 
stride, 1);  // right 8 columns
+  } else {
+    static const highbd_iht_2d IHT_16[] = {
+      { vpx_highbd_idct16x16_256_add_half1d,
+        vpx_highbd_idct16x16_256_add_half1d },  // DCT_DCT = 0
+      { highbd_iadst16_neon,
+        vpx_highbd_idct16x16_256_add_half1d },  // ADST_DCT = 1
+      { vpx_highbd_idct16x16_256_add_half1d,
+        highbd_iadst16_neon },  // DCT_ADST = 2
+      { highbd_iadst16_neon, highbd_iadst16_neon }  // ADST_ADST = 3
+    };
+    const highbd_iht_2d ht = IHT_16[tx_type];
+    int32_t row_output[16 * 16];
+
+    // pass 1
+    ht.rows(input, row_output, dest, stride, bd);  // upper 8 rows
+    ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd);  // lower 8 rows
+
+    // pass 2
+    ht.cols(row_output, NULL, dest, stride, bd);  // left 8 columns
+    ht.cols(row_output + 8 * 16, NULL, dest + 8, stride,
+            bd);  // right 8 columns
+  }
+}
diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
new file mode 100644
index 00000000000..52c4f1937df
--- /dev/null
+++ b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void highbd_iadst4(int32x4_t *const io) {
+  const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
+  const int32x4_t sinpi = vld1q_s32(sinpis);
+  int64x2x2_t s[7], t[4];
+  int32x4_t s7;
+
+  s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0);
+  s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1);
+  s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0);
+  s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1);
+  s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0);
+  s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1);
+  s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1);
+  s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1);
+  s7 = vsubq_s32(io[0], io[2]);
+  s7 = vaddq_s32(s7, io[3]);
+
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]);
+  s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]);
+  s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]);
+  s[1].val[1] = vsubq_s64(s[1].val[1], s[4].val[1]);
+  s[1].val[0] = vsubq_s64(s[1].val[0],
s[6].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]); + s[3] = s[2]; + s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0); + + t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]); + t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]); + t[2] = s[2]; + t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]); + t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]); + t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]); + t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]); + io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS)); + io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS)); + io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS)); + io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS)); +} + +void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), 
vqrshrn_n_s32(c[3], 4));
+  }
+
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+  highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max);
+}
diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
new file mode 100644
index 00000000000..2232c6841c7
--- /dev/null
+++ b/libs/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c
@@ -0,0 +1,345 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vp9/common/vp9_enums.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x,
+                                                    const int32x2_t c) {
+  const int32x4_t sum = vaddq_s32(x[0], x[1]);
+  const int32x4_t sub = vsubq_s32(x[0], x[1]);
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
+  const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS);
+  const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS);
+  const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS);
+  const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS);
+
+  x[0] = vcombine_s32(out0_lo, out0_hi);
+  x[1] = vcombine_s32(out1_lo, out1_hi);
+}
+
+static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0,
+                                                        const int32x4_t in1,
+                                                        const int32x2_t c,
+                                                        int64x2_t *const s0,
+                                                        int64x2_t *const s1) {
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
+
+  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1);
+  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0);
+  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1);
+  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0);
+}
+
+static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0,
+                                                        const int32x4_t in1,
+                                                        const int32x2_t c,
+                                                        int64x2_t *const s0,
+                                                        int64x2_t *const s1) {
+  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1);
+  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0);
+  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1);
+  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0);
+
+  s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0);
+  s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1);
+  s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0);
+  s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1);
+}
+
+static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8(
+    const int64x2_t *const in0, const int64x2_t *const in1) {
+  const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]);
+  const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]);
+  const int32x2_t
out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]); + const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1, + int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, + int32x4_t *const io6, int32x4_t *const io7) { + const int32x4_t c0 = + create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int32x4_t c1 = + create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int32x4_t c2 = + create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int32x4_t x[8], t[4]; + int64x2_t s[8][2]; + + x[0] = *io7; + x[1] = *io0; + x[2] = *io5; + x[3] = *io2; + x[4] = *io3; + x[5] = *io4; + x[6] = *io1; + x[7] = *io6; + + // stage 1 + highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0], + s[1]); + highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2], + s[3]); + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4], + s[5]); + highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6], + s[7]); + + x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]); + x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]); + x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]); + x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]); + x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]); + x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4], + s[5]); + highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7], + s[6]); + + x[0] = vaddq_s32(t[0], t[2]); + x[1] = vaddq_s32(t[1], t[3]); + x[2] = vsubq_s32(t[0], t[2]); + x[3] = vsubq_s32(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]); + x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]); + + // stage 3 + highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2)); + highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2)); + + *io0 = x[0]; + *io1 = vnegq_s32(x[4]); + *io2 = x[6]; + *io3 = vnegq_s32(x[2]); + *io4 = x[3]; + *io5 = vnegq_s32(x[7]); + *io6 = x[5]; + *io7 = vnegq_s32(x[1]); +} + +void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 
48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); + + if (bd == 8) { + c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + switch (tx_type) { + case DCT_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + case ADST_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + + case DCT_ADST: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + default: { + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + } + + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + 
&a[12], &a[13], &a[14], &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + + case DCT_ADST: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + } + + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c new file mode 100644 index 00000000000..db72ff11618 --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output,
+                                   void *const dest, const int stride,
+                                   const int highbd_flag) {
+  int16x8_t in[16], out[16];
+  const int16x4_t c_1_31_5_27 =
+      create_s16x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64);
+  const int16x4_t c_9_23_13_19 =
+      create_s16x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64);
+  const int16x4_t c_17_15_21_11 =
+      create_s16x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64);
+  const int16x4_t c_25_7_29_3 =
+      create_s16x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64);
+  const int16x4_t c_4_28_20_12 =
+      create_s16x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64);
+  const int16x4_t c_16_n16_8_24 =
+      create_s16x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64);
+  int16x8_t x[16], t[12];
+  int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+  int32x4_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+
+  // Load input (16x8)
+  if (output) {
+    const tran_low_t *inputT = (const tran_low_t *)input;
+    in[0] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[8] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[1] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[9] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[2] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[10] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[3] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[11] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[4] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[12] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[5] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[13] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[6] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[14] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[7] = load_tran_low_to_s16q(inputT);
+    inputT += 8;
+    in[15] = load_tran_low_to_s16q(inputT);
+  } else {
+    const int16_t *inputT = (const int16_t *)input;
+    in[0] = vld1q_s16(inputT);
+    inputT += 8;
+    in[8] = vld1q_s16(inputT);
+    inputT += 8;
+    in[1] = vld1q_s16(inputT);
+    inputT += 8;
+    in[9] = vld1q_s16(inputT);
+    inputT += 8;
+    in[2] = vld1q_s16(inputT);
+    inputT += 8;
+    in[10] = vld1q_s16(inputT);
+    inputT += 8;
+    in[3] = vld1q_s16(inputT);
+    inputT += 8;
+    in[11] = vld1q_s16(inputT);
+    inputT += 8;
+    in[4] = vld1q_s16(inputT);
+    inputT += 8;
+    in[12] = vld1q_s16(inputT);
+    inputT += 8;
+    in[5] = vld1q_s16(inputT);
+    inputT += 8;
+    in[13] = vld1q_s16(inputT);
+    inputT += 8;
+    in[6] = vld1q_s16(inputT);
+    inputT += 8;
+    in[14] = vld1q_s16(inputT);
+    inputT += 8;
+    in[7] = vld1q_s16(inputT);
+    inputT += 8;
+    in[15] = vld1q_s16(inputT);
+  }
+
+  // Transpose
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+  transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+                    &in[15]);
+
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // stage 1
+  iadst_butterfly_lane_0_1_neon(x[0], x[1], c_1_31_5_27, s0, s1);
+
iadst_butterfly_lane_2_3_neon(x[2], x[3], c_1_31_5_27, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c_9_23_13_19, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c_9_23_13_19, s6, s7); + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_17_15_21_11, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_17_15_21_11, s10, s11); + iadst_butterfly_lane_0_1_neon(x[12], x[13], c_25_7_29_3, s12, s13); + iadst_butterfly_lane_2_3_neon(x[14], x[15], c_25_7_29_3, s14, s15); + + x[0] = add_dct_const_round_shift_low_8(s0, s8); + x[1] = add_dct_const_round_shift_low_8(s1, s9); + x[2] = add_dct_const_round_shift_low_8(s2, s10); + x[3] = add_dct_const_round_shift_low_8(s3, s11); + x[4] = add_dct_const_round_shift_low_8(s4, s12); + x[5] = add_dct_const_round_shift_low_8(s5, s13); + x[6] = add_dct_const_round_shift_low_8(s6, s14); + x[7] = add_dct_const_round_shift_low_8(s7, s15); + x[8] = sub_dct_const_round_shift_low_8(s0, s8); + x[9] = sub_dct_const_round_shift_low_8(s1, s9); + x[10] = sub_dct_const_round_shift_low_8(s2, s10); + x[11] = sub_dct_const_round_shift_low_8(s3, s11); + x[12] = sub_dct_const_round_shift_low_8(s4, s12); + x[13] = sub_dct_const_round_shift_low_8(s5, s13); + x[14] = sub_dct_const_round_shift_low_8(s6, s14); + x[15] = sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_4_28_20_12, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_4_28_20_12, s10, s11); + iadst_butterfly_lane_1_0_neon(x[13], x[12], c_4_28_20_12, s13, s12); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_4_28_20_12, s15, s14); + + x[0] = vaddq_s16(t[0], t[4]); + x[1] = vaddq_s16(t[1], t[5]); + x[2] = vaddq_s16(t[2], t[6]); + x[3] = vaddq_s16(t[3], t[7]); + x[4] = vsubq_s16(t[0], t[4]); + x[5] = vsubq_s16(t[1], t[5]); + x[6] = vsubq_s16(t[2], t[6]); + x[7] = vsubq_s16(t[3], t[7]); + x[8] = add_dct_const_round_shift_low_8(s8, s12); + x[9] = add_dct_const_round_shift_low_8(s9, s13); + x[10] = add_dct_const_round_shift_low_8(s10, s14); + x[11] = add_dct_const_round_shift_low_8(s11, s15); + x[12] = sub_dct_const_round_shift_low_8(s8, s12); + x[13] = sub_dct_const_round_shift_low_8(s9, s13); + x[14] = sub_dct_const_round_shift_low_8(s10, s14); + x[15] = sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c_16_n16_8_24, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c_16_n16_8_24, s7, s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + iadst_butterfly_lane_2_3_neon(x[12], x[13], c_16_n16_8_24, s12, s13); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_16_n16_8_24, s15, s14); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s16(t[8], t[10]); + x[9] = vaddq_s16(t[9], t[11]); + x[10] = vsubq_s16(t[8], t[10]); + x[11] = vsubq_s16(t[9], t[11]); + x[12] = add_dct_const_round_shift_low_8(s12, s14); + x[13] = add_dct_const_round_shift_low_8(s13, s15); + x[14] = sub_dct_const_round_shift_low_8(s12, s14); + x[15] = sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + iadst_half_butterfly_neg_neon(&x[3], &x[2], 
c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24); + iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24); + + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 0); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 0); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 0); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 0); // right 8 columns +} diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index 025254c3f33..4f0a90f2156 100644 --- a/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,206 +14,63 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16(cospi_8_64); - *d1s16 = vdup_n_s16(cospi_16_64); - *d2s16 = vdup_n_s16(cospi_24_64); -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16(sinpi_1_9); - *d4s16 = vdup_n_s16(sinpi_2_9); - *q3s16 = vdupq_n_s16(sinpi_3_9); - *d5s16 = vdup_n_s16(sinpi_4_9); -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = 
vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vrshrn_n_s32(q13s32, 14); - d27s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d28s16 = vrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32(sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vrshrn_n_s32(q13s32, 14); - d17s16 = vrshrn_n_s32(q14s32, 14); - d18s16 = vrshrn_n_s32(q15s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); -} - void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; - - d26u32 = d27u32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - TRANSPOSE4X4(&q8s16, &q9s16); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); switch (tx_type) { - case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - case 2: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; - case 3: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += stride; - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); } diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 1c739861c38..46ee632e018 100644 --- 
a/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,527 +14,55 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" -static int16_t cospi_2_64 = 16305; -static int16_t cospi_4_64 = 16069; -static int16_t cospi_6_64 = 15679; -static int16_t cospi_8_64 = 15137; -static int16_t cospi_10_64 = 14449; -static int16_t cospi_12_64 = 13623; -static int16_t cospi_14_64 = 12665; -static int16_t cospi_16_64 = 11585; -static int16_t cospi_18_64 = 10394; -static int16_t cospi_20_64 = 9102; -static int16_t cospi_22_64 = 7723; -static int16_t cospi_24_64 = 6270; -static int16_t cospi_26_64 = 4756; -static int16_t cospi_28_64 = 3196; -static int16_t cospi_30_64 = 1606; - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vrshrn_n_s32(q2s32, 14); - d15s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, 
d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vrshrn_n_s32(q2s32, 14); - d19s16 = vrshrn_n_s32(q3s32, 14); - d22s16 = vrshrn_n_s32(q13s32, 14); - d23s16 = vrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vrshrn_n_s32(q2s32, 14); - d27s16 = vrshrn_n_s32(q3s32, 14); - d30s16 = vrshrn_n_s32(q8s32, 14); - d31s16 = vrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vrshrn_n_s32(q9s32, 14); - d11s16 = vrshrn_n_s32(q10s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); -} - -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = 
vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16(cospi_2_64); - d15s16 = vdup_n_s16(cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16(cospi_18_64); - d31s16 = vdup_n_s16(cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vrshrn_n_s32(q11s32, 14); - d23s16 = vrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vrshrn_n_s32(q1s32, 14); - d3s16 = vrshrn_n_s32(q2s32, 14); - d24s16 = vrshrn_n_s32(q12s32, 14); - d25s16 = vrshrn_n_s32(q15s32, 14); - d6s16 = vrshrn_n_s32(q3s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16(cospi_10_64); - d1s16 = vdup_n_s16(cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16(cospi_26_64); - d31s16 = vdup_n_s16(cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - d18s16 = vrshrn_n_s32(q9s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - d8s16 = vrshrn_n_s32(q4s32, 14); - d9s16 = vrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = 
vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vrshrn_n_s32(q14s32, 14); - d19s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vrshrn_n_s32(q1s32, 14); - d29s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q7s32, 14); - d15s16 = vrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16(cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); - - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q3s32, 14); - d24s16 = vrshrn_n_s32(q13s32, 14); - d25s16 = vrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); - - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, d30s16); - - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - - d20s16 = vrshrn_n_s32(q13s32, 14); - d21s16 = vrshrn_n_s32(q1s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q5s16 = vdupq_n_s16(0); - - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); -} - void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = 
load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input + 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); + + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); switch (tx_type) { - case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - case 2: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; - case 3: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - 
d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += stride; - } + idct8x8_add8x8_neon(a, dest, stride); } diff --git a/libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h b/libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 00000000000..c64822e27c6 --- /dev/null +++ b/libs/libvpx/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. 
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} + +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} + +static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); +} + +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + 
s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} + +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = vnegq_s16(x[1]); +} + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef void (*iht_1d)(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef struct { + iht_1d cols, rows; // vertical and horizontal +} iht_2d; + +#endif // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c index 3e3530116d1..c031322806b 100644 --- a/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c +++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c 
index 786fbdb794c..aaccd5ca7b9 100644 --- a/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c +++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c b/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c index e4166775da7..76d15ff8c0d 100644 --- a/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c +++ b/libs/libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vpx_dsp/mips/inv_txfm_msa.h" diff --git a/libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c b/libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c new file mode 100644 index 00000000000..e861596ad48 --- /dev/null +++ b/libs/libvpx/vp9/common/ppc/vp9_idct_vsx.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +#include "vp9/common/vp9_enums.h" + +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + case ADST_DCT: + vpx_idct4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + } + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[8], out[8]; + + // load input data + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + in[2] = load_tran_low(2 * 8 * sizeof(*input), input); + in[3] = load_tran_low(3 * 8 * sizeof(*input), input); + in[4] = load_tran_low(4 * 8 * sizeof(*input), input); + in[5] = load_tran_low(5 * 8 * sizeof(*input), input); + in[6] = load_tran_low(6 * 8 * sizeof(*input), input); + in[7] = load_tran_low(7 * 8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + case ADST_DCT: + vpx_idct8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + } + + vpx_round_store8x8_vsx(in, dest, stride); +} + +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + int16x8_t in0[16], in1[16]; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), in1); + + switch (tx_type) { + case DCT_DCT: + vpx_idct16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + case ADST_DCT: + 
vpx_idct16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + case DCT_ADST: + vpx_iadst16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + vpx_iadst16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + } + + vpx_round_store16x16_vsx(in0, in1, dest, stride); +} diff --git a/libs/libvpx/vp9/common/vp9_alloccommon.h b/libs/libvpx/vp9/common/vp9_alloccommon.h index a3a1638572d..8900038ead9 100644 --- a/libs/libvpx/vp9/common/vp9_alloccommon.h +++ b/libs/libvpx/vp9/common/vp9_alloccommon.h @@ -8,10 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define VP9_COMMON_VP9_ALLOCCOMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define INVALID_IDX -1 // Invalid buffer index. +#define INVALID_IDX (-1) // Invalid buffer index. #ifdef __cplusplus extern "C" { @@ -41,4 +41,4 @@ void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ +#endif // VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_blockd.h b/libs/libvpx/vp9/common/vp9_blockd.h index 780b29208bc..2ddc0f121ca 100644 --- a/libs/libvpx/vp9/common/vp9_blockd.h +++ b/libs/libvpx/vp9/common/vp9_blockd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_BLOCKD_H_ -#define VP9_COMMON_VP9_BLOCKD_H_ +#ifndef VPX_VP9_COMMON_VP9_BLOCKD_H_ +#define VPX_VP9_COMMON_VP9_BLOCKD_H_ #include "./vpx_config.h" @@ -54,12 +54,13 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -#define NONE -1 +#define NONE (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 #define ALTREF_FRAME 3 #define MAX_REF_FRAMES 4 + typedef int8_t MV_REFERENCE_FRAME; // This structure now relates to 8x8 block regions. 
@@ -130,6 +131,8 @@ struct macroblockd_plane { // encoder const int16_t *dequant; + + int *eob; }; #define BLOCK_OFFSET(x, i) ((x) + (i)*16) @@ -173,7 +176,7 @@ typedef struct macroblockd { FRAME_CONTEXT *fc; /* pointers to reference frames */ - RefBuffer *block_refs[2]; + const RefBuffer *block_refs[2]; /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; @@ -193,6 +196,8 @@ typedef struct macroblockd { int corrupted; struct vpx_internal_error_info *error_info; + + PARTITION_TYPE *partition; } MACROBLOCKD; static INLINE PLANE_TYPE get_plane_type(int plane) { @@ -281,8 +286,30 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob, int aoff, int loff); +#if CONFIG_MISMATCH_DEBUG +#define TX_UNIT_SIZE_LOG2 2 +static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col, + int mi_row, int tx_blk_col, int tx_blk_row, + int subsampling_x, int subsampling_y) { + *pixel_c = ((mi_col << MI_SIZE_LOG2) >> subsampling_x) + + (tx_blk_col << TX_UNIT_SIZE_LOG2); + *pixel_r = ((mi_row << MI_SIZE_LOG2) >> subsampling_y) + + (tx_blk_row << TX_UNIT_SIZE_LOG2); +} + +static INLINE int get_block_width(BLOCK_SIZE bsize) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + return 4 * num_4x4_w; +} + +static INLINE int get_block_height(BLOCK_SIZE bsize) { + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + return 4 * num_4x4_h; +} +#endif + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_BLOCKD_H_ +#endif // VPX_VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libs/libvpx/vp9/common/vp9_common.h b/libs/libvpx/vp9/common/vp9_common.h index 666c3beaf03..e3c5535ddba 100644 --- a/libs/libvpx/vp9/common/vp9_common.h +++ b/libs/libvpx/vp9/common/vp9_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_COMMON_H_ -#define VP9_COMMON_VP9_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_H_ +#define VPX_VP9_COMMON_VP9_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ @@ -33,14 +33,14 @@ extern "C" { } // Use this for variably-sized arrays. -#define vp9_copy_array(dest, src, n) \ - { \ - assert(sizeof(*dest) == sizeof(*src)); \ - memcpy(dest, src, n * sizeof(*src)); \ +#define vp9_copy_array(dest, src, n) \ + { \ + assert(sizeof(*(dest)) == sizeof(*(src))); \ + memcpy(dest, src, (n) * sizeof(*(src))); \ } #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest)) -#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest)) +#define vp9_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest))) static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; @@ -49,8 +49,8 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval " at %s:%d", __FILE__, \ __LINE__); \ @@ -58,8 +58,8 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { #else #define CHECK_MEM_ERROR(cm, lval, expr) \ do { \ - lval = (expr); \ - if (!lval) \ + (lval) = (expr); \ + if (!(lval)) \ vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ "Failed to allocate " #lval); \ } while (0) @@ -75,4 +75,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_common_data.c b/libs/libvpx/vp9/common/vp9_common_data.c index 4a10833229d..809d7317cef 100644 --- a/libs/libvpx/vp9/common/vp9_common_data.c +++ b/libs/libvpx/vp9/common/vp9_common_data.c @@ -28,7 +28,7 @@ const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 2, 2, const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8 }; -// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize))) +// VPXMIN(3, VPXMIN(b_width_log2_lookup(bsize), b_height_log2_lookup(bsize))) const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; diff --git a/libs/libvpx/vp9/common/vp9_common_data.h b/libs/libvpx/vp9/common/vp9_common_data.h index 5c6a7e8ff3e..a533c5f058d 100644 --- a/libs/libvpx/vp9/common/vp9_common_data.h +++ b/libs/libvpx/vp9/common/vp9_common_data.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ -#define VP9_COMMON_VP9_COMMON_DATA_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_DATA_H_ +#define VPX_VP9_COMMON_VP9_COMMON_DATA_H_ #include "vp9/common/vp9_enums.h" #include "vpx/vpx_integer.h" @@ -42,4 +42,4 @@ extern const uint8_t need_top_left[INTRA_MODES]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_DATA_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/libs/libvpx/vp9/common/vp9_entropy.c b/libs/libvpx/vp9/common/vp9_entropy.c index a575bda729c..430b917b8fb 100644 --- a/libs/libvpx/vp9/common/vp9_entropy.c +++ b/libs/libvpx/vp9/common/vp9_entropy.c @@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, 177, 153, 140, 133, 130, 129 }; #endif +/* clang-format off */ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +/* clang-format on */ const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, diff --git a/libs/libvpx/vp9/common/vp9_entropy.h b/libs/libvpx/vp9/common/vp9_entropy.h index 1da49116687..d026651df76 100644 --- a/libs/libvpx/vp9/common/vp9_entropy.h +++ b/libs/libvpx/vp9/common/vp9_entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENTROPY_H_ -#define VP9_COMMON_VP9_ENTROPY_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPY_H_ +#define VPX_VP9_COMMON_VP9_ENTROPY_H_ #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" @@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly - #define COEFF_PROB_MODELS 255 #define UNCONSTRAINED_NODES 3 @@ -195,4 +194,4 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPY_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libs/libvpx/vp9/common/vp9_entropymode.c b/libs/libvpx/vp9/common/vp9_entropymode.c index 47cd63e94f9..bda824de3c2 100644 --- a/libs/libvpx/vp9/common/vp9_entropymode.c +++ b/libs/libvpx/vp9/common/vp9_entropymode.c @@ -179,29 +179,32 @@ static const vpx_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = { { 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm }; -const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = - { - // 8x8 -> 4x4 - { 158, 97, 94 }, // a/l both not split - { 93, 24, 99 }, // a split, l not split - { 85, 119, 44 }, // l split, a not split - { 62, 59, 67 }, // a/l both split - // 16x16 -> 8x8 - { 149, 53, 53 }, // a/l both not split - { 94, 20, 48 }, // a split, l not split - { 83, 53, 24 }, // l split, a not split - { 52, 18, 18 }, // a/l both split - // 32x32 -> 16x16 - { 150, 40, 39 }, // a/l both not split - { 78, 12, 26 }, // a split, l not split - { 67, 33, 11 }, // l split, a not split - { 24, 7, 5 }, // a/l both split - // 64x64 -> 32x32 - { 174, 35, 49 }, // a/l both not split - { 68, 11, 27 }, // a split, l not split - { 57, 15, 9 }, // l split, a not split - { 12, 3, 3 }, // a/l both split - }; +const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + // 8x8 -> 4x4 + { 158, 97, 94 }, // a/l both not split + { 93, 24, 99 }, // a split, l not split + { 85, 119, 44 }, // l split, a not split + { 62, 59, 67 }, // a/l both split + + // 16x16 -> 8x8 + { 149, 53, 53 }, // a/l both not split + { 94, 20, 48 }, // a split, l not split + { 83, 53, 24 }, // l split, a not split + { 52, 18, 18 }, // a/l both split + + // 32x32 -> 16x16 + { 150, 40, 39 }, // a/l both not split + { 78, 12, 26 }, // a split, l not split + { 67, 33, 11 }, // l split, a not split + { 24, 7, 5 }, // a/l both split + + // 64x64 -> 32x32 + { 174, 35, 49 }, // a/l both not split + { 68, 11, 27 }, // a split, l not split + { 57, 15, 9 }, // l split, a not split + { 12, 3, 3 }, // a/l both split + }; static const vpx_prob default_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { @@ -260,13 +263,13 @@ const vpx_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = { -PARTITION_NONE, 2, -PARTITION_HORZ, 4, -PARTITION_VERT, -PARTITION_SPLIT }; -static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { - 9, 102, 187, 225 -}; +static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { 9, 102, + 187, + 225 }; -static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { - 239, 183, 119, 96, 41 -}; +static const vpx_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { 239, 183, + 119, 96, + 41 }; static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = { 50, 126, 123, 221, 226 }; @@ -331,8 +334,8 @@ static void init_mode_probs(FRAME_CONTEXT *fc) { vp9_copy(fc->inter_mode_probs, 
default_inter_mode_probs); } -const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE(SWITCHABLE_FILTERS)] = - { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; +const vpx_tree_index vp9_switchable_interp_tree[TREE_SIZE( + SWITCHABLE_FILTERS)] = { -EIGHTTAP, 2, -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP }; void vp9_adapt_mode_probs(VP9_COMMON *cm) { int i, j; diff --git a/libs/libvpx/vp9/common/vp9_entropymode.h b/libs/libvpx/vp9/common/vp9_entropymode.h index 0ee663fe883..a756c8d0b8c 100644 --- a/libs/libvpx/vp9/common/vp9_entropymode.h +++ b/libs/libvpx/vp9/common/vp9_entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ -#define VP9_COMMON_VP9_ENTROPYMODE_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" @@ -104,4 +104,4 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMODE_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libs/libvpx/vp9/common/vp9_entropymv.c b/libs/libvpx/vp9/common/vp9_entropymv.c index a18a290cfd0..b6f052d088e 100644 --- a/libs/libvpx/vp9/common/vp9_entropymv.c +++ b/libs/libvpx/vp9/common/vp9_entropymv.c @@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; diff --git a/libs/libvpx/vp9/common/vp9_entropymv.h b/libs/libvpx/vp9/common/vp9_entropymv.h index e2fe37a327a..ee9d37973ff 100644 --- a/libs/libvpx/vp9/common/vp9_entropymv.h +++ b/libs/libvpx/vp9/common/vp9_entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ENTROPYMV_H_ -#define VP9_COMMON_VP9_ENTROPYMV_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMV_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMV_H_ #include "./vpx_config.h" @@ -25,7 +25,7 @@ struct VP9Common; void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp); static INLINE int use_mv_hp(const MV *ref) { const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv @@ -127,10 +127,10 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMV_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libs/libvpx/vp9/common/vp9_enums.h b/libs/libvpx/vp9/common/vp9_enums.h index 056b298b3db..b33a3a2978c 100644 --- a/libs/libvpx/vp9/common/vp9_enums.h +++ b/libs/libvpx/vp9/common/vp9_enums.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENUMS_H_ -#define VP9_COMMON_VP9_ENUMS_H_ +#ifndef VPX_VP9_COMMON_VP9_ENUMS_H_ +#define VPX_VP9_COMMON_VP9_ENUMS_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -41,6 +41,8 @@ typedef enum BITSTREAM_PROFILE { MAX_PROFILES } BITSTREAM_PROFILE; +typedef enum PARSE_RECON_FLAG { PARSE = 1, RECON = 2 } PARSE_RECON_FLAG; + #define BLOCK_4X4 0 #define BLOCK_4X8 1 #define BLOCK_8X4 2 @@ -140,4 +142,4 @@ typedef uint8_t PREDICTION_MODE; } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENUMS_H_ +#endif // VPX_VP9_COMMON_VP9_ENUMS_H_ diff --git a/libs/libvpx/vp9/common/vp9_filter.c b/libs/libvpx/vp9/common/vp9_filter.c index 6c43af8ce80..adbda6c825b 100644 --- a/libs/libvpx/vp9/common/vp9_filter.c +++ b/libs/libvpx/vp9/common/vp9_filter.c @@ -63,6 +63,20 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 } }; -const InterpKernel *vp9_filter_kernels[4] = { - sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters +// 4-tap filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 114, 28, -6, 0, 0 }, + { 0, 0, -10, 108, 36, -6, 0, 0 }, { 0, 0, -12, 102, 46, -8, 0, 0 }, + { 0, 0, -12, 94, 56, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 56, 94, -12, 0, 0 }, { 0, 0, -8, 46, 102, -12, 0, 0 }, + { 0, 0, -6, 36, 108, -10, 0, 0 }, { 0, 0, -6, 28, 114, -8, 0, 0 }, + { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; + +const InterpKernel *vp9_filter_kernels[5] = { + sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters, + sub_pel_filters_4 }; diff --git a/libs/libvpx/vp9/common/vp9_filter.h b/libs/libvpx/vp9/common/vp9_filter.h index 9d2b8e1dbf2..0382c88e7c0 100644 --- a/libs/libvpx/vp9/common/vp9_filter.h +++ b/libs/libvpx/vp9/common/vp9_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_FILTER_H_ -#define VP9_COMMON_VP9_FILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_FILTER_H_ +#define VPX_VP9_COMMON_VP9_FILTER_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,6 +25,7 @@ extern "C" { #define EIGHTTAP_SHARP 2 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ #define BILINEAR 3 +#define FOURTAP 4 // The codec can operate in four possible inter prediction filter mode: // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) @@ -32,10 +33,10 @@ extern "C" { typedef uint8_t INTERP_FILTER; -extern const InterpKernel *vp9_filter_kernels[4]; +extern const InterpKernel *vp9_filter_kernels[5]; #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_FILTER_H_ +#endif // VPX_VP9_COMMON_VP9_FILTER_H_ diff --git a/libs/libvpx/vp9/common/vp9_frame_buffers.h b/libs/libvpx/vp9/common/vp9_frame_buffers.h index e2cfe61b662..11be838c020 100644 --- a/libs/libvpx/vp9/common/vp9_frame_buffers.h +++ b/libs/libvpx/vp9/common/vp9_frame_buffers.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ -#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#ifndef VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ #include "vpx/vpx_frame_buffer.h" #include "vpx/vpx_integer.h" @@ -50,4 +50,4 @@ int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); } // extern "C" #endif -#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#endif // VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/libs/libvpx/vp9/common/vp9_idct.h b/libs/libvpx/vp9/common/vp9_idct.h index 3e83b8402de..94eeaf599e1 100644 --- a/libs/libvpx/vp9/common/vp9_idct.h +++ b/libs/libvpx/vp9/common/vp9_idct.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_IDCT_H_ -#define VP9_COMMON_VP9_IDCT_H_ +#ifndef VPX_VP9_COMMON_VP9_IDCT_H_ +#define VPX_VP9_COMMON_VP9_IDCT_H_ #include <assert.h> @@ -78,4 +78,4 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, } // extern "C" #endif -#endif // VP9_COMMON_VP9_IDCT_H_ +#endif // VPX_VP9_COMMON_VP9_IDCT_H_ diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.c b/libs/libvpx/vp9/common/vp9_loopfilter.c index c7c343aed5d..95d6029f3b5 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.c +++ b/libs/libvpx/vp9/common/vp9_loopfilter.c @@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi, const int mode_info_stride, + MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi; - MODE_INFO **mip2 = mi; + MODE_INFO **mip = mi8x8; + MODE_INFO **mip2 = mi8x8; // These are offsets to the next mi in the 64x64 block. It is what gets // added to the mi ptr as we go through each loop. It helps us to avoid @@ -1087,13 +1087,19 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; - unsigned int mask_16x16[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_8x8[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4_int[MI_BLOCK_SIZE] = { 0 }; + unsigned int mask_16x16[MI_BLOCK_SIZE]; + unsigned int mask_8x8[MI_BLOCK_SIZE]; + unsigned int mask_4x4[MI_BLOCK_SIZE]; + unsigned int mask_4x4_int[MI_BLOCK_SIZE]; uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; int r, c; + vp9_zero(mask_16x16); + vp9_zero(mask_8x8); + vp9_zero(mask_4x4); + vp9_zero(mask_4x4_int); + vp9_zero(lfl); + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; unsigned int mask_8x8_c = 0; @@ -1174,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0); + border_mask = ~(mi_col == 0 ?
1 : 0); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( @@ -1330,6 +1336,8 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; uint16_t mask_4x4_int = lfm->int_4x4_uv; + vp9_zero(lfl_uv); + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); // Vertical pass: do 2 rows at one time diff --git a/libs/libvpx/vp9/common/vp9_loopfilter.h b/libs/libvpx/vp9/common/vp9_loopfilter.h index 481a6cdc63c..39648a72c32 100644 --- a/libs/libvpx/vp9/common/vp9_loopfilter.h +++ b/libs/libvpx/vp9/common/vp9_loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_LOOPFILTER_H_ -#define VP9_COMMON_VP9_LOOPFILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_LOOPFILTER_H_ +#define VPX_VP9_COMMON_VP9_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "./vpx_config.h" @@ -97,7 +97,7 @@ struct VP9LfSyncData; // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(struct VP9Common *const cm, const int mi_row, - const int mi_col, MODE_INFO **mi_8x8, + const int mi_col, MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm); void vp9_filter_block_plane_ss00(struct VP9Common *const cm, @@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm); void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - struct macroblockd *mbd, int filter_level, + struct macroblockd *xd, int frame_filter_level, int y_only, int partial_frame); // Get the superblock lfm for a given mi_row, mi_col. @@ -157,4 +157,4 @@ int vp9_loop_filter_worker(void *arg1, void *unused); } // extern "C" #endif -#endif // VP9_COMMON_VP9_LOOPFILTER_H_ +#endif // VPX_VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libs/libvpx/vp9/common/vp9_mfqe.h b/libs/libvpx/vp9/common/vp9_mfqe.h index dfff8c23d65..f53e1c2f9d9 100644 --- a/libs/libvpx/vp9/common/vp9_mfqe.h +++ b/libs/libvpx/vp9/common/vp9_mfqe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MFQE_H_ -#define VP9_COMMON_VP9_MFQE_H_ +#ifndef VPX_VP9_COMMON_VP9_MFQE_H_ +#define VPX_VP9_COMMON_VP9_MFQE_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ void vp9_mfqe(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_MFQE_H_ +#endif // VPX_VP9_COMMON_VP9_MFQE_H_ diff --git a/libs/libvpx/vp9/common/vp9_mv.h b/libs/libvpx/vp9/common/vp9_mv.h index 4c8eac7213d..14dde7dd052 100644 --- a/libs/libvpx/vp9/common/vp9_mv.h +++ b/libs/libvpx/vp9/common/vp9_mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MV_H_ -#define VP9_COMMON_VP9_MV_H_ +#ifndef VPX_VP9_COMMON_VP9_MV_H_ +#define VPX_VP9_COMMON_VP9_MV_H_ #include "vpx/vpx_integer.h" @@ -52,4 +52,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MV_H_ +#endif // VPX_VP9_COMMON_VP9_MV_H_ diff --git a/libs/libvpx/vp9/common/vp9_mvref_common.h b/libs/libvpx/vp9/common/vp9_mvref_common.h index 2b2c1ba9eed..5db6772dca5 100644 --- a/libs/libvpx/vp9/common/vp9_mvref_common.h +++ b/libs/libvpx/vp9/common/vp9_mvref_common.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" @@ -263,10 +263,10 @@ static INLINE int_mv scale_mv(const MODE_INFO *mi, int ref, mv_ref_list, Done) \ do { \ if (is_inter_block(mbmi)) { \ - if ((mbmi)->ref_frame[0] != ref_frame) \ + if ((mbmi)->ref_frame[0] != (ref_frame)) \ ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \ refmv_count, mv_ref_list, Done); \ - if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != ref_frame && \ + if (has_second_ref(mbmi) && (mbmi)->ref_frame[1] != (ref_frame) && \ (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) \ ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \ refmv_count, mv_ref_list, Done); \ @@ -320,4 +320,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MVREF_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_onyxc_int.h b/libs/libvpx/vp9/common/vp9_onyxc_int.h index 1d96d92c244..662b8ef5e12 100644 --- a/libs/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libs/libvpx/vp9/common/vp9_onyxc_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ONYXC_INT_H_ -#define VP9_COMMON_VP9_ONYXC_INT_H_ +#ifndef VPX_VP9_COMMON_VP9_ONYXC_INT_H_ +#define VPX_VP9_COMMON_VP9_ONYXC_INT_H_ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -37,10 +37,9 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 1 scratch frame for the new frame, 3 for scaled references on the encoder. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 4) +// 1 scratch frame for the new frame, REFS_PER_FRAME for scaled references on +// the encoder. 
+#define FRAME_BUFFERS (REF_FRAMES + 1 + REFS_PER_FRAME) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -70,6 +69,7 @@ typedef struct { int mi_rows; int mi_cols; uint8_t released; + int frame_index; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; @@ -128,6 +128,8 @@ typedef struct VP9Common { int new_fb_idx; + int cur_show_frame_fb_idx; + #if CONFIG_VP9_POSTPROC YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG post_proc_buffer_int; @@ -256,8 +258,16 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; +static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= FRAME_BUFFERS) return NULL; + if (cm->error.error_code != VPX_CODEC_OK) return NULL; + return &cm->buffer_pool->frame_bufs[index].buf; +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -405,4 +415,4 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ONYXC_INT_H_ +#endif // VPX_VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/libs/libvpx/vp9/common/vp9_postproc.c b/libs/libvpx/vp9/common/vp9_postproc.c index dfc315eeacf..5373b021812 100644 --- a/libs/libvpx/vp9/common/vp9_postproc.c +++ b/libs/libvpx/vp9/common/vp9_postproc.c @@ -293,7 +293,7 @@ static void swap_mi_and_prev_mi(VP9_COMMON *cm) { } int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { + vp9_ppflags_t *ppflags, int unscaled_width) { const int q = VPXMIN(105, cm->lf.filter_level * 2); const int flags = ppflags->post_proc_flag; YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; @@ -359,7 +359,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) { if (!cm->postproc_state.limits) { cm->postproc_state.limits = - vpx_calloc(cm->width, sizeof(*cm->postproc_state.limits)); + vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits)); } } diff --git a/libs/libvpx/vp9/common/vp9_postproc.h b/libs/libvpx/vp9/common/vp9_postproc.h index 6059094114b..67efc1b4e4b 100644 --- a/libs/libvpx/vp9/common/vp9_postproc.h +++ b/libs/libvpx/vp9/common/vp9_postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_POSTPROC_H_ -#define VP9_COMMON_VP9_POSTPROC_H_ +#ifndef VPX_VP9_COMMON_VP9_POSTPROC_H_ +#define VPX_VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" @@ -38,7 +38,7 @@ struct VP9Common; #define MFQE_PRECISION 4 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); + vp9_ppflags_t *ppflags, int unscaled_width); void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); @@ -50,4 +50,4 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, } // extern "C" #endif -#endif // VP9_COMMON_VP9_POSTPROC_H_ +#endif // VPX_VP9_COMMON_VP9_POSTPROC_H_ diff --git a/libs/libvpx/vp9/common/vp9_ppflags.h b/libs/libvpx/vp9/common/vp9_ppflags.h index b8b647bf18d..a0e30176269 100644 --- a/libs/libvpx/vp9/common/vp9_ppflags.h +++ b/libs/libvpx/vp9/common/vp9_ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_PPFLAGS_H_ -#define VP9_COMMON_VP9_PPFLAGS_H_ +#ifndef VPX_VP9_COMMON_VP9_PPFLAGS_H_ +#define VPX_VP9_COMMON_VP9_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ typedef struct { } // extern "C" #endif -#endif // VP9_COMMON_VP9_PPFLAGS_H_ +#endif // VPX_VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/libs/libvpx/vp9/common/vp9_pred_common.c b/libs/libvpx/vp9/common/vp9_pred_common.c index a7ddc0b9515..375cb4d76c7 100644 --- a/libs/libvpx/vp9/common/vp9_pred_common.c +++ b/libs/libvpx/vp9/common/vp9_pred_common.c @@ -13,6 +13,32 @@ #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" +int vp9_compound_reference_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; + + return 0; +} + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int ctx; @@ -229,9 +255,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); } else { - pred_context = 1 + - 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || - edge_mi->ref_frame[1] == GOLDEN_FRAME); + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter const int above_has_second = has_second_ref(above_mi); diff --git a/libs/libvpx/vp9/common/vp9_pred_common.h b/libs/libvpx/vp9/common/vp9_pred_common.h index 8400bd70f1d..ee59669359c 100644 --- a/libs/libvpx/vp9/common/vp9_pred_common.h +++ b/libs/libvpx/vp9/common/vp9_pred_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_PRED_COMMON_H_ -#define VP9_COMMON_VP9_PRED_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_PRED_COMMON_H_ +#define VPX_VP9_COMMON_VP9_PRED_COMMON_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" @@ -145,6 +145,10 @@ static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } +int vp9_compound_reference_allowed(const VP9_COMMON *cm); + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm); + // Returns a context number for the given MB prediction signal // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. 
@@ -176,12 +180,6 @@ static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size, - const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { - return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs); -} - static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, struct tx_counts *tx_counts) { switch (max_tx_size) { @@ -196,4 +194,4 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_PRED_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_quant_common.h b/libs/libvpx/vp9/common/vp9_quant_common.h index 4bae4a89677..ec8b9f4c6a7 100644 --- a/libs/libvpx/vp9/common/vp9_quant_common.h +++ b/libs/libvpx/vp9/common/vp9_quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ -#define VP9_COMMON_VP9_QUANT_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ +#define VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ #include "vpx/vpx_codec.h" #include "vp9/common/vp9_seg_common.h" @@ -33,4 +33,4 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // VP9_COMMON_VP9_QUANT_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_reconinter.c b/libs/libvpx/vp9/common/vp9_reconinter.c index a108a65153b..04f41e6a336 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.c +++ b/libs/libvpx/vp9/common/vp9_reconinter.c @@ -63,14 +63,14 @@ static INLINE int round_mv_comp_q4(int value) { } static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { - MV res = { - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.row + mi->bmi[1].as_mv[idx].as_mv.row + - mi->bmi[2].as_mv[idx].as_mv.row + mi->bmi[3].as_mv[idx].as_mv.row), - round_mv_comp_q4( - mi->bmi[0].as_mv[idx].as_mv.col + mi->bmi[1].as_mv[idx].as_mv.col + - mi->bmi[2].as_mv[idx].as_mv.col + mi->bmi[3].as_mv[idx].as_mv.col) - }; + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; return res; } @@ -136,7 +136,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, const struct scale_factors *const sf = &xd->block_refs[ref]->sf; struct buf_2d *const pre_buf = &pd->pre[ref]; struct buf_2d *const dst_buf = &pd->dst; - uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x; + uint8_t *const dst = dst_buf->buf + (int64_t)dst_buf->stride * y + x; const MV mv = mi->sb_type < BLOCK_8X8 ? average_split_mvs(pd, mi, ref, block) : mi->mv[ref].as_mv; @@ -178,7 +178,7 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, xs = sf->x_step_q4; ys = sf->y_step_q4; } else { - pre = pre_buf->buf + (y * pre_buf->stride + x); + pre = pre_buf->buf + ((int64_t)y * pre_buf->stride + x); scaled_mv.row = mv_q4.row; scaled_mv.col = mv_q4.col; xs = ys = 16; diff --git a/libs/libvpx/vp9/common/vp9_reconinter.h b/libs/libvpx/vp9/common/vp9_reconinter.h index bb9291a2646..12b545831a8 100644 --- a/libs/libvpx/vp9/common/vp9_reconinter.h +++ b/libs/libvpx/vp9/common/vp9_reconinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_RECONINTER_H_ -#define VP9_COMMON_VP9_RECONINTER_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTER_H_ +#define VPX_VP9_COMMON_VP9_RECONINTER_H_ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" @@ -61,24 +61,25 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *mv_q3, + int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, - int do_avg, const InterpKernel *kernel, + int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y); #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); #endif -static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride, - const struct scale_factors *sf) { +static INLINE int64_t scaled_buffer_offset(int x_offset, int y_offset, + int stride, + const struct scale_factors *sf) { const int x = sf ? sf->scale_value_x(x_offset, sf) : x_offset; const int y = sf ? sf->scale_value_y(y_offset, sf) : y_offset; - return y * stride + x; + return (int64_t)y * stride + x; } static INLINE void setup_pred_plane(struct buf_2d *dst, uint8_t *src, @@ -103,4 +104,4 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTER_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTER_H_ diff --git a/libs/libvpx/vp9/common/vp9_reconintra.h b/libs/libvpx/vp9/common/vp9_reconintra.h index 78e41c88118..426a35ebfa2 100644 --- a/libs/libvpx/vp9/common/vp9_reconintra.h +++ b/libs/libvpx/vp9/common/vp9_reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_RECONINTRA_H_ -#define VP9_COMMON_VP9_RECONINTRA_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTRA_H_ +#define VPX_VP9_COMMON_VP9_RECONINTRA_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -28,4 +28,4 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTRA_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl index 22b67ecacee..6980b9b7fbd 100644 --- a/libs/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libs/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -62,18 +62,18 @@ () add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; -add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. 
- specialize qw/vp9_iht4x4_16_add sse2/; - specialize qw/vp9_iht8x8_64_add sse2/; - specialize qw/vp9_iht16x16_256_add sse2/; + specialize qw/vp9_iht4x4_16_add neon sse2 vsx/; + specialize qw/vp9_iht8x8_64_add neon sse2 vsx/; + specialize qw/vp9_iht16x16_256_add neon sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. - specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; - specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht4x4_16_add dspr2 msa/; + specialize qw/vp9_iht8x8_64_add dspr2 msa/; specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -100,7 +100,13 @@ () add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; + + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/; + specialize qw/vp9_highbd_iht16x16_256_add neon sse4_1/; + } } # @@ -123,28 +129,22 @@ () add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; - -add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error avx2 sse2/; specialize qw/vp9_block_error_fp avx2 sse2/; - specialize qw/vp9_fdct8x8_quant neon ssse3/; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; } else { specialize qw/vp9_block_error avx2 msa sse2/; specialize qw/vp9_block_error_fp neon avx2 sse2/; - - specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # fdct functions @@ -177,11 +177,20 @@ () add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; +# +# Apply temporal filter +# if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") 
{ -add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; -specialize qw/vp9_temporal_filter_apply sse4_1/; +add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; +specialize qw/vp9_apply_temporal_filter sse4_1/; + + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; + specialize qw/vp9_highbd_apply_temporal_filter sse4_1/; + } } + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE @@ -199,7 +208,7 @@ () add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; + add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count"; } # End vp9_high encoder functions diff --git a/libs/libvpx/vp9/common/vp9_scale.h b/libs/libvpx/vp9/common/vp9_scale.h index ada8dbaad5c..2f3b609483e 100644 --- a/libs/libvpx/vp9/common/vp9_scale.h +++ b/libs/libvpx/vp9/common/vp9_scale.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_SCALE_H_ -#define VP9_COMMON_VP9_SCALE_H_ +#ifndef VPX_VP9_COMMON_VP9_SCALE_H_ +#define VPX_VP9_COMMON_VP9_SCALE_H_ #include "vp9/common/vp9_mv.h" #include "vpx_dsp/vpx_convolve.h" @@ -20,7 +20,7 @@ extern "C" { #define REF_SCALE_SHIFT 14 #define REF_NO_SCALE (1 << REF_SCALE_SHIFT) -#define REF_INVALID_SCALE -1 +#define REF_INVALID_SCALE (-1) struct scale_factors { int x_scale_fp; // horizontal fixed point scale factor @@ -42,7 +42,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); #if CONFIG_VP9_HIGHBITDEPTH void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h, - int use_high); + int use_highbd); #else void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); @@ -68,4 +68,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCALE_H_ +#endif // VPX_VP9_COMMON_VP9_SCALE_H_ diff --git a/libs/libvpx/vp9/common/vp9_scan.h b/libs/libvpx/vp9/common/vp9_scan.h index b3520e7dccb..72a9a5ec472 100644 --- a/libs/libvpx/vp9/common/vp9_scan.h +++ b/libs/libvpx/vp9/common/vp9_scan.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_SCAN_H_ -#define VP9_COMMON_VP9_SCAN_H_ +#ifndef VPX_VP9_COMMON_VP9_SCAN_H_ +#define VPX_VP9_COMMON_VP9_SCAN_H_ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -55,4 +55,4 @@ static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCAN_H_ +#endif // VPX_VP9_COMMON_VP9_SCAN_H_ diff --git a/libs/libvpx/vp9/common/vp9_seg_common.h b/libs/libvpx/vp9/common/vp9_seg_common.h index b9bf75d5802..b63e4f49990 100644 --- a/libs/libvpx/vp9/common/vp9_seg_common.h +++ b/libs/libvpx/vp9/common/vp9_seg_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_SEG_COMMON_H_ -#define VP9_COMMON_VP9_SEG_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_SEG_COMMON_H_ +#define VPX_VP9_COMMON_VP9_SEG_COMMON_H_ #include "vpx_dsp/prob.h" @@ -78,4 +78,4 @@ extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_SEG_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_thread_common.c b/libs/libvpx/vp9/common/vp9_thread_common.c index 8d44e91f2e5..c79d9b7f08c 100644 --- a/libs/libvpx/vp9/common/vp9_thread_common.c +++ b/libs/libvpx/vp9/common/vp9_thread_common.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> +#include <limits.h> #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -38,11 +40,11 @@ static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; + pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1]; mutex_lock(mutex); while (c > lf_sync->cur_sb_col[r - 1] - nsync) { - pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + pthread_cond_wait(&lf_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -69,12 +71,12 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } if (sig) { - mutex_lock(&lf_sync->mutex_[r]); + mutex_lock(&lf_sync->mutex[r]); lf_sync->cur_sb_col[r] = cur; - pthread_cond_signal(&lf_sync->cond_[r]); - pthread_mutex_unlock(&lf_sync->mutex_[r]); + pthread_cond_signal(&lf_sync->cond[r]); + pthread_mutex_unlock(&lf_sync->mutex[r]); } #else (void)lf_sync; @@ -91,6 +93,7 @@ static INLINE void thread_loop_filter_rows( int y_only, VP9LfSync *const lf_sync) { const int num_planes = y_only ? 1 : MAX_MB_PLANE; const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + const int num_active_workers = lf_sync->num_active_workers; int mi_row, mi_col; enum lf_path path; if (y_only) @@ -102,8 +105,10 @@ else path = LF_PATH_SLOW; + assert(num_active_workers > 0); + for (mi_row = start; mi_row < stop; - mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { + mi_row += num_active_workers * MI_BLOCK_SIZE) { MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); @@ -157,10 +162,12 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; - // Decoder may allocate more threads than number of tiles based on user's - // input. - const int tile_cols = 1 << cm->log2_tile_cols; - const int num_workers = VPXMIN(nworkers, tile_cols); + const int num_tile_cols = 1 << cm->log2_tile_cols; + // Limit the number of workers to prevent changes in frame dimensions from + // causing incorrect sync calculations when sb_rows < threads/tile_cols. + // Further restrict them by the number of tile columns should the user + // request more as this implementation doesn't scale well beyond that. + const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows)); int i; if (!lf_sync->sync_range || sb_rows != lf_sync->rows || @@ -168,6 +175,7 @@ vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } + lf_sync->num_active_workers = num_workers; // Initialize cur_sb_col to -1 for all SB rows.
memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -231,6 +239,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k @@ -253,19 +283,38 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex_, - vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); - if (lf_sync->mutex_) { + CHECK_MEM_ERROR(cm, lf_sync->mutex, + vpx_malloc(sizeof(*lf_sync->mutex) * rows)); + if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); + pthread_mutex_init(&lf_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, lf_sync->cond_, - vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - if (lf_sync->cond_) { + CHECK_MEM_ERROR(cm, lf_sync->cond, + vpx_malloc(sizeof(*lf_sync->cond) * rows)); + if (lf_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); + pthread_cond_init(&lf_sync->cond[i], NULL); + } + } + pthread_mutex_init(&lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + int i; + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + int i; + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } } } @@ -274,10 +323,16 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; + lf_sync->num_active_workers = lf_sync->num_workers; CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. 
lf_sync->sync_range = get_sync_range(width); } @@ -288,27 +343,149 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { #if CONFIG_MULTITHREAD int i; - if (lf_sync->mutex_ != NULL) { + if (lf_sync->mutex != NULL) { for (i = 0; i < lf_sync->rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); + pthread_mutex_destroy(&lf_sync->mutex[i]); } - vpx_free(lf_sync->mutex_); + vpx_free(lf_sync->mutex); } - if (lf_sync->cond_ != NULL) { + if (lf_sync->cond != NULL) { for (i = 0; i < lf_sync->rows; ++i) { - pthread_cond_destroy(&lf_sync->cond_[i]); + pthread_cond_destroy(&lf_sync->cond[i]); } - vpx_free(lf_sync->cond_); + vpx_free(lf_sync->cond); + } + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + pthread_mutex_destroy(&lf_sync->lf_mutex); + if (lf_sync->recon_done_cond != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); + } + vpx_free(lf_sync->recon_done_cond); } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. vp9_zero(*lf_sync); } } +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + int cur_row; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(&lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. + * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(&lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(&lf_sync->lf_mutex); + if (lf_sync->corrupted) { + int row = return_val >> MI_BLOCK_SIZE_LOG2; + pthread_mutex_lock(&lf_sync->mutex[row]); + lf_sync->cur_sb_col[row] = INT_MAX; + pthread_cond_signal(&lf_sync->cond[row]); + pthread_mutex_unlock(&lf_sync->mutex[row]); + return_val = -1; + } + pthread_mutex_unlock(&lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. 
+ * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); +} + // Accumulate frame counts. void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/libs/libvpx/vp9/common/vp9_thread_common.h b/libs/libvpx/vp9/common/vp9_thread_common.h index 0f7c3ff7488..94c9de65932 100644 --- a/libs/libvpx/vp9/common/vp9_thread_common.h +++ b/libs/libvpx/vp9/common/vp9_thread_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_ -#define VP9_COMMON_VP9_THREAD_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ +#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" #include "vpx_util/vpx_thread.h" @@ -24,8 +24,8 @@ struct FRAME_COUNTS; // Loopfilter row synchronization typedef struct VP9LfSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the loop-filtered superblock index in each row. int *cur_sb_col; @@ -36,7 +36,16 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; - int num_workers; + int num_workers; // number of allocated workers. + int num_active_workers; // number of scheduled workers. + +#if CONFIG_MULTITHREAD + pthread_mutex_t lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; } VP9LfSync; // Allocate memory for loopfilter row synchronization. 
@@ -53,6 +62,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_loopfilter_job(LFWorkerData *lf_data, VP9LfSync *lf_sync); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); @@ -60,4 +80,4 @@ void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, } // extern "C" #endif -#endif // VP9_COMMON_VP9_THREAD_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ diff --git a/libs/libvpx/vp9/common/vp9_tile_common.h b/libs/libvpx/vp9/common/vp9_tile_common.h index 1b11c2680de..4ccf0a3d5f6 100644 --- a/libs/libvpx/vp9/common/vp9_tile_common.h +++ b/libs/libvpx/vp9/common/vp9_tile_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ -#define VP9_COMMON_VP9_TILE_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_TILE_COMMON_H_ +#define VPX_VP9_COMMON_VP9_TILE_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, } // extern "C" #endif -#endif // VP9_COMMON_VP9_TILE_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 00000000000..57b79a732da --- /dev/null +++ b/libs/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + 
x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = _mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = 
_mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); 
+ x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); 
+ } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c new file mode 100644 index 00000000000..af158536f92 --- /dev/null +++ b/libs/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst4_sse4_1(__m128i *const io) { + const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0); + const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0); + const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0); + const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0); + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2]; + __m128i temp[2]; + + transpose_32bit_4x4(io, io); + + extend_64bit(io[0], temp); + s0[0] = _mm_mul_epi32(pair_c1, temp[0]); + s0[1] = _mm_mul_epi32(pair_c1, temp[1]); + s1[0] = _mm_mul_epi32(pair_c2, temp[0]); + s1[1] = _mm_mul_epi32(pair_c2, temp[1]); + + extend_64bit(io[1], temp); + s2[0] = _mm_mul_epi32(pair_c3, temp[0]); + s2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + extend_64bit(io[2], temp); + s3[0] = _mm_mul_epi32(pair_c4, temp[0]); + s3[1] = _mm_mul_epi32(pair_c4, temp[1]); + s4[0] = _mm_mul_epi32(pair_c1, temp[0]); + s4[1] = _mm_mul_epi32(pair_c1, temp[1]); + + extend_64bit(io[3], temp); + s5[0] = _mm_mul_epi32(pair_c2, temp[0]); + s5[1] = _mm_mul_epi32(pair_c2, temp[1]); + s6[0] = _mm_mul_epi32(pair_c4, temp[0]); + s6[1] = _mm_mul_epi32(pair_c4, temp[1]); + + t0[0] = _mm_add_epi64(s0[0], s3[0]); + t0[1] = _mm_add_epi64(s0[1], s3[1]); + t0[0] = _mm_add_epi64(t0[0], s5[0]); + t0[1] = _mm_add_epi64(t0[1], s5[1]); + t1[0] = _mm_sub_epi64(s1[0], s4[0]); + t1[1] = _mm_sub_epi64(s1[1], s4[1]); + t1[0] = _mm_sub_epi64(t1[0], s6[0]); + t1[1] = _mm_sub_epi64(t1[1], s6[1]); + temp[0] = _mm_sub_epi32(io[0], io[2]); + temp[0] = _mm_add_epi32(temp[0], io[3]); + extend_64bit(temp[0], temp); + t2[0] = _mm_mul_epi32(pair_c3, temp[0]); + t2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + s0[0] = _mm_add_epi64(t0[0], s2[0]); + s0[1] = _mm_add_epi64(t0[1], s2[1]); + s1[0] = _mm_add_epi64(t1[0], s2[0]); + s1[1] = _mm_add_epi64(t1[1], s2[1]); + s3[0] = _mm_add_epi64(t0[0], t1[0]); + s3[1] = _mm_add_epi64(t0[1], t1[1]); + s3[0] = _mm_sub_epi64(s3[0], s2[0]); + s3[1] = _mm_sub_epi64(s3[1], s2[1]); + + s0[0] = dct_const_round_shift_64bit(s0[0]); + s0[1] = dct_const_round_shift_64bit(s0[1]); + s1[0] = dct_const_round_shift_64bit(s1[0]); + 
s1[1] = dct_const_round_shift_64bit(s1[1]); + s2[0] = dct_const_round_shift_64bit(t2[0]); + s2[1] = dct_const_round_shift_64bit(t2[1]); + s3[0] = dct_const_round_shift_64bit(s3[0]); + s3[1] = dct_const_round_shift_64bit(s3[1]); + io[0] = pack_4(s0[0], s0[1]); + io[1] = pack_4(s1[0], s1[1]); + io[2] = pack_4(s2[0], s2[1]); + io[3] = pack_4(s3[0], s3[1]); +} + +void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 00000000000..7d949b6dbce --- /dev/null +++ b/libs/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + 
x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = 
_mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 6996260e261..ad693718c0e 100644 --- a/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libs/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -10,8 +10,6 @@ #include "./vp9_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { @@ -22,23 +20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[1] = load_input_data8(input + 8); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct4_sse2(in); idct4_sse2(in); break; - case 1: // ADST_DCT + case ADST_DCT: idct4_sse2(in); iadst4_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst4_sse2(in); idct4_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst4_sse2(in); iadst4_sse2(in); break; - default: assert(0); break; } // Final round and shift @@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { - case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 1: // ADST_DCT - idct8_sse2(in); + case ADST_DCT: + vpx_idct8_sse2(in); iadst8_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; - default: assert(0); break; } // Final rounding and shift @@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, load_buffer_8x16(input, in1); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct16_sse2(in0, in1); idct16_sse2(in0, in1); break; - 
case 1: // ADST_DCT + case ADST_DCT: idct16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - case 2: // DCT_ADST + case DCT_ADST: iadst16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - default: assert(0); break; } write_buffer_8x16(dest, in0, stride); diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.c b/libs/libvpx/vp9/decoder/vp9_decodeframe.c index d0e896c13fc..7d66cb2b27d 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.c @@ -23,6 +23,9 @@ #include "vpx_ports/mem_ops.h" #include "vpx_scale/vpx_scale.h" #include "vpx_util/vpx_thread.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_common.h" @@ -42,34 +45,15 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_job_queue.h" #define MAX_VP9_HEADER_SIZE 80 -static int is_compound_reference_allowed(const VP9_COMMON *cm) { - int i; - for (i = 1; i < REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; - - return 0; -} - -static void setup_compound_reference_mode(VP9_COMMON *cm) { - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } -} +typedef int (*predict_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); + +typedef void (*intra_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return len != 0 && len <= (size_t)(end - start); @@ -118,7 +102,7 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) { static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, vpx_reader *r) { - if (is_compound_reference_allowed(cm)) { + if (vp9_compound_reference_allowed(cm)) { return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE) : SINGLE_REFERENCE; @@ -351,20 +335,121 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, } } +static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + if (!mi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const scan_order *sc = (plane || xd->lossless) + ? 
&vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; + *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static void predict_and_reconstruct_intra_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, + int plane, int row, + int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride, + dst, pd->dst.stride, col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + if (*pd->eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, + pd->dst.stride, *pd->eob); + } + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, - int plane, int row, int col, - TX_SIZE tx_size) { + int plane, int row, int col, TX_SIZE tx_size, + int mi_row, int mi_col) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (eob > 0) { + inverse_transform_block_inter(xd, plane, tx_size, dst, pd->dst.stride, eob); + } +#if CONFIG_MISMATCH_DEBUG + { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, blk_w, + blk_h, xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#else + (void)mi_row; + (void)mi_col; +#endif + return eob; +} + +static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; const scan_order *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); + *pd->eob = eob; + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + +static int reconstruct_inter_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, int plane, + int row, int col, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = *pd->eob; + + (void)mi; if (eob > 0) { inverse_transform_block_inter( xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col], pd->dst.stride, eob); } + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + return eob; } @@ -715,6 +800,25 @@ static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl, } } +static MODE_INFO *set_offsets_recon(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, int bw, int bh, + int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + const TileInfo *const 
tile = &xd->tile; + xd->mi = cm->mi_grid_visible + offset; + + set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, int bh, int x_mis, int y_mis, int bwl, int bhl) { @@ -744,6 +848,66 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, return xd->mi[0]; } +static INLINE int predict_recon_inter(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + predict_recon_func func) { + int eobtotal = 0; + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += func(twd, mi, plane, row, col, tx_size); + } + return eobtotal; +} + +static INLINE void predict_recon_intra(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + intra_recon_func func) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 
0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + func(twd, mi, plane, row, col, tx_size); + } +} + static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { VP9_COMMON *const cm = &pbi->common; @@ -801,6 +965,24 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } else { // Prediction dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); +#if CONFIG_MISMATCH_DEBUG + { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif // Reconstruction if (!mi->skip) { @@ -829,8 +1011,8 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, for (row = 0; row < max_blocks_high; row += step) for (col = 0; col < max_blocks_wide; col += step) - eobtotal += - reconstruct_inter_block(twd, mi, plane, row, col, tx_size); + eobtotal += reconstruct_inter_block(twd, mi, plane, row, col, + tx_size, mi_row, mi_col); } if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter @@ -844,6 +1026,98 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } } +static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets_recon(cm, xd, mi_row, mi_col, bw, bh, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, + predict_and_reconstruct_intra_block_row_mt); + } else { + // Prediction + dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); + + // Reconstruction + if (!mi->skip) { + predict_recon_inter(xd, mi, twd, reconstruct_inter_block_row_mt); + } + } + + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); +} + +static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, bw, bh, x_mis, + y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid 
block size."); + } + + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt); + } else { + if (!mi->skip) { + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int *eob[MAX_MB_PLANE]; + int plane; + int eobtotal; + // Based on eobtotal and bsize, this may be mi->skip may be set to true + // In that case dqcoeff and eob need to be backed up and restored as + // recon_block will not increment these pointers for skip cases + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + dqcoeff[plane] = pd->dqcoeff; + eob[plane] = pd->eob; + } + eobtotal = predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt); + + if (bsize >= BLOCK_8X8 && eobtotal == 0) { + mi->skip = 1; // skip loopfilter + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + struct macroblockd_plane *pd = &xd->plane[plane]; + pd->dqcoeff = dqcoeff[plane]; + pd->eob = eob[plane]; + } + } + } + } + + xd->corrupted |= vpx_reader_has_error(r); +} + static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row, int mi_col, int bsl) { const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col; @@ -950,6 +1224,75 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); } +static void process_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2, int parse_recon_flag, + process_block_fn_t process_block) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + if (parse_recon_flag & PARSE) { + *xd->partition = + read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); + } + + partition = *xd->partition; + xd->partition++; + + subsize = get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + process_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + process_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + process_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + process_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + process_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + process_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2, + parse_recon_flag, process_block); + process_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, + n8x8_l2, parse_recon_flag, process_block); + break; + default: assert(0 && "Invalid partition 
type"); + } + } + + if (parse_recon_flag & PARSE) { + // update partition context + if ((bsize == BLOCK_8X8 || partition != PARTITION_SPLIT) && + bsize >= BLOCK_8X8) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); + } +} + static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, size_t read_size, struct vpx_internal_error_info *error_info, @@ -1148,9 +1491,15 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { // Allocations in vp9_alloc_context_buffers() depend on individual // dimensions as well as the overall size. if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { - if (vp9_alloc_context_buffers(cm, width, height)) + if (vp9_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. + cm->width = 0; + cm->height = 0; vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate context buffers"); + } } else { vp9_set_mb_mi(cm, width, height); } @@ -1348,6 +1697,318 @@ static void get_tile_buffers(VP9Decoder *pbi, const uint8_t *data, } } +static void map_write(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); + row_mt_worker_data->recon_map[map_idx] = 1; + pthread_cond_signal(&row_mt_worker_data->recon_sync_cond[sync_idx]); + pthread_mutex_unlock(&row_mt_worker_data->recon_sync_mutex[sync_idx]); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static void map_read(RowMTWorkerData *const row_mt_worker_data, int map_idx, + int sync_idx) { +#if CONFIG_MULTITHREAD + volatile int8_t *map = row_mt_worker_data->recon_map + map_idx; + pthread_mutex_t *const mutex = + &row_mt_worker_data->recon_sync_mutex[sync_idx]; + pthread_mutex_lock(mutex); + while (!(*map)) { + pthread_cond_wait(&row_mt_worker_data->recon_sync_cond[sync_idx], mutex); + } + pthread_mutex_unlock(mutex); +#else + (void)row_mt_worker_data; + (void)map_idx; + (void)sync_idx; +#endif // CONFIG_MULTITHREAD +} + +static int lpf_map_write_check(VP9LfSync *lf_sync, int row, int num_tile_cols) { + int return_val = 0; +#if CONFIG_MULTITHREAD + int corrupted; + pthread_mutex_lock(&lf_sync->lf_mutex); + corrupted = lf_sync->corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + if (!corrupted) { + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tile_cols == lf_sync->num_tiles_done[row]) return_val = 1; + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); + } +#else + (void)lf_sync; + (void)row; + (void)num_tile_cols; +#endif + return return_val; +} + +static void vp9_tile_done(VP9Decoder *pbi) { +#if CONFIG_MULTITHREAD + int terminate; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int all_parse_done = 1 << pbi->common.log2_tile_cols; + pthread_mutex_lock(&row_mt_worker_data->recon_done_mutex); + row_mt_worker_data->num_tiles_done++; + terminate = all_parse_done == row_mt_worker_data->num_tiles_done; + pthread_mutex_unlock(&row_mt_worker_data->recon_done_mutex); + if (terminate) { + vp9_jobq_terminate(&row_mt_worker_data->jobq); + } +#else + (void)pbi; +#endif +} + +static void vp9_jobq_alloc(VP9Decoder *pbi) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = 
pbi->row_mt_worker_data; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + const size_t jobq_size = (tile_cols * sb_rows * 2 + sb_rows) * sizeof(Job); + + if (jobq_size > row_mt_worker_data->jobq_size) { + vpx_free(row_mt_worker_data->jobq_buf); + CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, + jobq_size); + row_mt_worker_data->jobq_size = jobq_size; + } +} + +static void recon_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int is_last_row, VP9LfSync *lf_sync, + int cur_tile_col) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int tile_cols = 1 << cm->log2_tile_cols; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + int mi_col_start = tile_data->xd.tile.mi_col_start; + int mi_col_end = tile_data->xd.tile.mi_col_end; + int mi_col; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (cur_sb_row * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + + // Top Dependency + if (cur_sb_row) { + map_read(row_mt_worker_data, ((cur_sb_row - 1) * sb_cols) + c, + ((cur_sb_row - 1) * tile_cols) + cur_tile_col); + } + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + (sb_num * PARTITIONS_PER_SB); + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, RECON, + recon_block); + if (cm->lf.filter_level && !cm->skip_loop_filter) { + // Queue LPF_JOB + int is_lpf_job_ready = 0; + + if (mi_col + MI_BLOCK_SIZE >= mi_col_end) { + // Checks if this row has been decoded in all tiles + is_lpf_job_ready = lpf_map_write_check(lf_sync, cur_sb_row, tile_cols); + + if (is_lpf_job_ready) { + Job lpf_job; + lpf_job.job_type = LPF_JOB; + if (cur_sb_row > 0) { + lpf_job.row_num = mi_row - MI_BLOCK_SIZE; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + if (is_last_row) { + lpf_job.row_num = mi_row; + vp9_jobq_queue(&row_mt_worker_data->jobq, &lpf_job, + sizeof(lpf_job)); + } + } + } + } + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + cur_tile_col); + } +} + +static void parse_tile_row(TileWorkerData *tile_data, VP9Decoder *pbi, + int mi_row, int cur_tile_col, uint8_t **data_end) { + int mi_col; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + TileInfo *tile = &tile_data->xd.tile; + TileBuffer *const buf = &pbi->tile_buffers[cur_tile_col]; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + + vp9_zero(tile_data->dqcoeff); + vp9_tile_init(tile, cm, 0, cur_tile_col); + + /* Update reader only at the beginning of each row in a tile */ + if (mi_row == 0) { + setup_token_decoder(buf->data, *data_end, buf->size, &tile_data->error_info, + &tile_data->bit_reader, pbi->decrypt_cb, + 
pbi->decrypt_state); + } + vp9_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff); + tile_data->xd.error_info = &tile_data->error_info; + + vp9_zero(tile_data->xd.left_context); + vp9_zero(tile_data->xd.left_seg_context); + for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + int plane; + const int sb_num = (r * (aligned_cols >> MI_BLOCK_SIZE_LOG2) + c); + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = + row_mt_worker_data->eob[plane] + (sb_num << EOBS_PER_SB_LOG2); + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane] + (sb_num << DQCOEFFS_PER_SB_LOG2); + } + tile_data->xd.partition = + row_mt_worker_data->partition + sb_num * PARTITIONS_PER_SB; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, PARSE, + parse_block); + } +} + +static int row_decode_worker_hook(void *arg1, void *arg2) { + ThreadData *const thread_data = (ThreadData *)arg1; + uint8_t **data_end = (uint8_t **)arg2; + VP9Decoder *const pbi = thread_data->pbi; + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int tile_cols = 1 << cm->log2_tile_cols; + Job job; + LFWorkerData *lf_data = thread_data->lf_data; + VP9LfSync *lf_sync = thread_data->lf_sync; + volatile int corrupted = 0; + + while (!vp9_jobq_dequeue(&row_mt_worker_data->jobq, &job, sizeof(job), 1)) { + int mi_col; + const int mi_row = job.row_num; + + if (job.job_type == LPF_JOB) { + lf_data->start = mi_row; + lf_data->stop = lf_data->start + MI_BLOCK_SIZE; + + if (cm->lf.filter_level && !cm->skip_loop_filter && + mi_row < cm->mi_rows) { + vp9_loopfilter_job(lf_data, lf_sync); + } + } else if (job.job_type == RECON_JOB) { + const int cur_sb_row = mi_row >> MI_BLOCK_SIZE_LOG2; + const int is_last_row = sb_rows - 1 == cur_sb_row; + TileWorkerData twd_recon; + TileWorkerData *const tile_data_recon = &twd_recon; + int mi_col_start, mi_col_end; + + tile_data_recon->xd = pbi->mb; + vp9_tile_init(&tile_data_recon->xd.tile, cm, 0, job.tile_col); + vp9_init_macroblockd(cm, &tile_data_recon->xd, tile_data_recon->dqcoeff); + mi_col_start = tile_data_recon->xd.tile.mi_col_start; + mi_col_end = tile_data_recon->xd.tile.mi_col_end; + + if (setjmp(tile_data_recon->error_info.jmp)) { + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + tile_data_recon->error_info.setjmp = 0; + corrupted = 1; + for (mi_col = mi_col_start; mi_col < mi_col_end; + mi_col += MI_BLOCK_SIZE) { + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; + map_write(row_mt_worker_data, (cur_sb_row * sb_cols) + c, + (cur_sb_row * tile_cols) + job.tile_col); + } + if (is_last_row) { + vp9_tile_done(pbi); + } + continue; + } + + tile_data_recon->error_info.setjmp = 1; + tile_data_recon->xd.error_info = &tile_data_recon->error_info; + + recon_tile_row(tile_data_recon, pbi, mi_row, is_last_row, lf_sync, + job.tile_col); + + if (corrupted) + vpx_internal_error(&tile_data_recon->error_info, + VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + if (is_last_row) { + vp9_tile_done(pbi); + } + } else if (job.job_type == PARSE_JOB) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[job.tile_col]; + + if (setjmp(tile_data->error_info.jmp)) { + 
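// A parse error longjmp()s from vpx_internal_error() back to this
+ // setjmp() point. The handler marks the frame corrupted but still calls
+ // vp9_tile_done() so the job queue can terminate, then keeps draining
+ // jobs instead of exiting the worker.
+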
tile_data->error_info.setjmp = 0; + corrupted = 1; + vp9_tile_done(pbi); + continue; + } + + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? 0 : &tile_data->counts; + + tile_data->error_info.setjmp = 1; + + parse_tile_row(tile_data, pbi, mi_row, job.tile_col, data_end); + + corrupted |= tile_data->xd.corrupted; + if (corrupted) + vpx_internal_error(&tile_data->error_info, VPX_CODEC_CORRUPT_FRAME, + "Failed to decode tile data"); + + /* Queue in the recon_job for this row */ + { + Job recon_job; + recon_job.row_num = mi_row; + recon_job.tile_col = job.tile_col; + recon_job.job_type = RECON_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &recon_job, + sizeof(recon_job)); + } + + /* Queue next parse job */ + if (mi_row + MI_BLOCK_SIZE < cm->mi_rows) { + Job parse_job; + parse_job.row_num = mi_row + MI_BLOCK_SIZE; + parse_job.tile_col = job.tile_col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, + sizeof(parse_job)); + } + } + } + + return !corrupted; +} + static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, const uint8_t *data_end) { VP9_COMMON *const cm = &pbi->common; @@ -1426,7 +2087,29 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + if (pbi->row_mt == 1) { + int plane; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + PARSE, parse_block); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + process_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4, + RECON, recon_block); + } else { + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } } pbi->mb.corrupted |= tile_data->xd.corrupted; if (pbi->mb.corrupted) @@ -1471,6 +2154,25 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, return vpx_reader_find_end(&tile_data->bit_reader); } +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. 
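// The two worker hooks implement different threading strategies:
// row_decode_worker_hook above drains a shared job queue of PARSE_JOB,
// RECON_JOB and LPF_JOB entries so that later rows of a tile can be parsed
// while earlier rows are still being reconstructed; reconstruction of a
// superblock row is gated on the row above it through map_read()/map_write()
// on recon_map. tile_worker_hook below keeps the original
// one-worker-per-tile-column scheme, optionally signalling per-row
// loopfilter progress via vp9_set_row() when lpf_mt_opt is enabled.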
@@ -1481,6 +2183,12 @@ static int tile_worker_hook(void *arg1, void *arg2) { TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; + VP9_COMMON *cm = &pbi->common; + + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; volatile int n = tile_data->buf_start; tile_data->error_info.setjmp = 1; @@ -1488,14 +2196,26 @@ static int tile_worker_hook(void *arg1, void *arg2) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } return 0; } tile_data->xd.corrupted = 0; do { - int mi_row, mi_col; + int mi_col; const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. (So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; vp9_zero(tile_data->dqcoeff); vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, tile_data->data_end, buf->size, @@ -1513,6 +2233,14 @@ static int tile_worker_hook(void *arg1, void *arg2) { mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } } if (buf->col == final_col) { @@ -1520,31 +2248,38 @@ static int tile_worker_hook(void *arg1, void *arg2) { } } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + tile_data->data_end = bit_reader_end; return !tile_data->xd.corrupted; } // sorts in descending order static int compare_tile_buffers(const void *a, const void *b) { - const TileBuffer *const buf1 = (const TileBuffer *)a; - const TileBuffer *const buf2 = (const TileBuffer *)b; - return (int)(buf2->size - buf1->size); + const TileBuffer *const buf_a = (const TileBuffer *)a; + const TileBuffer *const buf_b = (const TileBuffer *)b; + return (buf_a->size < buf_b->size) - (buf_a->size > buf_b->size); } -static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, - const uint8_t *data_end) { +static INLINE void init_mt(VP9Decoder *pbi) { + int n; VP9_COMMON *const cm = &pbi->common; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int tile_cols = 1 << cm->log2_tile_cols; - 
const int tile_rows = 1 << cm->log2_tile_rows; - const int num_workers = VPXMIN(pbi->max_threads, tile_cols); - int n; - - assert(tile_cols <= (1 << 6)); - assert(tile_rows == 1); - (void)tile_rows; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; @@ -1562,12 +2297,173 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, } } + // Initialize LPF + if ((pbi->lpf_mt_opt || pbi->row_mt) && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + + // Note: this memset assumes above_context[0], [1] and [2] + // are allocated as part of the same buffer. + memset(cm->above_context, 0, + sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); + + memset(cm->above_seg_context, 0, + sizeof(*cm->above_seg_context) * aligned_mi_cols); + + vp9_reset_lfm(cm); +} + +static const uint8_t *decode_tiles_row_wise_mt(VP9Decoder *pbi, + const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = pbi->max_threads; + int i, n; + int col; + int corrupted = 0; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + memset(row_mt_worker_data->recon_map, 0, + sb_rows * sb_cols * sizeof(*row_mt_worker_data->recon_map)); + + init_mt(pbi); + + // Reset tile decoding hook + for (n = 0; n < num_workers; ++n) { + VPxWorker *const worker = &pbi->tile_workers[n]; + ThreadData *const thread_data = &pbi->row_mt_worker_data->thread_data[n]; + winterface->sync(worker); + + if (cm->lf.filter_level && !cm->skip_loop_filter) { + thread_data->lf_sync = lf_row_sync; + thread_data->lf_data = &thread_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(thread_data->lf_data, new_fb, cm, + pbi->mb.plane); + } + + thread_data->pbi = pbi; + + worker->hook = row_decode_worker_hook; + worker->data1 = thread_data; + worker->data2 = (void *)&row_mt_worker_data->data_end; + } + + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + tile_data->xd = pbi->mb; + tile_data->xd.counts = + cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; + } + + /* Reset the jobq to start of the jobq buffer */ + vp9_jobq_reset(&row_mt_worker_data->jobq); + row_mt_worker_data->num_tiles_done = 0; + row_mt_worker_data->data_end = NULL; + + // Load tile data into tile_buffers + get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, + &pbi->tile_buffers); + + // Initialize thread frame counts. 
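+ // When frame_parallel_decoding_mode is off the entropy model adapts from
+ // per-frame symbol counts: each tile parses into its own counts buffer
+ // (zeroed below) and the per-tile counts are summed into cm->counts after
+ // the workers have synced.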
+ if (!cm->frame_parallel_decoding_mode) { + for (col = 0; col < tile_cols; ++col) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[col]; + vp9_zero(tile_data->counts); + } + } + + // queue parse jobs for 0th row of every tile + for (col = 0; col < tile_cols; ++col) { + Job parse_job; + parse_job.row_num = 0; + parse_job.tile_col = col; + parse_job.job_type = PARSE_JOB; + vp9_jobq_queue(&row_mt_worker_data->jobq, &parse_job, sizeof(parse_job)); + } + + for (i = 0; i < num_workers; ++i) { + VPxWorker *const worker = &pbi->tile_workers[i]; + worker->had_error = 0; + if (i == num_workers - 1) { + winterface->execute(worker); + } else { + winterface->launch(worker); + } + } + + for (; n > 0; --n) { + VPxWorker *const worker = &pbi->tile_workers[n - 1]; + // TODO(jzern): The tile may have specific error data associated with + // its vpx_internal_error_info which could be propagated to the main info + // in cm. Additionally once the threads have been synced and an error is + // detected, there's no point in continuing to decode tiles. + corrupted |= !winterface->sync(worker); + } + + pbi->mb.corrupted = corrupted; + + { + /* Set data end */ + TileWorkerData *const tile_data = &pbi->tile_worker_data[tile_cols - 1]; + row_mt_worker_data->data_end = vpx_reader_find_end(&tile_data->bit_reader); + } + + // Accumulate thread frame counts. + if (!cm->frame_parallel_decoding_mode) { + for (i = 0; i < tile_cols; ++i) { + TileWorkerData *const tile_data = &pbi->tile_worker_data[i]; + vp9_accumulate_frame_counts(&cm->counts, &tile_data->counts, 1); + } + } + + return row_mt_worker_data->data_end; +} + +static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, + const uint8_t *data_end) { + VP9_COMMON *const cm = &pbi->common; + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int num_workers = VPXMIN(pbi->max_threads, tile_cols); + int n; + + assert(tile_cols <= (1 << 6)); + assert(tile_rows == 1); + (void)tile_rows; + + init_mt(pbi); + // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; TileWorkerData *const tile_data = &pbi->tile_worker_data[n + pbi->total_tiles]; winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; @@ -1576,15 +2472,6 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, worker->data2 = pbi; } - // Note: this memset assumes above_context[0], [1] and [2] - // are allocated as part of the same buffer. 
- memset(cm->above_context, 0, - sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols); - memset(cm->above_seg_context, 0, - sizeof(*cm->above_seg_context) * aligned_mi_cols); - - vp9_reset_lfm(cm); - // Load tile data into tile_buffers get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, &pbi->tile_buffers); @@ -1724,6 +2611,22 @@ static void read_bitdepth_colorspace_sampling(VP9_COMMON *cm, } } +static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) { + if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (i == cm->new_fb_idx) continue; + frame_bufs[i].ref_count = 0; + if (!frame_bufs[i].released) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer); + frame_bufs[i].released = 1; + } + } + } +} + static size_t read_uncompressed_header(VP9Decoder *pbi, struct vpx_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; @@ -1788,6 +2691,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_frame_size(cm, rb); if (pbi->need_resync) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + flush_all_fb_on_key(cm); pbi->need_resync = 0; } } else { @@ -1911,6 +2815,35 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_segmentation_dequant(cm); setup_tile_info(cm, rb); + if (pbi->row_mt == 1) { + int num_sbs = 1; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + const int num_jobs = sb_rows << cm->log2_tile_cols; + + if (pbi->row_mt_worker_data == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); +#endif + } + + if (pbi->max_threads > 1) { + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + + num_sbs = sb_cols * sb_rows; + } + + if (num_sbs > pbi->row_mt_worker_data->num_sbs || + num_jobs > pbi->row_mt_worker_data->num_jobs) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs, + pbi->max_threads, num_jobs); + } + vp9_jobq_alloc(pbi); + } sz = vpx_rb_read_literal(rb, 16); if (sz == 0) @@ -1953,7 +2886,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, cm->reference_mode = read_frame_reference_mode(cm, &r); if (cm->reference_mode != SINGLE_REFERENCE) - setup_compound_reference_mode(cm); + vp9_setup_compound_reference_mode(cm); read_frame_reference_mode_probs(cm, &r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) @@ -2021,6 +2954,12 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, const int tile_rows = 1 << cm->log2_tile_rows; const int tile_cols = 1 << cm->log2_tile_cols; YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame); +#endif +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_r(); +#endif xd->cur_buf = new_fb; if (!first_partition_size) { @@ -2069,20 +3008,28 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, pbi->total_tiles = tile_rows * tile_cols; } - if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { - // Multi-threaded tile decoder - *p_data_end = decode_tiles_mt(pbi, data + 
first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); - } + if (pbi->max_threads > 1 && tile_rows == 1 && + (tile_cols > 1 || pbi->row_mt == 1)) { + if (pbi->row_mt == 1) { + *p_data_end = + decode_tiles_row_wise_mt(pbi, data + first_partition_size, data_end); } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); + // Multi-threaded tile decoder + *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. + vp9_loop_filter_frame_mt( + new_fb, cm, pbi->mb.plane, cm->lf.filter_level, 0, 0, + pbi->tile_workers, pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); + } + } } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/libs/libvpx/vp9/decoder/vp9_decodeframe.h b/libs/libvpx/vp9/decoder/vp9_decodeframe.h index 44717f546a5..ba95e72344b 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodeframe.h +++ b/libs/libvpx/vp9/decoder/vp9_decodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODEFRAME_H_ -#define VP9_DECODER_VP9_DECODEFRAME_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEFRAME_H_ +#define VPX_VP9_DECODER_VP9_DECODEFRAME_H_ #ifdef __cplusplus extern "C" { @@ -32,4 +32,4 @@ void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEFRAME_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.c b/libs/libvpx/vp9/decoder/vp9_decodemv.c index 0a781413b17..943fe478a6f 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libs/libvpx/vp9/decoder/vp9_decodemv.c @@ -696,7 +696,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, VP9_COMMON *const cm = &pbi->common; const BLOCK_SIZE bsize = mi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - int_mv best_ref_mvs[2]; + int_mv best_ref_mvs[2] = { { 0 }, { 0 } }; int ref, is_compound; uint8_t inter_mode_ctx; const POSITION *const mv_ref_search = mv_ref_blocks[bsize]; diff --git a/libs/libvpx/vp9/decoder/vp9_decodemv.h b/libs/libvpx/vp9/decoder/vp9_decodemv.h index b460cb8fb17..11b45ace063 100644 --- a/libs/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libs/libvpx/vp9/decoder/vp9_decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DECODEMV_H_ -#define VP9_DECODER_VP9_DECODEMV_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEMV_H_ +#define VPX_VP9_DECODER_VP9_DECODEMV_H_ #include "vpx_dsp/bitreader.h" @@ -26,4 +26,4 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEMV_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEMV_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.c b/libs/libvpx/vp9/decoder/vp9_decoder.c index a913fa560cf..0aed3d717c5 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.c +++ b/libs/libvpx/vp9/decoder/vp9_decoder.c @@ -55,6 +55,94 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) { cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); } +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs) { + int plane; + const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) * + sizeof(*row_mt_worker_data->dqcoeff[0]); + row_mt_worker_data->num_jobs = num_jobs; +#if CONFIG_MULTITHREAD + { + int i; + CHECK_MEM_ERROR( + cm, row_mt_worker_data->recon_sync_mutex, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); + if (row_mt_worker_data->recon_sync_mutex) { + for (i = 0; i < num_jobs; ++i) { + pthread_mutex_init(&row_mt_worker_data->recon_sync_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR( + cm, row_mt_worker_data->recon_sync_cond, + vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); + if (row_mt_worker_data->recon_sync_cond) { + for (i = 0; i < num_jobs; ++i) { + pthread_cond_init(&row_mt_worker_data->recon_sync_cond[i], NULL); + } + } + } +#endif + row_mt_worker_data->num_sbs = num_sbs; + for (plane = 0; plane < 3; ++plane) { + CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(16, dqcoeff_size)); + memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); + CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, + sizeof(*row_mt_worker_data->eob[plane]))); + } + CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + vpx_calloc(num_sbs * PARTITIONS_PER_SB, + sizeof(*row_mt_worker_data->partition))); + CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); + + // allocate memory for thread_data + if (row_mt_worker_data->thread_data == NULL) { + const size_t thread_size = + max_threads * sizeof(*row_mt_worker_data->thread_data); + CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + vpx_memalign(32, thread_size)); + } +} + +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { + if (row_mt_worker_data != NULL) { + int plane; +#if CONFIG_MULTITHREAD + int i; + if (row_mt_worker_data->recon_sync_mutex != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_mutex_destroy(&row_mt_worker_data->recon_sync_mutex[i]); + } + vpx_free(row_mt_worker_data->recon_sync_mutex); + row_mt_worker_data->recon_sync_mutex = NULL; + } + if (row_mt_worker_data->recon_sync_cond != NULL) { + for (i = 0; i < row_mt_worker_data->num_jobs; ++i) { + pthread_cond_destroy(&row_mt_worker_data->recon_sync_cond[i]); + } + vpx_free(row_mt_worker_data->recon_sync_cond); + row_mt_worker_data->recon_sync_cond = NULL; + } +#endif + for (plane = 0; plane < 3; ++plane) { + vpx_free(row_mt_worker_data->eob[plane]); + row_mt_worker_data->eob[plane] = NULL; + vpx_free(row_mt_worker_data->dqcoeff[plane]); + row_mt_worker_data->dqcoeff[plane] = NULL; + } + 
vpx_free(row_mt_worker_data->partition); + row_mt_worker_data->partition = NULL; + vpx_free(row_mt_worker_data->recon_map); + row_mt_worker_data->recon_map = NULL; + vpx_free(row_mt_worker_data->thread_data); + row_mt_worker_data->thread_data = NULL; + } +} + static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) { cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip)); if (!cm->mip) return 1; @@ -69,6 +157,7 @@ static void vp9_dec_free_mi(VP9_COMMON *cm) { cm->mip = NULL; vpx_free(cm->mi_grid_base); cm->mi_grid_base = NULL; + cm->mi_alloc_size = 0; } VP9Decoder *vp9_decoder_create(BufferPool *const pool) { @@ -139,6 +228,18 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + if (pbi->row_mt == 1) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + if (pbi->row_mt_worker_data != NULL) { + vp9_jobq_deinit(&pbi->row_mt_worker_data->jobq); + vpx_free(pbi->row_mt_worker_data->jobq_buf); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&pbi->row_mt_worker_data->recon_done_mutex); +#endif + } + vpx_free(pbi->row_mt_worker_data); + } + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -260,6 +361,44 @@ static void swap_frame_buffers(VP9Decoder *pbi) { cm->frame_refs[ref_index].idx = -1; } +static void release_fb_on_decoder_exit(VP9Decoder *pbi) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. + winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_tile_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + // Release all the reference buffers if worker thread is holding them. + if (pbi->hold_ref_buf == 1) { + int ref_index = 0, mask; + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + ++ref_index; + } + + // Current thread releases the holding of reference frame. + for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + } + pbi->hold_ref_buf = 0; + } +} + int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, const uint8_t **psource) { VP9_COMMON *volatile const cm = &pbi->common; @@ -297,6 +436,9 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, // Find a free frame buffer. Return error if can not find any. 
cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { + pbi->ready_for_new_data = 1; + release_fb_on_decoder_exit(pbi); + vpx_clear_system_state(); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Unable to find free frame buffer"); return cm->error.error_code; @@ -309,44 +451,11 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - int i; - cm->error.setjmp = 0; pbi->ready_for_new_data = 1; - - // Synchronize all threads immediately as a subsequent decode call may - // cause a resize invalidating some allocations. - winterface->sync(&pbi->lf_worker); - for (i = 0; i < pbi->num_tile_workers; ++i) { - winterface->sync(&pbi->tile_workers[i]); - } - - // Release all the reference buffers if worker thread is holding them. - if (pbi->hold_ref_buf == 1) { - int ref_index = 0, mask; - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - // Current thread releases the holding of reference frame. - decrease_ref_count(old_idx, frame_bufs, pool); - - // Release the reference frame in reference map. - if (mask & 1) { - decrease_ref_count(old_idx, frame_bufs, pool); - } - ++ref_index; - } - - // Current thread releases the holding of reference frame. - for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); - } - pbi->hold_ref_buf = 0; - } + release_fb_on_decoder_exit(pbi); // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - vpx_clear_system_state(); return -1; } @@ -364,6 +473,8 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + // Update progress in frame parallel decode. cm->last_width = cm->width; cm->last_height = cm->height; @@ -394,7 +505,7 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, #if CONFIG_VP9_POSTPROC if (!cm->show_existing_frame) { - ret = vp9_post_proc_frame(cm, sd, flags); + ret = vp9_post_proc_frame(cm, sd, flags, cm->width); } else { *sd = *cm->frame_to_show; ret = 0; diff --git a/libs/libvpx/vp9/decoder/vp9_decoder.h b/libs/libvpx/vp9/decoder/vp9_decoder.h index 4b26c314d37..4a22aa6b5ba 100644 --- a/libs/libvpx/vp9/decoder/vp9_decoder.h +++ b/libs/libvpx/vp9/decoder/vp9_decoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DECODER_H_ -#define VP9_DECODER_VP9_DECODER_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODER_H_ +#define VPX_VP9_DECODER_VP9_DECODER_H_ #include "./vpx_config.h" @@ -21,11 +21,24 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" +#include "./vp9_job_queue.h" #ifdef __cplusplus extern "C" { #endif +#define EOBS_PER_SB_LOG2 8 +#define DQCOEFFS_PER_SB_LOG2 12 +#define PARTITIONS_PER_SB 85 + +typedef enum JobType { PARSE_JOB, RECON_JOB, LPF_JOB } JobType; + +typedef struct ThreadData { + struct VP9Decoder *pbi; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; +} ThreadData; + typedef struct TileBuffer { const uint8_t *data; size_t size; @@ -37,12 +50,46 @@ typedef struct TileWorkerData { int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive vpx_reader bit_reader; FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. So planes must be decoded serially */ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); struct vpx_internal_error_info error_info; } TileWorkerData; +typedef void (*process_block_fn_t)(TileWorkerData *twd, + struct VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, + int bhl); + +typedef struct RowMTWorkerData { + int num_sbs; + int *eob[MAX_MB_PLANE]; + PARTITION_TYPE *partition; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int8_t *recon_map; + const uint8_t *data_end; + uint8_t *jobq_buf; + JobQueueRowMt jobq; + size_t jobq_size; + int num_tiles_done; + int num_jobs; +#if CONFIG_MULTITHREAD + pthread_mutex_t recon_done_mutex; + pthread_mutex_t *recon_sync_mutex; + pthread_cond_t *recon_sync_cond; +#endif + ThreadData *thread_data; +} RowMTWorkerData; + +/* Structure to queue and dequeue row decode jobs */ +typedef struct Job { + int row_num; + int tile_col; + JobType job_type; +} Job; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -72,10 +119,14 @@ typedef struct VP9Decoder { int inv_tile_order; int need_resync; // wait for key/intra-only frame. int hold_ref_buf; // hold the reference buffer. 
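+ // row_mt selects the row-based multi-threaded decode path; lpf_mt_opt + // lets the loop filter run on the tile worker threads in the tile-based + // multi-threaded path.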
+ + int row_mt; + int lpf_mt_opt; + RowMTWorkerData *row_mt_worker_data; } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, - const uint8_t **dest); + const uint8_t **psource); int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, vp9_ppflags_t *flags); @@ -109,6 +160,11 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); void vp9_decoder_remove(struct VP9Decoder *pbi); +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs, int max_threads, + int num_jobs); +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data); + static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, BufferPool *const pool) { if (idx >= 0 && frame_bufs[idx].ref_count > 0) { @@ -129,4 +185,4 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODER_H_ +#endif // VPX_VP9_DECODER_VP9_DECODER_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.c b/libs/libvpx/vp9/decoder/vp9_detokenize.c index 4bd016dc7db..e250a5a354d 100644 --- a/libs/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libs/libvpx/vp9/decoder/vp9_detokenize.c @@ -33,6 +33,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, int *count, unsigned int *range) { const unsigned int split = (*range * prob + (256 - prob)) >> CHAR_BIT; const BD_VALUE bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT); +#if CONFIG_BITSTREAM_DEBUG + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } +#endif if (*count < 0) { r->value = *value; @@ -51,6 +65,20 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, *value <<= shift; *count -= shift; } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 1; + if (bit != ref_result) { + fprintf( + stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif return 1; } *range = split; @@ -60,6 +88,19 @@ static INLINE int read_bool(vpx_reader *r, int prob, BD_VALUE *value, *value <<= shift; *count -= shift; } +#if CONFIG_BITSTREAM_DEBUG + { + const int bit = 0; + if (bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + } +#endif return 0; } diff --git a/libs/libvpx/vp9/decoder/vp9_detokenize.h b/libs/libvpx/vp9/decoder/vp9_detokenize.h index 7b0d876016d..a32052fffff 100644 --- a/libs/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libs/libvpx/vp9/decoder/vp9_detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DETOKENIZE_H_ -#define VP9_DECODER_VP9_DETOKENIZE_H_ +#ifndef VPX_VP9_DECODER_VP9_DETOKENIZE_H_ +#define VPX_VP9_DECODER_VP9_DETOKENIZE_H_ #include "vpx_dsp/bitreader.h" #include "vp9/decoder/vp9_decoder.h" @@ -27,4 +27,4 @@ int vp9_decode_block_tokens(TileWorkerData *twd, int plane, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DETOKENIZE_H_ +#endif // VPX_VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_dsubexp.h b/libs/libvpx/vp9/decoder/vp9_dsubexp.h index 5a8ec8300c1..b0c7750736b 100644 --- a/libs/libvpx/vp9/decoder/vp9_dsubexp.h +++ b/libs/libvpx/vp9/decoder/vp9_dsubexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DSUBEXP_H_ -#define VP9_DECODER_VP9_DSUBEXP_H_ +#ifndef VPX_VP9_DECODER_VP9_DSUBEXP_H_ +#define VPX_VP9_DECODER_VP9_DSUBEXP_H_ #include "vpx_dsp/bitreader.h" @@ -23,4 +23,4 @@ void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p); } // extern "C" #endif -#endif // VP9_DECODER_VP9_DSUBEXP_H_ +#endif // VPX_VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/libs/libvpx/vp9/decoder/vp9_job_queue.c b/libs/libvpx/vp9/decoder/vp9_job_queue.c new file mode 100644 index 00000000000..9a31f5a6d09 --- /dev/null +++ b/libs/libvpx/vp9/decoder/vp9_job_queue.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <string.h> + +#include "vpx/vpx_integer.h" + +#include "vp9/decoder/vp9_job_queue.h" + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size) { +#if CONFIG_MULTITHREAD + pthread_mutex_init(&jobq->mutex, NULL); + pthread_cond_init(&jobq->cond, NULL); +#endif + jobq->buf_base = buf; + jobq->buf_wr = buf; + jobq->buf_rd = buf; + jobq->buf_end = buf + buf_size; + jobq->terminate = 0; +} + +void vp9_jobq_reset(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->buf_wr = jobq->buf_base; + jobq->buf_rd = jobq->buf_base; + jobq->terminate = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +void vp9_jobq_deinit(JobQueueRowMt *jobq) { + vp9_jobq_reset(jobq); +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&jobq->mutex); + pthread_cond_destroy(&jobq->cond); +#endif +} + +void vp9_jobq_terminate(JobQueueRowMt *jobq) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + jobq->terminate = 1; +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(&jobq->cond); + pthread_mutex_unlock(&jobq->mutex); +#endif +} + +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif + if (jobq->buf_end >= jobq->buf_wr + job_size) { + memcpy(jobq->buf_wr, job, job_size); + jobq->buf_wr = jobq->buf_wr + job_size; +#if CONFIG_MULTITHREAD + pthread_cond_signal(&jobq->cond); +#endif + ret = 0; + } else { + /* Wrap around case is not supported */ + assert(0); + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + return ret; +} + +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking) { + int ret = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&jobq->mutex); +#endif +
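+ /* The capacity check below only guards against reading past the end of + * the linear job buffer; whether a job is actually available is decided + * by the buf_wr/buf_rd comparison inside the loop. */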
if (jobq->buf_end >= jobq->buf_rd + job_size) { + while (1) { + if (jobq->buf_wr >= jobq->buf_rd + job_size) { + memcpy(job, jobq->buf_rd, job_size); + jobq->buf_rd = jobq->buf_rd + job_size; + ret = 0; + break; + } else { + /* If all the entries have been dequeued, then break and return */ + if (jobq->terminate == 1) { + ret = 1; + break; + } + if (blocking == 1) { +#if CONFIG_MULTITHREAD + pthread_cond_wait(&jobq->cond, &jobq->mutex); +#endif + } else { + /* If there is no job available + * and this is a non-blocking call, then return failure */ + ret = 1; + break; + } + } + } + } else { + /* Wrap around case is not supported */ + ret = 1; + } +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&jobq->mutex); +#endif + + return ret; +} diff --git a/libs/libvpx/vp9/decoder/vp9_job_queue.h b/libs/libvpx/vp9/decoder/vp9_job_queue.h new file mode 100644 index 00000000000..bc23bf9c2c1 --- /dev/null +++ b/libs/libvpx/vp9/decoder/vp9_job_queue.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ + +#include "vpx_util/vpx_thread.h" + +typedef struct { + // Pointer to buffer base which contains the jobs + uint8_t *buf_base; + + // Pointer to current address where new job can be added + uint8_t *volatile buf_wr; + + // Pointer to current address from where next job can be obtained + uint8_t *volatile buf_rd; + + // Pointer to end of job buffer + uint8_t *buf_end; + + int terminate; + +#if CONFIG_MULTITHREAD + pthread_mutex_t mutex; + pthread_cond_t cond; +#endif +} JobQueueRowMt; + +void vp9_jobq_init(JobQueueRowMt *jobq, uint8_t *buf, size_t buf_size); +void vp9_jobq_reset(JobQueueRowMt *jobq); +void vp9_jobq_deinit(JobQueueRowMt *jobq); +void vp9_jobq_terminate(JobQueueRowMt *jobq); +int vp9_jobq_queue(JobQueueRowMt *jobq, void *job, size_t job_size); +int vp9_jobq_dequeue(JobQueueRowMt *jobq, void *job, size_t job_size, + int blocking); + +#endif // VPX_VP9_DECODER_VP9_JOB_QUEUE_H_ diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c deleted file mode 100644 index 513718e7cb1..00000000000 --- a/libs/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree.
- */ - -#include <arm_neon.h> - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" - -#include "vp9/common/vp9_blockd.h" -#include "vpx_dsp/txfm_common.h" -#include "vpx_dsp/vpx_dsp_common.h" - -void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, - tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - tran_low_t temp_buffer[64]; - (void)coeff_ptr; - - vpx_fdct8x8_neon(input, temp_buffer, stride); - vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr, - qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr, - iscan_ptr); -} diff --git a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 97a09bdff6f..8b62b450cef 100644 --- a/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libs/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_s16(v_eobmax_76543210); +#else { const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); @@ -111,6 +114,7 @@ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); } +#endif // __aarch64__ } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -122,7 +126,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); @@ -134,8 +138,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); // Process dc and the first seven ac coeffs. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); const int16x8_t coeff_abs = vabsq_s16(coeff); @@ -169,12 +173,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); store_s16q_to_tran_low(qcoeff_ptr, qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); - iscan_ptr += 8; + iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; @@ -188,8 +192,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, // Process the rest of the ac coeffs.
for (i = 8; i < 32 * 32; i += 8) { - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); const int16x8_t coeff_abs = vabsq_s16(coeff); @@ -215,17 +219,20 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); store_s16q_to_tran_low(qcoeff_ptr, qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); - iscan_ptr += 8; + iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else { const uint16x4_t eob_max_0 = vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); @@ -233,5 +240,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } +#endif // __aarch64__ } } diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c index 188d04d8f65..61786d8f662 100644 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c +++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_error_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -79,6 +80,7 @@ return err; \ } +#if !CONFIG_VP9_HIGHBITDEPTH BLOCK_ERROR_BLOCKSIZE_MSA(16); BLOCK_ERROR_BLOCKSIZE_MSA(64); BLOCK_ERROR_BLOCKSIZE_MSA(256); @@ -103,3 +105,4 @@ int64_t vp9_block_error_msa(const tran_low_t *coeff_ptr, return err; } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c index 0831e591488..efbbe830dbd 100644 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c +++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c index fa36f09ab85..9c5cc12ef04 100644 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c +++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c index 604db853c41..26d81aa9ef8 100644 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c +++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c @@ -10,6 +10,7 @@ #include <assert.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_enums.h" #include "vp9/encoder/mips/msa/vp9_fdct_msa.h" diff --git a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h index 794bec70b64..fa1af2fc571 100644 --- a/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h +++ b/libs/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ -#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ #include "vpx_dsp/mips/fwd_txfm_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h" @@ -113,4 +113,4 @@ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ out0, out1, out2, out3); \ } -#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */ +#endif // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ diff --git a/libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c new file mode 100644 index 00000000000..4f88b8fff6f --- /dev/null +++ b/libs/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" + +#include "./vp9_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +// (a * b) >> 16 +// Note: Because this is done in 2 operations, a and b cannot both be INT16_MIN +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra right + // shift. + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); +} + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a.
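+// For example, { 1, 9, 3, 4, 5, 6, 7, 8 } reduces to { 9, 9, 9, 9, 9, 9, 9, 9 } +// after the three permute/max steps below.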
+static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + + (void)scan; + (void)skip_block; + assert(!skip_block); + + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from round and quant + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + + // Remove DC value from dequant + dequant = vec_splat(dequant, 1); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per + // loop iteration. 
+ // for 8x8: 16 + 2 x 24 = 64 + // for 16x16: 16 + 10 x 24 = 256 + if (n_coeffs > 16) { + int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2; + + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + + do { + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + + qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + qcoeff2 = vec_sign(qcoeff2, coeff2); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} + +// Sets the value of a 32-bit integer to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + +// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division.
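+ // For example, an arithmetic shift floors: -7 >> 1 is -4, while the C + // reference computes -7 / 2 = -3. Adding the extracted sign bit first + // gives (-7 + 1) >> 1 = -3, matching the scalar code.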
+ dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2)); + int16x8_t abs_coeff0 = vec_abs(coeff0); + int16x8_t abs_coeff1 = vec_abs(coeff1); + + (void)scan; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + mask0 = vec_cmpge(abs_coeff0, thres); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff0 = vec_and(qcoeff0, mask0); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from thres, round, quant and dequant + thres = vec_splat(thres, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + + mask1 = vec_cmpge(abs_coeff1, thres); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = + vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16); + qcoeff1 = vec_and(qcoeff1, mask1); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + do { + int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2, mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + abs_coeff0 = vec_abs(coeff0); + abs_coeff1 = vec_abs(coeff1); + abs_coeff2 = vec_abs(coeff2); + + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16); + qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16); + + mask0 = vec_cmpge(abs_coeff0, thres); + mask1 = vec_cmpge(abs_coeff1, thres); + mask2 = vec_cmpge(abs_coeff2, thres); + + qcoeff0 = vec_and(qcoeff0, mask0); + qcoeff1 = vec_and(qcoeff1, 
mask1); + qcoeff2 = vec_and(qcoeff2, mask2); + + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + + qcoeff0 = vec_sign(qcoeff0, coeff0); + qcoeff1 = vec_sign(qcoeff1, coeff1); + qcoeff2 = vec_sign(qcoeff2, coeff2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} diff --git a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h index e508cb44ac2..22a657e035c 100644 --- a/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/libs/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -15,8 +15,8 @@ * for altref frames. Go to alt_ref_aq_private.h for implementation details. */ -#ifndef VP9_ENCODER_VP9_ALT_REF_AQ_H_ -#define VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ #include "vpx/vpx_integer.h" @@ -124,4 +124,4 @@ void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#endif // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_aq_360.h b/libs/libvpx/vp9/encoder/vp9_aq_360.h index b1b56561d82..749d3c198ab 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_360.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_360.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_360_H_ -#define VP9_ENCODER_VP9_AQ_360_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_ +#define VPX_VP9_ENCODER_VP9_AQ_360_H_ #include "vp9/encoder/vp9_encoder.h" @@ -24,4 +24,4 @@ void vp9_360aq_frame_setup(VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_360_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_aq_complexity.h b/libs/libvpx/vp9/encoder/vp9_aq_complexity.h index a00d34e7025..d3cb34c0132 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_complexity.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_complexity.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ -#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 2f2f0055a7c..adb12c10c64 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -21,6 +21,14 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { size_t last_coded_q_map_size; CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); @@ -39,13 +47,16 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { } assert(MAXQ <= 255); memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->counter_encode_maxq_scene_change = 0; return cr; } void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { - vpx_free(cr->map); - vpx_free(cr->last_coded_q_map); - vpx_free(cr); + if (cr != NULL) { + vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); + vpx_free(cr); + } } // Check if this coding block, of size bsize, should be considered for refresh @@ -318,6 +329,28 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { rc->baseline_gf_interval = 10; } +static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index, + int sb_col_index) { + unsigned int source_variance; + const uint8_t *src_y = cpi->Source->y_buffer; + const int ystride = cpi->Source->y_stride; + unsigned int sse; + const BLOCK_SIZE bsize = BLOCK_64X64; + src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + source_variance = + cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse); + if (source_variance == 0) { + uint64_t block_sad; + const uint8_t *last_src_y = cpi->Last_Source->y_buffer; + const int last_ystride = cpi->Last_Source->y_stride; + last_src_y += (sb_row_index << 6) * last_ystride + (sb_col_index << 6); + block_sad = + cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride); + if (block_sad == 0) return 1; + } + return 0; +} + // Update the segmentation map, and related quantities: cyclic refresh map, // refresh sb_index, and target number of blocks to be refreshed.
// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to @@ -368,8 +401,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { int sb_col_index = i - sb_row_index * sb_cols; int mi_row = sb_row_index * MI_BLOCK_SIZE; int mi_col = sb_col_index * MI_BLOCK_SIZE; + int flat_static_blocks = 0; + int compute_content = 1; assert(mi_row >= 0 && mi_row < cm->mi_rows); assert(mi_col >= 0 && mi_col < cm->mi_cols); +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) compute_content = 0; +#endif + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + compute_content = 0; bl_index = mi_row * cm->mi_cols + mi_col; // Loop through all 8x8 blocks in superblock and update map. xmis = @@ -400,11 +442,21 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { // Enforce constant segment over superblock. // If segment is at least half of superblock, set to 1. if (sum_map >= xmis * ymis / 2) { - for (y = 0; y < ymis; y++) - for (x = 0; x < xmis; x++) { - seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; - } - cr->target_num_seg_blocks += xmis * ymis; + // This superblock is a candidate for refresh: + // compute spatial variance and exclude blocks that are spatially flat + // and stationary. Note: this is currently only done for screen content + // mode. + if (compute_content && cr->skip_flat_static_blocks) + flat_static_blocks = + is_superblock_flat_static(cpi, sb_row_index, sb_col_index); + if (!flat_static_blocks) { + // Label this superblock as segment 1. + for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } } i++; if (i == sbs_in_frame) { @@ -413,7 +465,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); cr->sb_index = i; cr->reduce_refresh = 0; - if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; } // Set cyclic refresh parameters. @@ -425,11 +478,20 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { int target_refresh = 0; double weight_segment_target = 0; double weight_segment = 0; - int thresh_low_motion = (cm->width < 720) ? 55 : 20; + int thresh_low_motion = 20; + int qp_thresh = VPXMIN((cpi->oxcf.content == VP9E_CONTENT_SCREEN) ? 35 : 20, + rc->best_quality << 1); + int qp_max_thresh = 117 * MAXQ >> 7; cr->apply_cyclic_refresh = 1; - if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 || + is_lossless_requested(&cpi->oxcf) || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (cpi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && - rc->frames_since_key > 40)) { + rc->frames_since_key > 40) || + (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh && + rc->frames_since_key > 20)) { cr->apply_cyclic_refresh = 0; return; } @@ -454,20 +516,32 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_boost_fac = 13; } } + // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and + // percent_refresh (refresh rate) to 10.
But reduce rate boost for segment#2 + // (rate_boost_fac = 10 disables segment#2). + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + // Only enable feature of skipping flat_static blocks for top layer + // under screen content mode. + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cr->skip_flat_static_blocks = 1; + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 5 : 10; + // Increase the amount of refresh on scene change that is encoded at max Q, + // increase for a few cycles of the refresh period (~100 / percent_refresh). + if (cr->counter_encode_maxq_scene_change < 30) + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15; + cr->rate_ratio_qdelta = 2.0; + cr->rate_boost_fac = 10; + } // Adjust some parameters for low resolutions. - if (cm->width <= 352 && cm->height <= 288) { + if (cm->width * cm->height <= 352 * 288) { if (rc->avg_frame_bandwidth < 3000) { - cr->motion_thresh = 16; + cr->motion_thresh = 64; cr->rate_boost_fac = 13; } else { cr->max_qdelta_perc = 70; cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); } } - if (cpi->svc.spatial_layer_id > 0) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 12; - } if (cpi->oxcf.rc_mode == VPX_VBR) { // To be adjusted for VBR mode, e.g., based on gf period and boost. // For now use smaller qp-delta (than CBR), no second boosted seg, and @@ -492,6 +566,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { num8x8bl; if (weight_segment_target < 7 * weight_segment / 8) weight_segment = weight_segment_target; + // For screen-content: don't include target for the weight segment, + // since for all flat areas the segment is reset, so it's more accurate + // to just use the previous actual number of seg blocks for the weight. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + weight_segment = + (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) / + num8x8bl; cr->weight_segment = weight_segment; } @@ -501,23 +582,31 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; - if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) { + // Reset if resolution change has occurred. + if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) || + scene_change_detected) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); vp9_disable_segmentation(&cm->seg); - if (cm->frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME || scene_change_detected) { memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; cr->reduce_refresh = 0; + cr->counter_encode_maxq_scene_change = 0; } return; } else { int qindex_delta = 0; int qindex2; const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); + cr->counter_encode_maxq_scene_change++; vpx_clear_system_state(); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256).
@@ -567,9 +656,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { cr->qindex_delta[2] = qindex_delta; vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); - // Reset if resolution change has occurred. - if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); - // Update the segmentation and refresh map. cyclic_refresh_update_map(cpi); } @@ -583,8 +669,19 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; memset(cr->map, 0, cm->mi_rows * cm->mi_cols); - memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; + cr->counter_encode_maxq_scene_change = 0; +} + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } } diff --git a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index 77fa67c9e16..b6d7fdeae77 100644 --- a/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/libs/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ -#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -68,6 +68,8 @@ struct CYCLIC_REFRESH { int reduce_refresh; double weight_segment; int apply_cyclic_refresh; + int counter_encode_maxq_scene_change; + int skip_flat_static_blocks; }; struct VP9_COMP; @@ -102,10 +104,6 @@ void vp9_cyclic_refresh_update_sb_postencode(struct VP9_COMP *const cpi, int mi_row, int mi_col, BLOCK_SIZE bsize); -// Update the segmentation map, and related quantities: cyclic refresh map, -// refresh sb_index, and target number of blocks to be refreshed. -void vp9_cyclic_refresh_update__map(struct VP9_COMP *const cpi); - // From the just encoded frame: update the actual number of blocks that were // applied the segment delta q, and the amount of low motion in the frame.
 // Also check conditions for forcing golden update, or preventing golden
@@ -139,8 +137,10 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) {
     return CR_SEGMENT_ID_BASE;
 }
 
+void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_variance.c b/libs/libvpx/vp9/encoder/vp9_aq_variance.c
index 477f62ba5ad..1f9ce2354ce 100644
--- a/libs/libvpx/vp9/encoder/vp9_aq_variance.c
+++ b/libs/libvpx/vp9/encoder/vp9_aq_variance.c
@@ -19,6 +19,7 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 
+#include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_segmentation.h"
 
 #define ENERGY_MIN (-4)
@@ -108,7 +109,7 @@ static void aq_variance(const uint8_t *a, int a_stride, const uint8_t *b,
 #if CONFIG_VP9_HIGHBITDEPTH
 static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
                                  const uint8_t *b8, int b_stride, int w, int h,
-                                 uint64_t *sse, uint64_t *sum) {
+                                 uint64_t *sse, int64_t *sum) {
   int i, j;
 
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
@@ -127,15 +128,6 @@ static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
   }
 }
 
-static void aq_highbd_8_variance(const uint8_t *a8, int a_stride,
-                                 const uint8_t *b8, int b_stride, int w, int h,
-                                 unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  aq_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
@@ -153,11 +145,13 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
     int avg;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+      uint64_t sse64 = 0;
+      int64_t sum64 = 0;
+      aq_highbd_variance64(x->plane[0].src.buf, x->plane[0].src.stride,
                            CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
-                           &sse, &avg);
-      sse >>= 2 * (xd->bd - 8);
-      avg >>= (xd->bd - 8);
+                           &sse64, &sum64);
+      sse = (unsigned int)(sse64 >> (2 * (xd->bd - 8)));
+      avg = (int)(sum64 >> (xd->bd - 8));
     } else {
       aq_variance(x->plane[0].src.buf, x->plane[0].src.stride, vp9_64_zeros, 0,
                   bw, bh, &sse, &avg);
@@ -192,6 +186,40 @@ double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   return log(var + 1.0);
 }
 
+// Get the range of sub block energy values;
+void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int *min_e,
+                              int *max_e) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+  const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+  int x, y;
+
+  if (xmis < bw || ymis < bh) {
+    vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+    *min_e = vp9_block_energy(cpi, mb, bsize);
+    *max_e = *min_e;
+  } else {
+    int energy;
+    *min_e = ENERGY_MAX;
+    *max_e = ENERGY_MIN;
+
+    for (y = 0; y < ymis; ++y) {
+      for (x = 0; x < xmis; ++x) {
+        vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x);
+        energy = vp9_block_energy(cpi, mb, BLOCK_8X8);
+        *min_e = VPXMIN(*min_e, energy);
+        *max_e = VPXMAX(*max_e, energy);
+      }
+    }
+  }
+
+  // Re-instate source pointers back to what they should have been on entry.
+  vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col);
+}
+
 #define DEFAULT_E_MIDPOINT 10.0
 
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
   double energy;
diff --git a/libs/libvpx/vp9/encoder/vp9_aq_variance.h b/libs/libvpx/vp9/encoder/vp9_aq_variance.h
index 211a69f392c..a4f872879d7 100644
--- a/libs/libvpx/vp9/encoder/vp9_aq_variance.h
+++ b/libs/libvpx/vp9/encoder/vp9_aq_variance.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_
-#define VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
 
 #include "vp9/encoder/vp9_encoder.h"
 
@@ -20,11 +20,15 @@ extern "C" {
 unsigned int vp9_vaq_segment_id(int energy);
 void vp9_vaq_frame_setup(VP9_COMP *cpi);
 
+void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int *min_e,
+                              int *max_e);
 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+
 double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_AQ_VARIANCE_H_
+#endif  // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.c b/libs/libvpx/vp9/encoder/vp9_bitstream.c
index d346cd57aa0..3eff4ce830d 100644
--- a/libs/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libs/libvpx/vp9/encoder/vp9_bitstream.c
@@ -18,6 +18,9 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_BITSTREAM_DEBUG
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
@@ -39,8 +42,10 @@ static const struct vp9_token intra_mode_encodings[INTRA_MODES] = {
   { 0, 1 },  { 6, 3 },   { 28, 5 },  { 30, 5 }, { 58, 6 },
   { 59, 6 }, { 126, 7 }, { 127, 7 }, { 62, 6 }, { 2, 2 }
 };
-static const struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
-    { { 0, 1 }, { 2, 2 }, { 3, 2 } };
+static const struct vp9_token
+    switchable_interp_encodings[SWITCHABLE_FILTERS] = { { 0, 1 },
+                                                        { 2, 2 },
+                                                        { 3, 2 } };
 static const struct vp9_token partition_encodings[PARTITION_TYPES] = {
   { 0, 1 }, { 2, 2 }, { 6, 3 }, { 7, 3 }
 };
@@ -86,7 +91,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm,
   BLOCK_SIZE bsize = xd->mi[0]->sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   const vpx_prob *const tx_probs =
-      get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
+      get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs);
   vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
   if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
     vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
@@ -217,7 +222,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd,
   }
 
   if (is_compound) {
-    vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME,
+    const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+    vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1],
               vp9_get_pred_prob_comp_ref_p(cm, xd));
   } else {
     const int bit0 = mi->ref_frame[0] != LAST_FRAME;
@@ -459,7 +465,8 @@ static void write_modes_sb(
       write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
                     max_mv_magnitude, interp_filter_selected);
       break;
-    case PARTITION_SPLIT:
+    default:
+      assert(partition == PARTITION_SPLIT);
       write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize,
                      max_mv_magnitude, interp_filter_selected);
       write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
@@ -469,7 +476,6 @@
       write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
                      subsize, max_mv_magnitude, interp_filter_selected);
       break;
-    default: assert(0);
   }
 }
@@ -618,9 +624,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
       return;
     }
 
-    case ONE_LOOP_REDUCED: {
+    default: {
       int updates = 0;
       int noupdates_before_first = 0;
+      assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -670,7 +677,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
       }
       return;
     }
-    default: assert(0);
   }
 }
@@ -909,10 +915,24 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) {
            (cpi->refresh_golden_frame << cpi->alt_fb_idx);
   } else {
     int arf_idx = cpi->alt_fb_idx;
-    if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
-      const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
-      arf_idx = gf_group->arf_update_idx[gf_group->index];
+    GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+    if (cpi->multi_layer_arf) {
+      for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) {
+        if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx &&
+            arf_idx != cpi->gld_fb_idx) {
+          int idx;
+          for (idx = 0; idx < gf_group->stack_size; ++idx)
+            if (arf_idx == gf_group->arf_index_stack[idx]) break;
+          if (idx == gf_group->stack_size) break;
+        }
+      }
     }
+    cpi->twopass.gf_group.top_arf_idx = arf_idx;
+
+    if (cpi->use_svc && cpi->svc.use_set_ref_frame_config &&
+        cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS)
+      return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id];
     return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
            (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
            (cpi->refresh_alt_ref_frame << arf_idx);
@@ -1117,11 +1137,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi,
         ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
          (cpi->svc.number_spatial_layers > 1 &&
-          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) ||
-         (is_two_pass_svc(cpi) &&
-          cpi->svc.encode_empty_frame_state == ENCODING &&
-          cpi->svc.layer_context[0].frames_from_key_frame <
-              cpi->svc.number_temporal_layers + 1))) {
+          cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) {
       found = 0;
     } else if (cfg != NULL) {
       found =
@@ -1153,8 +1169,10 @@ static void write_profile(BITSTREAM_PROFILE profile,
     case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break;
     case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break;
     case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break;
-    case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break;
-    default: assert(0);
+    default:
+      assert(profile == PROFILE_3);
+      vpx_wb_write_literal(wb, 6, 3);
+      break;
   }
 }
@@ -1191,7 +1209,13 @@ static void write_uncompressed_header(VP9_COMP *cpi,
 
   write_profile(cm->profile, wb);
 
-  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+  // If to use show existing frame.
+  vpx_wb_write_bit(wb, cm->show_existing_frame);
+  if (cm->show_existing_frame) {
+    vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3);
+    return;
+  }
+
   vpx_wb_write_bit(wb, cm->frame_type);
   vpx_wb_write_bit(wb, cm->show_frame);
   vpx_wb_write_bit(wb, cm->error_resilient_mode);
@@ -1201,14 +1225,6 @@ static void write_uncompressed_header(VP9_COMP *cpi,
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
   } else {
-    // In spatial svc if it's not error_resilient_mode then we need to code all
-    // visible frames as invisible. But we need to keep the show_frame flag so
-    // that the publisher could know whether it is supposed to be visible.
-    // So we will code the show_frame flag as it is. Then code the intra_only
-    // bit here. This will make the bitstream incompatible. In the player we
-    // will change to show_frame flag to 0, then add an one byte frame with
-    // show_existing_frame flag which tells the decoder which frame we want to
-    // show.
     if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode)
@@ -1340,7 +1356,20 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
   struct vpx_write_bit_buffer wb = { data, 0 };
   struct vpx_write_bit_buffer saved_wb;
 
+#if CONFIG_BITSTREAM_DEBUG
+  bitstream_queue_reset_write();
+#endif
+
   write_uncompressed_header(cpi, &wb);
+
+  // Skip the rest coding process if use show existing frame.
+  if (cpi->common.show_existing_frame) {
+    uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
+    data += uncompressed_hdr_size;
+    *size = data - dest;
+    return;
+  }
+
   saved_wb = wb;
   vpx_wb_write_literal(&wb, 0, 16);  // don't know in advance first part. size
diff --git a/libs/libvpx/vp9/encoder/vp9_bitstream.h b/libs/libvpx/vp9/encoder/vp9_bitstream.h
index 339c3fecb13..208651dc227 100644
--- a/libs/libvpx/vp9/encoder/vp9_bitstream.h
+++ b/libs/libvpx/vp9/encoder/vp9_bitstream.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_BITSTREAM_H_
-#define VP9_ENCODER_VP9_BITSTREAM_H_
+#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_
+#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -38,16 +38,12 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi);
 void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
 
 static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) {
-  return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
-         cpi->rc.is_src_frame_alt_ref &&
-         (!cpi->use_svc ||  // Add spatial svc base layer case here
-          (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id == 0 &&
-           cpi->svc.layer_context[0].gold_ref_idx >= 0 &&
-           cpi->oxcf.ss_enable_auto_arf[0]));
+  return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref &&
+         !cpi->use_svc;
 }
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_BITSTREAM_H_
+#endif  // VPX_VP9_ENCODER_VP9_BITSTREAM_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_block.h b/libs/libvpx/vp9/encoder/vp9_block.h
index 724205dd578..37a4605ad8c 100644
--- a/libs/libvpx/vp9/encoder/vp9_block.h
+++ b/libs/libvpx/vp9/encoder/vp9_block.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_BLOCK_H_
-#define VP9_ENCODER_VP9_BLOCK_H_
+#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_
+#define VPX_VP9_ENCODER_VP9_BLOCK_H_
 
 #include "vpx_util/vpx_thread.h"
 
@@ -34,8 +34,8 @@ struct macroblock_plane {
   struct buf_2d src;
 
   // Quantizer setings
+  DECLARE_ALIGNED(16, int16_t, round_fp[8]);
   int16_t *quant_fp;
-  int16_t *round_fp;
   int16_t *quant;
   int16_t *quant_shift;
   int16_t *zbin;
@@ -92,6 +92,8 @@ struct macroblock {
   int sadperbit4;
   int rddiv;
   int rdmult;
+  int cb_rdmult;
+  int segment_id;
   int mb_energy;
 
   // These are set to their default values at the beginning, and then adjusted
@@ -115,6 +117,12 @@ struct macroblock {
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
+  // sharpness is used to disable skip mode and change rd_mult
+  int sharpness;
+
+  // aq mode is used to adjust rd based on segment.
+  int adjust_rdmult_by_segment;
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   MvLimits mv_limits;
@@ -180,6 +188,8 @@ struct macroblock {
 
   int sb_pickmode_part;
 
+  int zero_temp_sad_source;
+
   // For each superblock: saves the content value (e.g., low/high sad/sumdiff)
   // based on source sad, prior to encoding the frame.
   uint8_t content_state_sb;
@@ -199,10 +209,13 @@ struct macroblock {
   void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest,
                               int stride, int eob, int bd);
 #endif
+  DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]);
+
+  struct scale_factors *me_sf;
 };
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_BLOCK_H_
+#endif  // VPX_VP9_ENCODER_VP9_BLOCK_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_blockiness.c b/libs/libvpx/vp9/encoder/vp9_blockiness.c
index 9ab57b57c76..da68a3c3c3d 100644
--- a/libs/libvpx/vp9/encoder/vp9_blockiness.c
+++ b/libs/libvpx/vp9/encoder/vp9_blockiness.c
@@ -11,6 +11,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/system_state.h"
+#include "vp9/encoder/vp9_blockiness.h"
 
 static int horizontal_filter(const uint8_t *s) {
   return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
diff --git a/libs/libvpx/vp9/encoder/vp9_blockiness.h b/libs/libvpx/vp9/encoder/vp9_blockiness.h
new file mode 100644
index 00000000000..e840cb25180
--- /dev/null
+++ b/libs/libvpx/vp9/encoder/vp9_blockiness.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2019 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+double vp9_get_blockiness(const uint8_t *img1, int img1_pitch,
+                          const uint8_t *img2, int img2_pitch, int width,
+                          int height);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.c b/libs/libvpx/vp9/encoder/vp9_context_tree.c
index 2f7e5443325..b74b9027cab 100644
--- a/libs/libvpx/vp9/encoder/vp9_context_tree.c
+++ b/libs/libvpx/vp9/encoder/vp9_context_tree.c
@@ -12,7 +12,10 @@
 #include "vp9/encoder/vp9_encoder.h"
 
 static const BLOCK_SIZE square[] = {
-  BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+  BLOCK_8X8,
+  BLOCK_16X16,
+  BLOCK_32X32,
+  BLOCK_64X64,
 };
 
 static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
@@ -136,17 +139,22 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) {
 }
 
 void vp9_free_pc_tree(ThreadData *td) {
-  const int tree_nodes = 64 + 16 + 4 + 1;
   int i;
 
-  // Set up all 4x4 mode contexts
-  for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+  if (td == NULL) return;
 
-  // Sets up all the leaf nodes in the tree.
-  for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+  if (td->leaf_tree != NULL) {
+    // Set up all 4x4 mode contexts
+    for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]);
+    vpx_free(td->leaf_tree);
+    td->leaf_tree = NULL;
+  }
 
-  vpx_free(td->pc_tree);
-  td->pc_tree = NULL;
-  vpx_free(td->leaf_tree);
-  td->leaf_tree = NULL;
+  if (td->pc_tree != NULL) {
+    const int tree_nodes = 64 + 16 + 4 + 1;
+    // Sets up all the leaf nodes in the tree.
+    for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]);
+    vpx_free(td->pc_tree);
+    td->pc_tree = NULL;
+  }
 }
diff --git a/libs/libvpx/vp9/encoder/vp9_context_tree.h b/libs/libvpx/vp9/encoder/vp9_context_tree.h
index 73423c07587..4e301cc17df 100644
--- a/libs/libvpx/vp9/encoder/vp9_context_tree.h
+++ b/libs/libvpx/vp9/encoder/vp9_context_tree.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_
-#define VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
+#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
 
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/encoder/vp9_block.h"
@@ -56,6 +56,7 @@ typedef struct {
   // scope of refactoring.
   int rate;
   int64_t dist;
+  int64_t rdcost;
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
   unsigned int newmv_sse;
@@ -75,6 +76,8 @@ typedef struct {
 
   // Used for the machine learning-based early termination
   int32_t sum_y_eobs;
+  // Skip certain ref frames during RD search of rectangular partitions.
+  uint8_t skip_ref_frame_mask;
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
@@ -88,6 +91,9 @@ typedef struct PC_TREE {
     struct PC_TREE *split[4];
     PICK_MODE_CONTEXT *leaf_split[4];
   };
+  // Obtained from a simple motion search. Used by the ML based partition search
+  // speed feature.
+  MV mv;
 } PC_TREE;
 
 void vp9_setup_pc_tree(struct VP9Common *cm, struct ThreadData *td);
@@ -97,4 +103,4 @@ void vp9_free_pc_tree(struct ThreadData *td);
 }  // extern "C"
 #endif
 
-#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */
+#endif  // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_cost.h b/libs/libvpx/vp9/encoder/vp9_cost.h
index 70a1a2e0e93..638d72a9161 100644
--- a/libs/libvpx/vp9/encoder/vp9_cost.h
+++ b/libs/libvpx/vp9/encoder/vp9_cost.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_COST_H_
-#define VP9_ENCODER_VP9_COST_H_
+#ifndef VPX_VP9_ENCODER_VP9_COST_H_
+#define VPX_VP9_ENCODER_VP9_COST_H_
 
 #include "vpx_dsp/prob.h"
 #include "vpx/vpx_integer.h"
@@ -55,4 +55,4 @@ void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_VP9_COST_H_
+#endif  // VPX_VP9_ENCODER_VP9_COST_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_dct.c b/libs/libvpx/vp9/encoder/vp9_dct.c
index 5c66562a569..2f42c6afc22 100644
--- a/libs/libvpx/vp9/encoder/vp9_dct.c
+++ b/libs/libvpx/vp9/encoder/vp9_dct.c
@@ -554,109 +554,6 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
   }
 }
 
-void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
-                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *round_ptr,
-                         const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan) {
-  int eob = -1;
-
-  int i, j;
-  tran_low_t intermediate[64];
-
-  (void)iscan;
-
-  // Transform columns
-  {
-    tran_low_t *output = intermediate;
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    int i;
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      s0 = (input[0 * stride] + input[7 * stride]) * 4;
-      s1 = (input[1 * stride] + input[6 * stride]) * 4;
-      s2 = (input[2 * stride] + input[5 * stride]) * 4;
-      s3 = (input[3 * stride] + input[4 * stride]) * 4;
-      s4 = (input[3 * stride] - input[4 * stride]) * 4;
-      s5 = (input[2 * stride] - input[5 * stride]) * 4;
-      s6 = (input[1 * stride] - input[6 * stride]) * 4;
-      s7 = (input[0 * stride] - input[7 * stride]) * 4;
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
-      output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
-      output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
-      output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
-      input++;
-      output++;
-    }
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
-    for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
-  }
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  if (!skip_block) {
-    // Quantization pass: All coefficients with index >= zero_flag are
-    // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < n_coeffs; i++) {
-      const int rc = scan[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-
-      int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-      tmp = (tmp * quant_ptr[rc != 0]) >> 16;
-
-      qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
-
-      if (tmp) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                   int tx_type) {
   if (tx_type == DCT_DCT) {
diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.c b/libs/libvpx/vp9/encoder/vp9_denoiser.c
index b08ccaa66ca..2885223b59e 100644
--- a/libs/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/libs/libvpx/vp9/encoder/vp9_denoiser.c
@@ -189,7 +189,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
     int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx,
     int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv,
     int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx,
-    int use_svc, int spatial_layer) {
+    int use_svc, int spatial_layer, int use_gf_temporal_ref) {
   const int sse_diff = (ctx->newmv_sse == UINT_MAX)
                            ? 0
                            : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse);
@@ -201,7 +201,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
   int i;
   struct buf_2d saved_dst[MAX_MB_PLANE];
   struct buf_2d saved_pre[MAX_MB_PLANE];
-  RefBuffer *saved_block_refs[2];
+  const RefBuffer *saved_block_refs[2];
   MV_REFERENCE_FRAME saved_frame;
 
   frame = ctx->best_reference_frame;
@@ -219,8 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
   // If the best reference frame uses inter-prediction and there is enough of a
   // difference in sum-squared-error, use it.
-  if (frame != INTRA_FRAME && frame != ALTREF_FRAME &&
-      (frame != GOLDEN_FRAME || num_spatial_layers == 1) &&
+  if (frame != INTRA_FRAME && frame != ALTREF_FRAME && frame != GOLDEN_FRAME &&
       sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) {
     mi->ref_frame[0] = ctx->best_reference_frame;
     mi->mode = ctx->best_sse_inter_mode;
@@ -230,7 +229,9 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
     frame = ctx->best_zeromv_reference_frame;
     ctx->newmv_sse = ctx->zeromv_sse;
     // Bias to last reference.
-    if (num_spatial_layers > 1 || frame == ALTREF_FRAME ||
+    if ((num_spatial_layers > 1 && !use_gf_temporal_ref) ||
+        frame == ALTREF_FRAME ||
+        (frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
        (frame != LAST_FRAME &&
         ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
          denoiser->denoising_level >= kDenHigh))) {
@@ -261,6 +262,14 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
     denoise_layer_idx = num_spatial_layers - spatial_layer - 1;
   }
 
+  // Force copy (no denoise, copy source in denoised buffer) if
+  // running_avg_y[frame] is NULL.
+  if (denoiser->running_avg_y[frame].buffer_alloc == NULL) {
+    // Restore everything to its original state
+    *mi = saved_mi;
+    return COPY_BLOCK;
+  }
+
   if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) {
     // Restore everything to its original state
     *mi = saved_mi;
@@ -326,7 +335,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
 
 void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
                           BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
-                          VP9_DENOISER_DECISION *denoiser_decision) {
+                          VP9_DENOISER_DECISION *denoiser_decision,
+                          int use_gf_temporal_ref) {
   int mv_col, mv_row;
   int motion_magnitude = 0;
   int zeromv_filter = 0;
@@ -349,6 +359,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
   int is_skin = 0;
   int increase_denoising = 0;
   int consec_zeromv = 0;
+  int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG;
   mv_col = ctx->best_sse_mv.as_mv.col;
   mv_row = ctx->best_sse_mv.as_mv.row;
   motion_magnitude = mv_row * mv_row + mv_col * mv_col;
@@ -379,7 +390,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
           // zero/small motion in skin detection is high, i.e, > 4).
           if (consec_zeromv < 4) {
             i = ymis;
-            j = xmis;
+            break;
           }
         }
       }
@@ -392,12 +403,18 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col,
   }
   if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1;
 
-  if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising)
+  // Copy block if LAST_FRAME is not a reference.
+  // Last doesn't always exist when SVC layers are dynamically changed, e.g. top
+  // spatial layer doesn't have last reference when it's brought up for the
+  // first time on the fly.
+  if (last_is_reference && denoiser->denoising_level >= kDenLow &&
+      !ctx->sb_skip_denoising)
     decision = perform_motion_compensation(
         &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx,
         motion_magnitude, is_skin, &zeromv_filter, consec_zeromv,
         cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx,
-        cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id);
+        cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id,
+        use_gf_temporal_ref);
 
   if (decision == FILTER_BLOCK) {
     decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start,
@@ -445,16 +462,16 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest,
 }
 
 void vp9_denoiser_update_frame_info(
-    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
-    int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
-    int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
-    int svc_base_is_key, int second_spatial_layer) {
+    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+    FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+    int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+    int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) {
   const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0;
   // Copy source into denoised reference buffers on KEY_FRAME or
   // if the just encoded frame was resized. For SVC, copy source if the base
   // spatial layer was key frame.
   if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset ||
-      svc_base_is_key) {
+      svc_refresh_denoiser_buffers) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
     for (i = 1; i < denoiser->num_ref_frames; ++i) {
@@ -465,32 +482,43 @@ void vp9_denoiser_update_frame_info(
     return;
   }
 
-  // If more than one refresh occurs, must copy frame buffer.
-  if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) {
-    if (refresh_alt_ref_frame) {
-      copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
-                 &denoiser->running_avg_y[INTRA_FRAME + shift]);
-    }
-    if (refresh_golden_frame) {
-      copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
-                 &denoiser->running_avg_y[INTRA_FRAME + shift]);
-    }
-    if (refresh_last_frame) {
-      copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
-                 &denoiser->running_avg_y[INTRA_FRAME + shift]);
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->use_set_ref_frame_config) {
+    int i;
+    for (i = 0; i < REF_FRAMES; i++) {
+      if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i))
+        copy_frame(&denoiser->running_avg_y[i + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
     }
   } else {
-    if (refresh_alt_ref_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
-                        &denoiser->running_avg_y[INTRA_FRAME + shift]);
-    }
-    if (refresh_golden_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
-                        &denoiser->running_avg_y[INTRA_FRAME + shift]);
-    }
-    if (refresh_last_frame) {
-      swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
-                        &denoiser->running_avg_y[INTRA_FRAME + shift]);
+    // If more than one refresh occurs, must copy frame buffer.
+    if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) >
+        1) {
+      if (refresh_alt_ref_frame) {
+        copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_golden_frame) {
+        copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_last_frame) {
+        copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                   &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+    } else {
+      if (refresh_alt_ref_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_golden_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
+      if (refresh_last_frame) {
+        swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift],
+                          &denoiser->running_avg_y[INTRA_FRAME + shift]);
+      }
     }
   }
 }
@@ -539,26 +567,38 @@ static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm,
 }
 
 int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
-                             int svc_buf_shift, int refresh_alt,
-                             int refresh_gld, int refresh_lst, int alt_fb_idx,
-                             int gld_fb_idx, int lst_fb_idx) {
+                             struct SVC *svc, int svc_buf_shift,
+                             int refresh_alt, int refresh_gld, int refresh_lst,
+                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) {
   int fail = 0;
-  if (refresh_alt) {
-    // Increase the frame buffer index by 1 to map it to the buffer index in the
-    // denoiser.
-    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
-                                           alt_fb_idx + 1 + svc_buf_shift);
-    if (fail) return 1;
-  }
-  if (refresh_gld) {
-    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
-                                           gld_fb_idx + 1 + svc_buf_shift);
-    if (fail) return 1;
-  }
-  if (refresh_lst) {
-    fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
-                                           lst_fb_idx + 1 + svc_buf_shift);
-    if (fail) return 1;
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->use_set_ref_frame_config) {
+    int i;
+    for (i = 0; i < REF_FRAMES; i++) {
+      if (cm->frame_type == KEY_FRAME ||
+          svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) {
+        fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                               i + 1 + svc_buf_shift);
+      }
+    }
+  } else {
+    if (refresh_alt) {
+      // Increase the frame buffer index by 1 to map it to the buffer index in
+      // the denoiser.
+      fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                             alt_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+    if (refresh_gld) {
+      fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                             gld_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
+    if (refresh_lst) {
+      fail = vp9_denoiser_realloc_svc_helper(cm, denoiser,
+                                             lst_fb_idx + 1 + svc_buf_shift);
+      if (fail) return 1;
+    }
   }
   return 0;
 }
@@ -648,9 +688,10 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
     make_grayscale(&denoiser->running_avg_y[i]);
 #endif
   denoiser->frame_buffer_initialized = 1;
-  denoiser->denoising_level = kDenLow;
-  denoiser->prev_denoising_level = kDenLow;
+  denoiser->denoising_level = kDenMedium;
+  denoiser->prev_denoising_level = kDenMedium;
   denoiser->reset = 0;
+  denoiser->current_denoiser_frame = 0;
   return 0;
 }
 
@@ -675,13 +716,29 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) {
   vpx_free_frame_buffer(&denoiser->last_source);
 }
 
-void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) {
+static void force_refresh_longterm_ref(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // If long term reference is used, force refresh of that slot, so
+  // denoiser buffer for long term reference stays in sync.
+  if (svc->use_gf_temporal_ref_current_layer) {
+    int index = svc->spatial_layer_id;
+    if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+    assert(index >= 0);
+    cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+    cpi->refresh_alt_ref_frame = 1;
+  }
+}
+
+void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) {
+  VP9_DENOISER *const denoiser = &cpi->denoiser;
   denoiser->denoising_level = noise_level;
   if (denoiser->denoising_level > kDenLowLow &&
-      denoiser->prev_denoising_level == kDenLowLow)
+      denoiser->prev_denoising_level == kDenLowLow) {
     denoiser->reset = 1;
-  else
+    force_refresh_longterm_ref(cpi);
+  } else {
     denoiser->reset = 0;
+  }
   denoiser->prev_denoising_level = denoiser->denoising_level;
 }
 
@@ -713,6 +770,56 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold,
   return threshold;
 }
 
+void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) {
+  if (vp9_denoise_svc_non_key(cpi) &&
+      cpi->denoiser.current_denoiser_frame == 0) {
+    cpi->denoiser.reset = 1;
+    force_refresh_longterm_ref(cpi);
+  }
+}
+
+void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+
+  if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
+      cpi->denoiser.denoising_level > kDenLowLow) {
+    int svc_refresh_denoiser_buffers = 0;
+    int denoise_svc_second_layer = 0;
+    FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
+    cpi->denoiser.current_denoiser_frame++;
+    if (cpi->use_svc) {
+      const int svc_buf_shift =
+          svc->number_spatial_layers - svc->spatial_layer_id == 2
+              ? cpi->denoiser.num_ref_frames
+              : 0;
+      int layer =
+          LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
+                           svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      svc_refresh_denoiser_buffers =
+          lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id];
+      denoise_svc_second_layer =
+          svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0;
+      // Check if we need to allocate extra buffers in the denoiser
+      // for refreshed frames.
+      if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift,
+                                   cpi->refresh_alt_ref_frame,
+                                   cpi->refresh_golden_frame,
+                                   cpi->refresh_last_frame, cpi->alt_fb_idx,
+                                   cpi->gld_fb_idx, cpi->lst_fb_idx))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to re-allocate denoiser for SVC");
+    }
+    vp9_denoiser_update_frame_info(
+        &cpi->denoiser, *cpi->Source, svc, frame_type,
+        cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame,
+        cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx,
+        cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers,
+        denoise_svc_second_layer);
+  }
+}
+
 #ifdef OUTPUT_YUV_DENOISED
 static void make_grayscale(YV12_BUFFER_CONFIG *yuv) {
   int r, c;
diff --git a/libs/libvpx/vp9/encoder/vp9_denoiser.h b/libs/libvpx/vp9/encoder/vp9_denoiser.h
index f4da24cbf61..1973e989886 100644
--- a/libs/libvpx/vp9/encoder/vp9_denoiser.h
+++ b/libs/libvpx/vp9/encoder/vp9_denoiser.h
@@ -8,8 +8,8 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_DENOISER_H_
-#define VP9_ENCODER_DENOISER_H_
+#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_
+#define VPX_VP9_ENCODER_VP9_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
 #include "vp9/encoder/vp9_skin_detection.h"
@@ -50,6 +50,7 @@ typedef struct vp9_denoiser {
   int reset;
   int num_ref_frames;
   int num_layers;
+  unsigned int current_denoiser_frame;
   VP9_DENOISER_LEVEL denoising_level;
   VP9_DENOISER_LEVEL prev_denoising_level;
 } VP9_DENOISER;
@@ -70,14 +71,15 @@ struct VP9_COMP;
 struct SVC;
 
 void vp9_denoiser_update_frame_info(
-    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type,
-    int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame,
-    int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized,
-    int svc_base_is_key, int second_spatial_layer);
+    VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc,
+    FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame,
+    int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx,
+    int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer);
 
 void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row,
                           int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx,
-                          VP9_DENOISER_DECISION *denoiser_decision);
+                          VP9_DENOISER_DECISION *denoiser_decision,
+                          int use_gf_temporal_ref);
 
 void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx);
 
@@ -86,9 +88,9 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse,
                                      PICK_MODE_CONTEXT *ctx);
 
 int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser,
-                             int svc_buf_shift, int refresh_alt,
-                             int refresh_gld, int refresh_lst, int alt_fb_idx,
-                             int gld_fb_idx, int lst_fb_idx);
+                             struct SVC *svc, int svc_buf_shift,
+                             int refresh_alt, int refresh_gld, int refresh_lst,
+                             int alt_fb_idx, int gld_fb_idx, int lst_fb_idx);
 
 int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser,
                        int use_svc, int noise_sen, int width, int height,
@@ -110,7 +112,9 @@ static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs,
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
 
-void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level);
+void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level);
+
+void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi);
 
 int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level,
                               int content_state, int temporal_layer_id);
@@ -119,8 +123,10 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold,
                                 VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
                                 int temporal_layer_id);
 
+void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
 
-#endif  // VP9_ENCODER_DENOISER_H_
+#endif  // VPX_VP9_ENCODER_VP9_DENOISER_H_
diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.c b/libs/libvpx/vp9/encoder/vp9_encodeframe.c
index 682477df18b..d47b411fa8d 100644
--- a/libs/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -8,6 +8,7 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <float.h>
 #include <limits.h>
 #include <math.h>
 #include <stdio.h>
@@ -21,6 +22,10 @@
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_ports/system_state.h"
 
+#if CONFIG_MISMATCH_DEBUG
+#include "vpx_util/vpx_debug_util.h"
+#endif  // CONFIG_MISMATCH_DEBUG
+
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
@@ -32,16 +37,21 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_tile_common.h"
-
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_aq_360.h"
 #include "vp9/encoder/vp9_aq_complexity.h"
+#endif
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
+#if !CONFIG_REALTIME_ONLY
 #include "vp9/encoder/vp9_aq_variance.h"
+#endif
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+#include "vp9/encoder/vp9_partition_models.h"
 #include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_rdopt.h"
@@ -52,33 +62,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
 
-// Machine learning-based early termination parameters.
-static const double train_mean[24] = {
-  303501.697372, 3042630.372158, 24.694696, 1.392182,
-  689.413511,    162.027012,     1.478213,  0.0,
-  135382.260230, 912738.513263,  28.845217, 1.515230,
-  544.158492,    131.807995,     1.436863,  0.0,
-  43682.377587,  208131.711766,  28.084737, 1.356677,
-  138.254122,    119.522553,     1.252322,  0.0
-};
-
-static const double train_stdm[24] = {
-  673689.212982, 5996652.516628, 0.024449, 1.989792,
-  985.880847,    0.014638,       2.001898, 0.0,
-  208798.775332, 1812548.443284, 0.018693, 1.838009,
-  396.986910,    0.015657,       1.332541, 0.0,
-  55888.847031,  448587.962714,  0.017900, 1.904776,
-  98.652832,     0.016598,       1.320992, 0.0
-};
-
-// Error tolerance: 0.01%-0.0.05%-0.1%
-static const double classifiers[24] = {
-  0.111736, 0.289977, 0.042219, 0.204765, 0.120410, -0.143863,
-  0.282376, 0.847811, 0.637161, 0.131570, 0.018636, 0.202134,
-  0.112797, 0.028162, 0.182450, 1.124367, 0.386133, 0.083700,
-  0.050028, 0.150873, 0.061119, 0.109318, 0.127255, 0.625211
-};
-
 // This is used as a reference when computing the source variance for the
 // purpose of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
@@ -176,6 +159,7 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi,
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if !CONFIG_REALTIME_ONLY
 static unsigned int get_sby_perpixel_diff_variance(VP9_COMP *cpi,
                                                    const struct buf_2d *ref,
                                                    int mi_row, int mi_col,
@@ -204,6 +188,72 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x,
   else
     return BLOCK_8X8;
 }
+#endif  // !CONFIG_REALTIME_ONLY
+
+static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row,
+                              int mi_col, BLOCK_SIZE bsize, int segment_index) {
+  VP9_COMMON *const cm = &cpi->common;
+  const struct segmentation *const seg = &cm->seg;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = xd->mi[0];
+
+  const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+  const uint8_t *const map =
+      seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+
+  // Initialize the segmentation index as 0.
+  mi->segment_id = 0;
+
+  // Skip the rest if AQ mode is disabled.
+  if (!seg->enabled) return;
+
+  switch (aq_mode) {
+    case CYCLIC_REFRESH_AQ:
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      break;
+#if !CONFIG_REALTIME_ONLY
+    case VARIANCE_AQ:
+      if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+          cpi->force_update_segmentation ||
+          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+        int min_energy;
+        int max_energy;
+        // Get sub block energy range
+        if (bsize >= BLOCK_32X32) {
+          vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy,
+                                   &max_energy);
+        } else {
+          min_energy = bsize <= BLOCK_16X16 ? x->mb_energy
+                                            : vp9_block_energy(cpi, x, bsize);
+        }
+        mi->segment_id = vp9_vaq_segment_id(min_energy);
+      } else {
+        mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      }
+      break;
+    case EQUATOR360_AQ:
+      if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation)
+        mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows);
+      else
+        mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      break;
+#endif
+    case LOOKAHEAD_AQ:
+      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      break;
+    case PSNR_AQ: mi->segment_id = segment_index; break;
+    case PERCEPTUAL_AQ: mi->segment_id = x->segment_id; break;
+    default:
+      // NO_AQ or PSNR_AQ
+      break;
+  }
+
+  // Set segment index from ROI map if it's enabled.
+  if (cpi->roi.enabled)
+    mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+
+  vp9_init_plane_quantizers(cpi, x);
+}
 
 // Lighter version of set_offsets that only sets the mode info
 // pointers.
@@ -217,23 +267,57 @@ static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
   x->mbmi_ext = x->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 }
 
+static void set_ssim_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, const int mi_row,
+                            const int mi_col, int *const rdmult) {
+  const VP9_COMMON *const cm = &cpi->common;
+
+  const int bsize_base = BLOCK_16X16;
+  const int num_8x8_w = num_8x8_blocks_wide_lookup[bsize_base];
+  const int num_8x8_h = num_8x8_blocks_high_lookup[bsize_base];
+  const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w;
+  const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h;
+  const int num_bcols =
+      (num_8x8_blocks_wide_lookup[bsize] + num_8x8_w - 1) / num_8x8_w;
+  const int num_brows =
+      (num_8x8_blocks_high_lookup[bsize] + num_8x8_h - 1) / num_8x8_h;
+  int row, col;
+  double num_of_mi = 0.0;
+  double geom_mean_of_scale = 0.0;
+
+  assert(cpi->oxcf.tuning == VP8_TUNE_SSIM);
+
+  for (row = mi_row / num_8x8_w;
+       row < num_rows && row < mi_row / num_8x8_w + num_brows; ++row) {
+    for (col = mi_col / num_8x8_h;
+         col < num_cols && col < mi_col / num_8x8_h + num_bcols; ++col) {
+      const int index = row * num_cols + col;
+      geom_mean_of_scale += log(cpi->mi_ssim_rdmult_scaling_factors[index]);
+      num_of_mi += 1.0;
+    }
+  }
+  geom_mean_of_scale = exp(geom_mean_of_scale / num_of_mi);
+
+  *rdmult = (int)((double)(*rdmult) * geom_mean_of_scale);
+  *rdmult = VPXMAX(*rdmult, 0);
+  set_error_per_bit(x, *rdmult);
+  vpx_clear_system_state();
+}
+
 static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
                         MACROBLOCK *const x, int mi_row, int mi_col,
                         BLOCK_SIZE bsize) {
   VP9_COMMON *const cm = &cpi->common;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MODE_INFO *mi;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const struct segmentation *const seg = &cm->seg;
   MvLimits *const mv_limits = &x->mv_limits;
 
   set_skip_context(xd, mi_row, mi_col);
 
   set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
 
-  mi = xd->mi[0];
-
   // Set up destination pointers.
   vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
 
@@ -255,21 +339,8 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
   // R/D setup.
   x->rddiv = cpi->rd.RDDIV;
   x->rdmult = cpi->rd.RDMULT;
-
-  // Setup segment ID.
-  if (seg->enabled) {
-    if (cpi->oxcf.aq_mode != VARIANCE_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ &&
-        cpi->oxcf.aq_mode != EQUATOR360_AQ) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
-    }
-    vp9_init_plane_quantizers(cpi, x);
-
-    x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id];
-  } else {
-    mi->segment_id = 0;
-    x->encode_breakout = cpi->encode_breakout;
+  if (oxcf->tuning == VP8_TUNE_SSIM) {
+    set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult);
   }
 
   // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs()
@@ -385,16 +456,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
         node->split[i] = &vt->split[i].part_variances.none;
       break;
     }
-    case BLOCK_4X4: {
+    default: {
       v4x4 *vt = (v4x4 *)data;
+      assert(bsize == BLOCK_4X4);
       node->part_variances = &vt->part_variances;
       for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
       break;
     }
-    default: {
-      assert(0);
-      break;
-    }
   }
 }
 
@@ -408,7 +476,8 @@ static void fill_variance(uint32_t s2, int32_t s, int c, var *v) {
 static void get_variance(var *v) {
   v->variance =
       (int)(256 * (v->sum_square_error -
-                   ((v->sum_error * v->sum_error) >> v->log2_count)) >>
+                   (uint32_t)(((int64_t)v->sum_error * v->sum_error) >>
+                              v->log2_count)) >>
             v->log2_count);
 }
 
@@ -450,7 +519,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
     // Variance already computed to set the force_split.
-    if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none);
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
         vt.part_variances->none.variance < threshold) {
@@ -460,9 +529,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x,
     return 0;
   } else if (bsize > bsize_min) {
     // Variance already computed to set the force_split.
-    if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none);
+    if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none);
     // For key frame: take split for bsize above 32X32 or very high variance.
-    if (cm->frame_type == KEY_FRAME &&
+    if (frame_is_intra_only(cm) &&
         (bsize > BLOCK_32X32 ||
          vt.part_variances->none.variance > (threshold << 4))) {
       return 0;
@@ -534,8 +603,9 @@ static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
 static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q,
                                int content_state) {
   VP9_COMMON *const cm = &cpi->common;
-  const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int threshold_multiplier = is_key_frame ? 20 : 1;
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int threshold_multiplier =
+      is_key_frame ? 20 : cpi->sf.variance_part_thresh_mult;
   int64_t threshold_base =
       (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]);
@@ -586,6 +656,7 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q,
     } else {
       thresholds[1] = (5 * threshold_base) >> 1;
     }
+    if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX;
   }
 }
 
@@ -593,7 +664,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q,
                                            int content_state) {
   VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
-  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int is_key_frame = frame_is_intra_only(cm);
   if (sf->partition_search_type != VAR_BASED_PARTITION &&
       sf->partition_search_type != REFERENCE_PARTITION) {
     return;
@@ -620,6 +691,11 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q,
       cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000
                                     ? (cpi->y_dequant[q][1] << 3)
                                     : 8000;
+      if (cpi->rc.high_source_sad ||
+          (cpi->use_svc && cpi->svc.high_source_sad_superframe)) {
+        cpi->vbp_threshold_sad = 0;
+        cpi->vbp_threshold_copy = 0;
+      }
     }
     cpi->vbp_threshold_minmax = 15 + (q >> 3);
   }
@@ -885,13 +961,13 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
         set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
         break;
-      case PARTITION_SPLIT:
+      default:
+        assert(partition == PARTITION_SPLIT);
         copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
         copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
         copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
         copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
         break;
-      default: assert(0);
     }
   }
 }
@@ -940,18 +1016,20 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
   const int has_rows = (mi_row_high + bs_high) < cm->mi_rows;
   const int has_cols = (mi_col_high + bs_high) < cm->mi_cols;
 
-  const int row_boundary_block_scale_factor[BLOCK_SIZES] = {
-    13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0
-  };
-  const int col_boundary_block_scale_factor[BLOCK_SIZES] = {
-    13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0
-  };
+  const int row_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 1, 0,
+                                                             1,  1,  0,  1, 1,
+                                                             0,  1,  0 };
+  const int col_boundary_block_scale_factor[BLOCK_SIZES] = { 13, 13, 13, 2, 2,
+                                                             0,  2,  2,  0, 2,
+                                                             2,  0,  0 };
   int start_pos;
   BLOCK_SIZE bsize_low;
   PARTITION_TYPE partition_high;
 
   if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0;
-  if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0;
+  if (mi_row >= svc->mi_rows[svc->spatial_layer_id - 1] ||
+      mi_col >= svc->mi_cols[svc->spatial_layer_id - 1])
+    return 0;
 
   // Find corresponding (mi_col/mi_row) block down-scaled by 2x2.
   start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col;
@@ -1004,7 +1082,8 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
           set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high,
                          subsize_high);
         break;
-      case PARTITION_SPLIT:
+      default:
+        assert(partition_high == PARTITION_SPLIT);
         if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col,
                                    mi_row_high, mi_col_high))
           return 1;
@@ -1020,7 +1099,6 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
                                    mi_col_high + bs_high))
           return 1;
         break;
-      default: assert(0);
     }
   }
 
@@ -1067,13 +1145,13 @@ static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
         prev_part[start_pos] = subsize;
         if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
         break;
-      case PARTITION_SPLIT:
+      default:
+        assert(partition == PARTITION_SPLIT);
         update_partition_svc(cpi, subsize, mi_row, mi_col);
         update_partition_svc(cpi, subsize, mi_row + bs, mi_col);
         update_partition_svc(cpi, subsize, mi_row, mi_col + bs);
         update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs);
         break;
-      default: assert(0);
     }
   }
 }
@@ -1108,13 +1186,13 @@ static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
         prev_part[start_pos] = subsize;
         if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
         break;
-      case PARTITION_SPLIT:
+      default:
+        assert(partition == PARTITION_SPLIT);
         update_prev_partition_helper(cpi, subsize, mi_row, mi_col);
         update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col);
         update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs);
         update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs);
         break;
-      default: assert(0);
    }
  }
 }
@@ -1206,6 +1284,7 @@ static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift,
       cpi->content_state_sb_fd[sb_offset] = 0;
     }
   }
+  if (tmp_sad == 0) x->zero_temp_sad_source = 1;
   return tmp_sad;
 }
 
@@ -1241,21 +1320,40 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   int pixels_wide = 64, pixels_high = 64;
   int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
                             cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] };
+  int force_64_split = cpi->rc.high_source_sad ||
+                       (cpi->use_svc && cpi->svc.high_source_sad_superframe) ||
+                       (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+                        cpi->compute_source_sad_onepass &&
+                        cpi->sf.use_source_sad && !x->zero_temp_sad_source);
 
   // For the variance computation under SVC mode, we treat the frame as key if
   // the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
-  const int is_key_frame =
-      (cm->frame_type == KEY_FRAME ||
+  int is_key_frame =
+      (frame_is_intra_only(cm) ||
       (is_one_pass_cbr_svc(cpi) &&
        cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
   // Always use 4x4 partition for key frame.
-  const int use_4x4_partition = cm->frame_type == KEY_FRAME;
+  const int use_4x4_partition = frame_is_intra_only(cm);
   const int low_res = (cm->width <= 352 && cm->height <= 288);
   int variance4x4downsample[16];
   int segment_id;
   int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3);
 
+  // For SVC: check if LAST frame is NULL or if the resolution of LAST is
+  // different than the current frame resolution, and if so, treat this frame
+  // as a key frame, for the purpose of the superblock partitioning.
+  // LAST == NULL can happen in some cases where enhancement spatial layers are
+  // enabled dynamically in the stream and the only reference is the spatial
+  // reference (GOLDEN).
+  if (cpi->use_svc) {
+    const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME);
+    if (ref == NULL || ref->y_crop_height != cm->height ||
+        ref->y_crop_width != cm->width)
+      is_key_frame = 1;
+  }
+
   set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+  set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
   segment_id = xd->mi[0]->segment_id;
   if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame))
@@ -1289,6 +1387,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   }
   // If source_sad is low copy the partition without computing the y_sad.
   if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
+      !force_64_split &&
       copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
     x->sb_use_mv_part = 1;
     if (cpi->sf.svc_use_lowres_part &&
@@ -1305,6 +1404,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   } else {
     set_vbp_thresholds(cpi, thresholds, cm->base_qindex, content_state);
   }
+  // Decrease 32x32 split threshold for screen on base layer, for scene
+  // change/high motion frames.
+  if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+      cpi->svc.spatial_layer_id == 0 && force_64_split)
+    thresholds[1] = 3 * thresholds[1] >> 2;
   // For non keyframes, disable 4x4 average for low resolution when speed = 8
   threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX;
@@ -1317,7 +1421,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
   // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
   // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
+  force_split[0] = force_64_split;
   if (!is_key_frame) {
     // In the case of spatial/temporal scalable coding, the assumption here is
@@ -1333,7 +1437,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
     assert(yv12 != NULL);
-    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) {
+    if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+        cpi->svc.use_gf_temporal_ref_current_layer) {
       // For now, GOLDEN will not be used for non-zero spatial layers, since
       // it may not be a temporal reference.
       yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
@@ -1374,10 +1479,28 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
                           x->plane[0].src.buf, x->plane[0].src.stride,
                           xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
     } else {
-      y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+      const MV dummy_mv = { 0, 0 };
+      y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col,
+                                            &dummy_mv);
       x->sb_use_mv_part = 1;
       x->sb_mvcol_part = mi->mv[0].as_mv.col;
       x->sb_mvrow_part = mi->mv[0].as_mv.row;
+      if (cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+          cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+          cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source &&
+          cm->width > 640 && cm->height > 480) {
+        // Disable split below 16x16 block size when scroll motion (horz or
+        // vert) is detected.
+        // TODO(marpan/jianj): Improve this condition: issue is that search
+        // range is hard-coded/limited in vp9_int_pro_motion_estimation() so
+        // scroll motion may not be detected here.
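/* Editorial sketch (hypothetical helper, not part of the patch): the test
 * that follows classifies the superblock MV as scroll motion when it is
 * large in one direction, near-zero in the other, and the temporal SAD is
 * moderate. Thresholds mirror the condition below; assumes <stdlib.h> for
 * abs(). */
static int is_scroll_motion_sketch(int mvrow, int mvcol, unsigned int y_sad) {
  return ((abs(mvrow) >= 48 && abs(mvcol) <= 8) ||
          (abs(mvcol) >= 48 && abs(mvrow) <= 8)) &&
         y_sad < 100000;
}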
+ if (((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) || + (abs(x->sb_mvcol_part) >= 48 && abs(x->sb_mvrow_part) <= 8)) && + y_sad < 100000) { + compute_minmax_variance = 0; + thresholds[2] = INT64_MAX; + } + } } y_sad_last = y_sad; @@ -1513,9 +1636,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } - if (is_key_frame || (low_res && - vt.split[i].split[j].part_variances.none.variance > - threshold_4x4avg)) { + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -1648,11 +1771,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } - if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) { update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); } - if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); @@ -1666,6 +1789,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, return 0; } +#if !CONFIG_REALTIME_ONLY static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, BLOCK_SIZE bsize, int output_enabled) { @@ -1794,6 +1918,7 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, } } } +#endif // !CONFIG_REALTIME_ONLY void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col) { @@ -1836,20 +1961,41 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, vp9_rd_cost_init(rd_cost); } -static int set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, - int8_t segment_id) { - int segment_qindex; +#if !CONFIG_REALTIME_ONLY +static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode) { VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const uint8_t *const map = + cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + vp9_init_plane_quantizers(cpi, x); vpx_clear_system_state(); - segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); + + if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) { + if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult; + } else if (aq_mode == PERCEPTUAL_AQ) { + x->rdmult = x->cb_rdmult; + } else if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. 
+ if (cyclic_refresh_segment_id_boosted( + get_segment_id(cm, map, bsize, mi_row, mi_col))) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + } else { + x->rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); + } + + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } } static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, - PICK_MODE_CONTEXT *ctx, int64_t best_rd) { + PICK_MODE_CONTEXT *ctx, int rate_in_best_rd, + int64_t dist_in_best_rd) { VP9_COMMON *const cm = &cpi->common; TileInfo *const tile_info = &tile_data->tile_info; MACROBLOCKD *const xd = &x->e_mbd; @@ -1858,6 +2004,7 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = xd->plane; const AQ_MODE aq_mode = cpi->oxcf.aq_mode; int i, orig_rdmult; + int64_t best_rd = INT64_MAX; vpx_clear_system_state(); @@ -1914,43 +2061,11 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; } - if (aq_mode == VARIANCE_AQ) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - - if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || - cpi->force_update_segmentation || - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { - mi->segment_id = vp9_vaq_segment_id(energy); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == LOOKAHEAD_AQ) { - const uint8_t *const map = cpi->segmentation_map; - - // I do not change rdmult here consciously. - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else if (aq_mode == EQUATOR360_AQ) { - if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) { - mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == COMPLEXITY_AQ) { - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == CYCLIC_REFRESH_AQ) { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - // If segment is boosted, use rdmult for that segment. - if (cyclic_refresh_segment_id_boosted( - get_segment_id(cm, map, bsize, mi_row, mi_col))) - x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode); + if (rate_in_best_rd < INT_MAX && dist_in_best_rd < INT64_MAX) { + best_rd = vp9_calculate_rd_cost(x->rdmult, x->rddiv, rate_in_best_rd, + dist_in_best_rd); } // Find best coding mode & reconstruct the MB so it is available @@ -1979,15 +2094,19 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } - x->rdmult = orig_rdmult; - // TODO(jingning) The rate-distortion optimization flow needs to be // refactored to provide proper exit/return handle. 
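/* Editorial sketch (not part of the patch): the hunk below tightens the
 * invalid-result check to both sentinels before forming the Lagrangian
 * cost; rate == INT_MAX or dist == INT64_MAX marks a failed mode search,
 * and feeding a sentinel into RDCOST() would wrap around. Assumes
 * <limits.h> and <stdint.h> for the constants. */
static int rd_cost_is_valid_sketch(int rate, int64_t dist) {
  return rate != INT_MAX && dist != INT64_MAX;
}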
- if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + if (rd_cost->rate == INT_MAX || rd_cost->dist == INT64_MAX) + rd_cost->rdcost = INT64_MAX; + else + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + + x->rdmult = orig_rdmult; ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; } +#endif // !CONFIG_REALTIME_ONLY static void update_stats(VP9_COMMON *cm, ThreadData *td) { const MACROBLOCK *x = &td->mb; @@ -2013,8 +2132,10 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { [has_second_ref(mi)]++; if (has_second_ref(mi)) { - counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] - [ref0 == GOLDEN_FRAME]++; + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1]; + counts->comp_ref[ctx][bit]++; } else { counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] [ref0 != LAST_FRAME]++; @@ -2046,6 +2167,7 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { } } +#if !CONFIG_REALTIME_ONLY static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], @@ -2110,6 +2232,16 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td, PICK_MODE_CONTEXT *ctx) { MACROBLOCK *const x = &td->mb; set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + + if (cpi->sf.enable_tpl_model && + (cpi->oxcf.aq_mode == NO_AQ || cpi->oxcf.aq_mode == PERCEPTUAL_AQ)) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + x->rdmult = x->cb_rdmult; + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &x->rdmult); + } + } + update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled); encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); @@ -2168,7 +2300,8 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->leaf_split[0]); @@ -2183,12 +2316,12 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#endif // !CONFIG_REALTIME_ONLY // Check to see if the given partition size is allowed for a specified number // of 8x8 block rows and columns remaining in the image. @@ -2393,17 +2526,15 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, *(xd->mi[0]) = ctx->mic; *(x->mbmi_ext) = ctx->mbmi_ext; - if (seg->enabled && cpi->oxcf.aq_mode != NO_AQ) { - // For in frame complexity AQ or variance AQ, copy segment_id from - // segmentation_map. - if (cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ) { + if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) { + // Setting segmentation map for cyclic_refresh. + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, + ctx->rate, ctx->dist, x->skip, p); + } else { const uint8_t *const map = seg->update_map ? 
cpi->segmentation_map : cm->last_frame_seg_map; mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else { - // Setting segmentation map for cyclic_refresh. - vp9_cyclic_refresh_update_segment(cpi, mi, mi_row, mi_col, bsize, - ctx->rate, ctx->dist, x->skip, p); } vp9_init_plane_quantizers(cpi, x); } @@ -2441,7 +2572,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } x->skip = ctx->skip; - x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0]; + x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, @@ -2509,7 +2640,8 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->split[0]); @@ -2520,13 +2652,13 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, subsize, pc_tree->split[3]); break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#if !CONFIG_REALTIME_ONLY static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, MODE_INFO **mi_8x8, TOKENEXTRA **tp, int mi_row, int mi_col, @@ -2595,7 +2727,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, mi_col + (mi_step >> 1) < cm->mi_cols) { pc_tree->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize, ctx, - INT64_MAX); + INT_MAX, INT64_MAX); pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2614,11 +2746,12 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, switch (partition) { case PARTITION_NONE: rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, bsize, - ctx, INT64_MAX); + ctx, INT_MAX, INT64_MAX); break; case PARTITION_HORZ: + pc_tree->horizontal[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, &pc_tree->horizontal[0], INT64_MAX); + subsize, &pc_tree->horizontal[0], INT_MAX, INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; @@ -2626,8 +2759,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, - &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX); + &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, + INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2638,8 +2773,9 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, } break; case PARTITION_VERT: + pc_tree->vertical[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, &pc_tree->vertical[0], INT64_MAX); + subsize, &pc_tree->vertical[0], INT_MAX, INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; @@ -2647,9 +2783,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, 
vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), - &tmp_rdc, subsize, - &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); + pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; + rd_pick_sb_modes( + cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, + subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT_MAX, INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { vp9_rd_cost_reset(&last_part_rdc); break; @@ -2659,10 +2796,11 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.rdcost += tmp_rdc.rdcost; } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, - subsize, pc_tree->leaf_split[0], INT64_MAX); + subsize, pc_tree->leaf_split[0], INT_MAX, INT64_MAX); break; } last_part_rdc.rate = 0; @@ -2689,7 +2827,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.dist += tmp_rdc.dist; } break; - default: assert(0); break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -2727,7 +2864,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->split[i]->partitioning = PARTITION_NONE; rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx, &tmp_rdc, split_subsize, &pc_tree->split[i]->none, - INT64_MAX); + INT_MAX, INT64_MAX); restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2961,6 +3098,7 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, *min_bs = min_size; *max_bs = max_size; } +#endif // !CONFIG_REALTIME_ONLY static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); @@ -2975,15 +3113,15 @@ const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4 }; const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4 }; -const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { - 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120 -}; -const int qindex_split_threshold_lookup[BLOCK_SIZES] = { - 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120 -}; -const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6 -}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = { 0, 10, 10, 30, 40, + 40, 60, 80, 80, 90, + 100, 100, 120 }; +const int qindex_split_threshold_lookup[BLOCK_SIZES] = { 0, 3, 3, 7, 15, + 15, 30, 40, 40, 60, + 80, 80, 120 }; +const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = { 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, + 4, 4, 6 }; typedef enum { MV_ZERO = 0, @@ -3018,14 +3156,60 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif -// Calculate the score used in machine-learning based partition search early -// termination. -static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize) { - const double *clf; - const double *mean; - const double *sd; +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. 
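/* Editorial usage sketch for the nn_predict() defined just below (not part
 * of the patch). The NN_CONFIG field names are inferred from how the
 * function reads them; the 2-input / 1-hidden-node / 1-output net is made
 * up for illustration. The last weights/bias entry belongs to the output
 * layer. */
static const float toy_hidden_weights[2] = { 0.5f, -0.25f };
static const float toy_hidden_bias[1] = { 0.1f };
static const float toy_out_weights[1] = { 2.0f };
static const float toy_out_bias[1] = { -0.3f };
static const NN_CONFIG toy_net = {
  .num_inputs = 2,
  .num_outputs = 1,
  .num_hidden_layers = 1,
  .num_hidden_nodes = { 1 },
  .weights = { toy_hidden_weights, toy_out_weights },
  .bias = { toy_hidden_bias, toy_out_bias },
};
/* With features = { 1.0f, 2.0f }:
 *   hidden = relu(0.5 * 1.0 - 0.25 * 2.0 + 0.1) = 0.1
 *   output = 2.0 * 0.1 - 0.3 = -0.1
 * i.e. nn_predict(features, &toy_net, &score) writes score = -0.1f. */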
+static void nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + const float *input_nodes = features; + + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + int layer, node, i; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (layer = 0; layer < num_layers; ++layer) { + const float *weights = nn_config->weights[layer]; + const float *bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (node = 0; node < num_output_nodes; ++node) { + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + val += bias[node]; + // ReLU as activation function. + val = VPXMAX(val, 0.0f); + output_nodes[node] = val; + weights += num_input_nodes; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + { + const float *weights = nn_config->weights[num_layers]; + for (node = 0; node < nn_config->num_outputs; ++node) { + const float *bias = nn_config->bias[num_layers]; + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + output[node] = val + bias[node]; + weights += num_input_nodes; + } + } +} + +#if !CONFIG_REALTIME_ONLY +#define FEATURES 7 +// Machine-learning based partition search early termination. +// Return 1 to skip split and rect partitions. +static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { const int mag_mv = abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); const int left_in_image = !!xd->left_mi; @@ -3035,11 +3219,32 @@ static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, int above_par = 0; // above_partitioning int left_par = 0; // left_partitioning int last_par = 0; // last_partitioning - BLOCK_SIZE context_size; - double score; int offset = 0; + int i; + BLOCK_SIZE context_size; + const NN_CONFIG *nn_config = NULL; + const float *mean, *sd, *linear_weights; + float nn_score, linear_score; + float features[FEATURES]; assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + vpx_clear_system_state(); + + switch (bsize) { + case BLOCK_64X64: + offset = 0; + nn_config = &vp9_partition_nnconfig_64x64; + break; + case BLOCK_32X32: + offset = 8; + nn_config = &vp9_partition_nnconfig_32x32; + break; + case BLOCK_16X16: + offset = 16; + nn_config = &vp9_partition_nnconfig_16x16; + break; + default: assert(0 && "Unexpected block size."); return 0; + } if (above_in_image) { context_size = xd->above_mi->sb_type; @@ -3065,119 +3270,653 @@ static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, last_par = 1; } - if (bsize == BLOCK_64X64) - offset = 0; - else if (bsize == BLOCK_32X32) - offset = 8; - else if (bsize == BLOCK_16X16) - offset = 16; - - // early termination score calculation - clf = &classifiers[offset]; - mean = &train_mean[offset]; - sd = &train_stdm[offset]; - score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + - clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + - clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + - clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) + - clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + - clf[5] * 
(((double)cm->base_qindex - mean[5]) * sd[5]) + - clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7]; - return score; + mean = &vp9_partition_feature_mean[offset]; + sd = &vp9_partition_feature_std[offset]; + features[0] = ((float)ctx->rate - mean[0]) / sd[0]; + features[1] = ((float)ctx->dist - mean[1]) / sd[1]; + features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2]; + features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3]; + features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4]; + features[5] = ((float)cm->base_qindex - mean[5]) * sd[5]; + features[6] = ((float)last_par - mean[6]) * sd[6]; + + // Predict using linear model. + linear_weights = &vp9_partition_linear_weights[offset]; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + if (linear_score > 0.1f) return 0; + + // Predict using neural net model. + nn_predict(features, nn_config, &nn_score); + + if (linear_score < -0.0f && nn_score < 0.1f) return 1; + if (nn_score < -0.0f && linear_score < 0.1f) return 1; + return 0; } +#undef FEATURES + +#define FEATURES 4 +// ML-based partition search breakout. +static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_COST *const rd_cost) { + DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + const VP9_COMMON *const cm = &cpi->common; + float features[FEATURES]; + const float *linear_weights = NULL; // Linear model weights. + float linear_score = 0.0f; + const int qindex = cm->base_qindex; + const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2); + const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720; + const int resolution_ctx = is_720p_or_larger ? 1 : 0; -// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are -// unlikely to be selected depending on previous rate-distortion optimization -// results, for encoding speed-up. 
-static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, - TileDataEnc *tile_data, TOKENEXTRA **tp, - int mi_row, int mi_col, BLOCK_SIZE bsize, - RD_COST *rd_cost, int64_t best_rd, - PC_TREE *pc_tree) { - VP9_COMMON *const cm = &cpi->common; - TileInfo *const tile_info = &tile_data->tile_info; - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; - TOKENEXTRA *tp_orig = *tp; - PICK_MODE_CONTEXT *ctx = &pc_tree->none; - int i; - const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); - BLOCK_SIZE subsize; - RD_COST this_rdc, sum_rdc, best_rdc; - int do_split = bsize >= BLOCK_8X8; - int do_rect = 1; - INTERP_FILTER pred_interp_filter; - - // Override skipping rectangular partition operations for edge blocks - const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); - const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); - const int xss = x->e_mbd.plane[1].subsampling_x; - const int yss = x->e_mbd.plane[1].subsampling_y; + switch (bsize) { + case BLOCK_64X64: + linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx]; + break; + case BLOCK_32X32: + linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx]; + break; + case BLOCK_16X16: + linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx]; + break; + case BLOCK_8X8: + linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx]; + break; + default: assert(0 && "Unexpected block size."); return 0; + } + if (!linear_weights) return 0; - BLOCK_SIZE min_size = x->min_partition_size; - BLOCK_SIZE max_size = x->max_partition_size; + { // Generate feature values. 
+#if CONFIG_VP9_HIGHBITDEPTH + const int ac_q = + vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + int feature_index = 0; + unsigned int var, sse; + float rate_f, dist_f; -#if CONFIG_FP_MB_STATS - unsigned int src_diff_var = UINT_MAX; - int none_complexity = 0; +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd); + } else { + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); #endif + var = var >> num_pels_log2; - int partition_none_allowed = !force_horz_split && !force_vert_split; - int partition_horz_allowed = - !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; - int partition_vert_allowed = - !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; - - int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; - int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; + vpx_clear_system_state(); - (void)*tp_orig; + rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX); + dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2); + rate_f = + ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; - assert(num_8x8_blocks_wide_lookup[bsize] == - num_8x8_blocks_high_lookup[bsize]); + features[feature_index++] = rate_f; + features[feature_index++] = dist_f; + features[feature_index++] = (float)var; + features[feature_index++] = (float)ac_q; + assert(feature_index == FEATURES); + } - // Adjust dist breakout threshold according to the partition size. - dist_breakout_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - rate_breakout_thr *= num_pels_log2_lookup[bsize]; + { // Calculate the output score. 
+ int i; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + } - vp9_rd_cost_init(&this_rdc); - vp9_rd_cost_init(&sum_rdc); - vp9_rd_cost_reset(&best_rdc); - best_rdc.rdcost = best_rd; + return linear_score >= cpi->sf.rd_ml_partition.search_breakout_thresh[q_ctx]; +} +#undef FEATURES + +#define FEATURES 8 +#define LABELS 4 +static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, + const PC_TREE *const pc_tree, + int *allow_horz, int *allow_vert, + int64_t ref_rd) { + const NN_CONFIG *nn_config = NULL; + float score[LABELS] = { + 0.0f, + }; + int thresh = -1; + int i; + (void)x; - set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + if (ref_rd <= 0 || ref_rd > 1000000000) return; - if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ && - cpi->oxcf.aq_mode != LOOKAHEAD_AQ) - x->mb_energy = vp9_block_energy(cpi, x, bsize); + switch (bsize) { + case BLOCK_8X8: break; + case BLOCK_16X16: + nn_config = &vp9_rect_part_nnconfig_16; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[1]; + break; + case BLOCK_32X32: + nn_config = &vp9_rect_part_nnconfig_32; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[2]; + break; + case BLOCK_64X64: + nn_config = &vp9_rect_part_nnconfig_64; + thresh = cpi->sf.rd_ml_partition.prune_rect_thresh[3]; + break; + default: assert(0 && "Unexpected block size."); return; + } + if (!nn_config || thresh < 0) return; - if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { - int cb_partition_search_ctrl = - ((pc_tree->index == 0 || pc_tree->index == 3) + - get_chessboard_index(cm->current_video_frame)) & - 0x1; + // Feature extraction and model score calculation. + { + const VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + int feature_index = 0; + float features[FEATURES]; + + features[feature_index++] = logf((float)dc_q + 1.0f); + features[feature_index++] = + (float)(pc_tree->partitioning == PARTITION_NONE); + features[feature_index++] = logf((float)ref_rd / bs / bs + 1.0f); + + { + const float norm_factor = 1.0f / ((float)ref_rd + 1.0f); + const int64_t none_rdcost = pc_tree->none.rdcost; + float rd_ratio = 2.0f; + if (none_rdcost > 0 && none_rdcost < 1000000000) + rd_ratio = (float)none_rdcost * norm_factor; + features[feature_index++] = VPXMIN(rd_ratio, 2.0f); + + for (i = 0; i < 4; ++i) { + const int64_t this_rd = pc_tree->split[i]->none.rdcost; + const int rd_valid = this_rd > 0 && this_rd < 1000000000; + // Ratio between sub-block RD and whole block RD. + features[feature_index++] = + rd_valid ? (float)this_rd * norm_factor : 1.0f; + } + } - if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) - set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); + assert(feature_index == FEATURES); + nn_predict(features, nn_config, score); } - // Determine partition types in search according to the speed features. - // The threshold set here has to be of square block size. 
- if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= max_size && bsize >= min_size); - partition_horz_allowed &= - ((bsize <= max_size && bsize > min_size) || force_horz_split); - partition_vert_allowed &= - ((bsize <= max_size && bsize > min_size) || force_vert_split); - do_split &= bsize > min_size; + // Make decisions based on the model score. + { + int max_score = -1000; + int horz = 0, vert = 0; + int int_score[LABELS]; + for (i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = VPXMAX(int_score[i], max_score); + } + thresh = max_score - thresh; + for (i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) horz = 1; + if ((i >> 1) & 1) vert = 1; + } + } + *allow_horz = *allow_horz && horz; + *allow_vert = *allow_vert && vert; + } +} +#undef FEATURES +#undef LABELS + +// Perform fast and coarse motion search for the given block. This is a +// pre-processing step for the ML based partition search speedup. +static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, int mi_row, int mi_col, + MV ref_mv, MV_REFERENCE_FRAME ref, + uint8_t *const pred_buf) { + const VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref); + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; + MV best_mv = { 0, 0 }; + int cost_list[5]; + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ref - 1].sf); + mi->ref_frame[0] = ref; + mi->ref_frame[1] = NONE; + mi->sb_type = bsize; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, + sadpb, cond_cost_list(cpi, cost_list), &ref_mv, + &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); +} + +// Use a neural net model to prune partition-none and partition-split search. +// Features used: QP; spatial block size contexts; variance of prediction +// residue after simple_motion_search. +#define FEATURES 12 +static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + const VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; +#if CONFIG_VP9_HIGHBITDEPTH + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); + uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? (CONVERT_TO_BYTEPTR(pred_buffer)) + : pred_buffer; +#else + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]); + uint8_t *const pred_buf = pred_buffer; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int speed = cpi->oxcf.speed; + float thresh = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_part_split_nnconfig_64; + thresh = speed > 0 ? 2.8f : 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_part_split_nnconfig_32; + thresh = speed > 0 ? 
3.5f : 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_part_split_nnconfig_16; + thresh = speed > 0 ? 3.8f : 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_part_split_nnconfig_8; + if (cm->width >= 720 && cm->height >= 720) + thresh = speed > 0 ? 2.5f : 2.0f; + else + thresh = speed > 0 ? 3.8f : 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + MV ref_mv; + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? ALTREF_FRAME : LAST_FRAME; + // If bsize is 64x64, use zero MV as reference; otherwise, use MV result + // of previous(larger) block as reference. + if (bsize == BLOCK_64X64) + ref_mv.row = ref_mv.col = 0; + else + ref_mv = pc_tree->mv; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + simple_motion_search(cpi, x, bsize, mi_row, mi_col, ref_mv, ref, pred_buf); + pc_tree->mv = x->e_mbd.mi[0]->mv[0].as_mv; + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)dc_q + 1.0f); + + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + const MACROBLOCKD *const xd = &x->e_mbd; + const int has_above = !!xd->above_mi; + const int has_left = !!xd->left_mi; + const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; + const BLOCK_SIZE left_bsize = has_left ? xd->left_mi->sb_type : bsize; + int i; + + features[feature_idx++] = (float)has_above; + features[feature_idx++] = (float)b_width_log2_lookup[above_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[above_bsize]; + features[feature_idx++] = (float)has_left; + features[feature_idx++] = (float)b_width_log2_lookup[left_bsize]; + features[feature_idx++] = (float)b_height_log2_lookup[left_bsize]; + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. 
So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. + if (score > thresh) *none = 0; + if (score < -thresh) *split = 0; + } +} +#undef FEATURES +#endif // !CONFIG_REALTIME_ONLY + +static double log_wiener_var(int64_t wiener_variance) { + return log(1.0 + wiener_variance) / log(2.0); +} + +static void build_kmeans_segmentation(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + BLOCK_SIZE bsize = BLOCK_64X64; + KMEANS_DATA *kmeans_data; + + vp9_disable_segmentation(&cm->seg); + if (cm->show_frame) { + int mi_row, mi_col; + cpi->kmeans_data_size = 0; + cpi->kmeans_ctr_num = 8; + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = VPXMIN( + (mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = VPXMIN( + (mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col; + int64_t wiener_variance = 0; + + for (row = mb_row_start; row < mb_row_end; ++row) + for (col = mb_col_start; col < mb_col_end; ++col) + wiener_variance += cpi->mb_wiener_variance[row * cm->mb_cols + col]; + + wiener_variance /= + (mb_row_end - mb_row_start) * (mb_col_end - mb_col_start); + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + + kmeans_data = &cpi->kmeans_data_arr[cpi->kmeans_data_size++]; + kmeans_data->value = log_wiener_var(wiener_variance); + kmeans_data->pos = mi_row * cpi->kmeans_data_stride + mi_col; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(&cpi->kmeans_mutex); +#endif // CONFIG_MULTITHREAD + } + } + + vp9_kmeans(cpi->kmeans_ctr_ls, cpi->kmeans_boundary_ls, + cpi->kmeans_count_ls, cpi->kmeans_ctr_num, cpi->kmeans_data_arr, + cpi->kmeans_data_size); + + vp9_perceptual_aq_mode_setup(cpi, &cm->seg); + } +} + +#if !CONFIG_REALTIME_ONLY +static int wiener_var_segment(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *cm = &cpi->common; + int mb_row_start = mi_row >> 1; + int mb_col_start = mi_col >> 1; + int mb_row_end = + VPXMIN((mi_row + num_8x8_blocks_high_lookup[bsize]) >> 1, cm->mb_rows); + int mb_col_end = + VPXMIN((mi_col + num_8x8_blocks_wide_lookup[bsize]) >> 1, cm->mb_cols); + int row, col, idx; + int64_t wiener_variance = 0; + int segment_id; + int8_t seg_hist[MAX_SEGMENTS] = { 0 }; + int8_t max_count = 0, max_index = -1; + + vpx_clear_system_state(); + + assert(cpi->norm_wiener_variance > 0); + + for (row = mb_row_start; row < mb_row_end; ++row) { + for (col = mb_col_start; col < mb_col_end; ++col) { + wiener_variance = cpi->mb_wiener_variance[row * cm->mb_cols + col]; + segment_id = + vp9_get_group_idx(log_wiener_var(wiener_variance), + cpi->kmeans_boundary_ls, cpi->kmeans_ctr_num); + ++seg_hist[segment_id]; + } + } + + for (idx = 0; idx < cpi->kmeans_ctr_num; ++idx) { + if (seg_hist[idx] > max_count) { + max_count = seg_hist[idx]; + max_index = idx; + } + } + + assert(max_index >= 0); + segment_id = max_index; + + return segment_id; +} + +static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + const int gf_group_index = cpi->twopass.gf_group.index; + TplDepFrame *tpl_frame = &cpi->tpl_stats[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + int mi_wide = 
num_8x8_blocks_wide_lookup[bsize]; + int mi_high = num_8x8_blocks_high_lookup[bsize]; + int row, col; + + int dr = 0; + int count = 0; + double r0, rk, beta; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + + if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult; + + if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult; + + for (row = mi_row; row < mi_row + mi_high; ++row) { + for (col = mi_col; col < mi_col + mi_wide; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + + if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue; + + intra_cost += this_stats->intra_cost; + mc_dep_cost += this_stats->mc_dep_cost; + + ++count; + } + } + + vpx_clear_system_state(); + + r0 = cpi->rd.r0; + rk = (double)intra_cost / mc_dep_cost; + beta = r0 / rk; + dr = vp9_get_adaptive_rdmult(cpi, beta); + + dr = VPXMIN(dr, orig_rdmult * 3 / 2); + dr = VPXMAX(dr, orig_rdmult * 1 / 2); + + dr = VPXMAX(1, dr); + + return dr; +} +#endif // !CONFIG_REALTIME_ONLY + +#if !CONFIG_REALTIME_ONLY +// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are +// unlikely to be selected depending on previous rate-distortion optimization +// results, for encoding speed-up. +static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, + TileDataEnc *tile_data, TOKENEXTRA **tp, + int mi_row, int mi_col, BLOCK_SIZE bsize, + RD_COST *rd_cost, RD_COST best_rdc, + PC_TREE *pc_tree) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TileInfo *const tile_info = &tile_data->tile_info; + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + TOKENEXTRA *tp_orig = *tp; + PICK_MODE_CONTEXT *const ctx = &pc_tree->none; + int i; + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + BLOCK_SIZE subsize; + RD_COST this_rdc, sum_rdc; + int do_split = bsize >= BLOCK_8X8; + int do_rect = 1; + INTERP_FILTER pred_interp_filter; + + // Override skipping rectangular partition operations for edge blocks + const int force_horz_split = (mi_row + mi_step >= cm->mi_rows); + const int force_vert_split = (mi_col + mi_step >= cm->mi_cols); + const int xss = x->e_mbd.plane[1].subsampling_x; + const int yss = x->e_mbd.plane[1].subsampling_y; + + BLOCK_SIZE min_size = x->min_partition_size; + BLOCK_SIZE max_size = x->max_partition_size; + +#if CONFIG_FP_MB_STATS + unsigned int src_diff_var = UINT_MAX; + int none_complexity = 0; +#endif + + int partition_none_allowed = !force_horz_split && !force_vert_split; + int partition_horz_allowed = + !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; + int partition_vert_allowed = + !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; + + int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; + int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; + int must_split = 0; + int should_encode_sb = 0; + + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. 
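/* Editorial sketch (not part of the patch): how ref_frames_used, declared
 * just below, feeds the rectangular-partition pruning later in this
 * function. Each winning square-partition search ORs its one or two
 * reference frames into ref_frames_used[i] as bit flags; the rectangular
 * half covering quarters a and b then masks off every reference those
 * quarters never picked:
 *   skip_ref_frame_mask = ~(ref_frames_used[a] | ref_frames_used[b]); */
static unsigned char skip_ref_mask_sketch(unsigned char used_a,
                                          unsigned char used_b) {
  const unsigned char used = (unsigned char)(used_a | used_b);
  return used ? (unsigned char)~used : 0; /* 0 = prune nothing */
}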
+ uint8_t ref_frames_used[4] = { 0, 0, 0, 0 }; + + int partition_mul = x->cb_rdmult; + + (void)*tp_orig; + + assert(num_8x8_blocks_wide_lookup[bsize] == + num_8x8_blocks_high_lookup[bsize]); + + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + + vp9_rd_cost_init(&this_rdc); + vp9_rd_cost_init(&sum_rdc); + + set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + if (oxcf->tuning == VP8_TUNE_SSIM) { + set_ssim_rdmult(cpi, x, bsize, mi_row, mi_col, &partition_mul); + } + vp9_rd_cost_update(partition_mul, x->rddiv, &best_rdc); + + if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode != NO_AQ && + cpi->oxcf.aq_mode != LOOKAHEAD_AQ) + x->mb_energy = vp9_block_energy(cpi, x, bsize); + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = + ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); + } + + // Get sub block energy range + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + must_split = (min_energy < -3) && (max_energy - min_energy > 2); + } + + // Determine partition types in search according to the speed features. + // The threshold set here has to be of square block size. + if (cpi->sf.auto_min_max_partition_size) { + partition_none_allowed &= (bsize <= max_size); + partition_horz_allowed &= + ((bsize <= max_size && bsize > min_size) || force_horz_split); + partition_vert_allowed &= + ((bsize <= max_size && bsize > min_size) || force_vert_split); + do_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only && - bsize > cpi->sf.use_square_only_threshold) { + (bsize > cpi->sf.use_square_only_thresh_high || + bsize < cpi->sf.use_square_only_thresh_low)) { if (cpi->use_svc) { if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) partition_horz_allowed &= force_horz_split; @@ -3250,45 +3989,81 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } #endif + pc_tree->partitioning = PARTITION_NONE; + + if (cpi->sf.rd_ml_partition.var_pruning && !frame_is_intra_only(cm)) { + const int do_rd_ml_partition_var_pruning = + partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; + if (do_rd_ml_partition_var_pruning) { + ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + } else { + vp9_zero(pc_tree->mv); + } + if (bsize > BLOCK_8X8) { // Store MV result as reference for subblocks. 
+ for (i = 0; i < 4; ++i) pc_tree->split[i]->mv = pc_tree->mv; + } + } + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, - best_rdc.rdcost); + best_rdc.rate, best_rdc.dist); + ctx->rdcost = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = ctx->mic.ref_frame[0]; + const int ref2 = ctx->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } if (bsize >= BLOCK_8X8) { this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; - this_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); } if (this_rdc.rdcost < best_rdc.rdcost) { MODE_INFO *mi = xd->mi[0]; best_rdc = this_rdc; + should_encode_sb = 1; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!cpi->sf.ml_partition_search_early_termination) { - // If all y, u, v transform blocks in this partition are skippable, - // and the dist & rate are within the thresholds, the partition search - // is terminated for current branch of the partition search tree. - if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; - } - } else { + if (cpi->sf.rd_ml_partition.search_early_termination) { // Currently, the machine-learning based partition search early // termination is only used while bsize is 16x16, 32x32 or 64x64, // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. if (!x->e_mbd.lossless && !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { - if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) { + if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) { + do_split = 0; + do_rect = 0; + } + } + } + + if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) { + const int use_ml_based_breakout = + cpi->sf.rd_ml_partition.search_breakout && cm->base_qindex >= 100; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) { do_split = 0; do_rect = 0; } + } else { + if (!cpi->sf.rd_ml_partition.search_early_termination) { + if ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_split = 0; + do_rect = 0; + } + } } } @@ -3341,10 +4116,13 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; } // store estimated motion vector - if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx); + store_pred_mv(x, ctx); // If the interp_filter is marked as SWITCHABLE_FILTERS, it was for an // intra block and used for context purposes. @@ -3357,113 +4135,184 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_SPLIT // TODO(jingning): use the motion vectors given by the above search as // the starting point of motion search in the following partition type check. 
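/* Editorial sketch (not part of the patch): in the split loop added below,
 * each sub-block search receives the budget still unspent from the best
 * cost found so far, carried as separate rate and distortion budgets rather
 * than a single rdcost value (see best_rdc_split in the hunk). A sub-search
 * that cannot fit inside this remainder is abandoned early. */
static void remaining_budget_sketch(const RD_COST *best, const RD_COST *sum,
                                    RD_COST *remaining) {
  remaining->rate = best->rate - sum->rate; /* rate budget left */
  remaining->dist = best->dist - sum->dist; /* distortion budget left */
}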
- if (do_split) { + pc_tree->split[0]->none.rdcost = 0; + pc_tree->split[1]->none.rdcost = 0; + pc_tree->split[2]->none.rdcost = 0; + pc_tree->split[3]->none.rdcost = 0; + if (do_split || must_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); + load_pred_mv(x, ctx); if (bsize == BLOCK_8X8) { i = 4; if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - pc_tree->leaf_split[0], best_rdc.rdcost); - - if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX; + pc_tree->leaf_split[0], best_rdc.rate, best_rdc.dist); + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + } } else { - for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split); + ++i) { const int x_idx = (i & 1) * mi_step; const int y_idx = (i >> 1) * mi_step; + int found_best_rd = 0; + RD_COST best_rdc_split; + vp9_rd_cost_reset(&best_rdc_split); + + if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX) { + // A must split test here increases the number of sub + // partitions but hurts metrics results quite a bit, + // so this extra test is commented out pending + // further tests on whether it adds much in terms of + // visual quality. + // (must_split) ? best_rdc.rate + // : best_rdc.rate - sum_rdc.rate, + // (must_split) ? best_rdc.dist + // : best_rdc.dist - sum_rdc.dist, + best_rdc_split.rate = best_rdc.rate - sum_rdc.rate; + best_rdc_split.dist = best_rdc.dist - sum_rdc.dist; + } if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) continue; - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); - pc_tree->split[i]->index = i; - rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, - mi_col + x_idx, subsize, &this_rdc, - best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->split[i]->none.rate = INT_MAX; + found_best_rd = rd_pick_partition( + cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &this_rdc, best_rdc_split, pc_tree->split[i]); - if (this_rdc.rate == INT_MAX) { + if (found_best_rd == 0) { sum_rdc.rdcost = INT64_MAX; break; } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } } - if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) { + if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) { sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); - if (sum_rdc.rdcost < best_rdc.rdcost) { + if ((sum_rdc.rdcost < best_rdc.rdcost) || + (must_split && (sum_rdc.dist < best_rdc.dist))) { best_rdc = sum_rdc; + 
should_encode_sb = 1; pc_tree->partitioning = PARTITION_SPLIT; // Rate and distortion based partition search termination clause. - if (!cpi->sf.ml_partition_search_early_termination && - !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { + if (!cpi->sf.rd_ml_partition.search_early_termination && + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { do_rect = 0; } } } else { // skip rectangular partition test when larger block size // gives better rd cost - if ((cpi->sf.less_rectangular_check) && - ((bsize > cpi->sf.use_square_only_threshold) || - (best_rdc.dist < dist_breakout_thr))) + if (cpi->sf.less_rectangular_check && + (bsize > cpi->sf.use_square_only_thresh_high || + best_rdc.dist < dist_breakout_thr)) do_rect &= !partition_none_allowed; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + uint8_t used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; + } + + { + const int do_ml_rect_partition_pruning = + !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split && + (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8; + if (do_ml_rect_partition_pruning) { + ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed, + &partition_vert_allowed, best_rdc.rdcost); + } + } + // PARTITION_HORZ if (partition_horz_allowed && (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ]; subsize = get_subsize(bsize, PARTITION_HORZ); - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->horizontal[0], best_rdc.rdcost); + &pc_tree->horizontal[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc, subsize, 
&pc_tree->horizontal[1], - best_rdc.rdcost - sum_rdc.rdcost); + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_HORZ; + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_HORZ; - if ((cpi->sf.less_rectangular_check) && - (bsize > cpi->sf.use_square_only_threshold)) - do_rect = 0; - } + if (cpi->sf.less_rectangular_check && + bsize > cpi->sf.use_square_only_thresh_high) + do_rect = 0; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } @@ -3471,56 +4320,52 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT]; subsize = get_subsize(bsize, PARTITION_VERT); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->vertical[0], best_rdc.rdcost); + &pc_tree->vertical[0], best_rdc.rate - part_mode_rate, + best_rdc.dist); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rate += part_mode_rate; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); + } + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) { update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, &pc_tree->vertical[0]); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc, subsize, &pc_tree->vertical[1], - best_rdc.rdcost - sum_rdc.rdcost); + best_rdc.rate - sum_rdc.rate, + best_rdc.dist - sum_rdc.dist); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; } else { sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; - sum_rdc.rdcost += this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &sum_rdc); } } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_VERT; - } + best_rdc = sum_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_VERT; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } - // TODO(jbb): This code added so that we avoid static analysis - // warning related to the fact that best_rd isn't used after this - // point. This code should be refactored so that the duplicate - // checks occur in some sub function and thus are used... 
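A detail of the split loop above that is easy to miss: rd_pick_partition() now hands each sub-block an explicit rate/distortion budget instead of a single scalar rdcost threshold, and that budget is the leftover after earlier sub-blocks have spent theirs. A minimal sketch of the bookkeeping, with illustrative names rather than the vp9 structs:

#include <limits.h>
#include <stdint.h>

typedef struct { int rate; int64_t dist; } rd_budget_t;

/* Remaining rate/distortion budget for the next sub-block: whatever the
 * best whole-block candidate allows, minus what earlier sub-blocks have
 * already spent. Mirrors best_rdc_split.rate = best_rdc.rate -
 * sum_rdc.rate in the hunk above; names here are not vp9 API. */
static rd_budget_t remaining_budget(rd_budget_t best, rd_budget_t spent) {
  rd_budget_t left = { INT_MAX, INT64_MAX }; /* "unbounded" sentinel */
  if (best.rate < INT_MAX && best.dist < INT64_MAX) {
    left.rate = best.rate - spent.rate;
    left.dist = best.dist - spent.dist;
  }
  return left;
}

Passing the slack down, rather than the whole threshold, lets a sub-search bail out as soon as it alone would blow the remaining budget.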
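Likewise, the running `sum_rdc.rdcost += this_rdc.rdcost` accumulation is replaced throughout by vp9_rd_cost_update(), which recomputes the joint cost from the accumulated rate and distortion. A sketch of the Lagrangian it maintains; the fixed-point shifts below are assumptions patterned on vp9's RDCOST() macro, not its exact definition:

#include <stdint.h>

typedef struct { int rate; int64_t dist; int64_t rdcost; } rd_cost_sketch_t;

/* J = lambda * rate + dist: rate is weighted by the multiplier (lambda
 * in fixed point), distortion is scaled by 2^rddiv. Recomputing J from
 * the running totals avoids compounding per-sub-block rounding error. */
static void rd_cost_update_sketch(int rdmult, int rddiv,
                                  rd_cost_sketch_t *rd) {
  rd->rdcost = (((int64_t)rd->rate * rdmult) >> 9) + (rd->dist << rddiv);
}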
- (void)best_rd; *rd_cost = best_rdc; - if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX && - pc_tree->index != 3) { + if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); @@ -3533,6 +4378,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } else { assert(tp_orig == *tp); } + + return should_encode_sb; } static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, @@ -3564,10 +4411,12 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, RD_COST dummy_rdc; int i; int seg_skip = 0; + int orig_rdmult = cpi->rd.RDMULT; const int idx_str = cm->mi_stride * mi_row + mi_col; MODE_INFO **mi = cm->mi_grid_visible + idx_str; + vp9_rd_cost_reset(&dummy_rdc); (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); @@ -3582,7 +4431,10 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } } - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } td->pc_root->index = 0; if (seg->enabled) { @@ -3593,6 +4445,9 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } x->source_variance = UINT_MAX; + + x->cb_rdmult = orig_rdmult; + if (sf->partition_search_type == FIXED_PARTITION || seg_skip) { const BLOCK_SIZE bsize = seg_skip ? BLOCK_64X64 : sf->always_this_block_size; @@ -3613,19 +4468,33 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); } else { + if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) { + int dr = + get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult); + x->cb_rdmult = dr; + } + + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ && cm->show_frame) { + x->segment_id = wiener_var_segment(cpi, BLOCK_64X64, mi_row, mi_col); + x->cb_rdmult = vp9_compute_rd_mult( + cpi, vp9_get_qindex(&cm->seg, x->segment_id, cm->base_qindex)); + } + // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } + td->pc_root->none.rdcost = 0; rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rdc, INT64_MAX, td->pc_root); + &dummy_rdc, dummy_rdc, td->pc_root); } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); } } +#endif // !CONFIG_REALTIME_ONLY static void init_encode_frame_mb_context(VP9_COMP *cpi) { MACROBLOCK *const x = &cpi->td.mb; @@ -3703,6 +4572,36 @@ static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); } +static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF) + vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); + else if (bsize >= BLOCK_8X8) + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + else + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + 
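Note the change to pred_mv initialization in the row encoder above: vp9_zero() used to leave a plausible-looking (0, 0) vector, while INT16_MAX marks the entry as "not set yet", so a stale prediction is never mistaken for a real one. A self-contained sketch of the sentinel pattern (type and helper names are illustrative):

#include <limits.h>
#include <stdint.h>

typedef struct { int16_t row, col; } mv_t;
#define NUM_REF_FRAMES 4

static void invalidate_pred_mvs(mv_t pred_mv[NUM_REF_FRAMES]) {
  int i;
  for (i = 0; i < NUM_REF_FRAMES; ++i) {
    pred_mv[i].row = INT16_MAX; /* sentinel: no prediction stored yet */
    pred_mv[i].col = INT16_MAX;
  }
}

/* A load_pred_mv()-style consumer checks the sentinel before trusting
 * the vector, instead of treating a zeroed entry as a real (0,0) MV. */
static int pred_mv_valid(const mv_t *mv) {
  return mv->row != INT16_MAX && mv->col != INT16_MAX;
}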
+static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3718,6 +4617,9 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int plane; set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + mi = xd->mi[0]; mi->sb_type = bsize; @@ -3733,14 +4635,23 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, if (cyclic_refresh_segment_id_boosted(mi->segment_id)) x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) + hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); - else if (bsize >= BLOCK_8X8) - vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); - else + else if (bsize >= BLOCK_8X8) { + if (cpi->rc.hybrid_intra_scene_change) + hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); + else + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + } else { vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); @@ -3830,6 +4741,76 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { } } +#define FEATURES 6 +#define LABELS 2 +static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + + switch (bsize) { + case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break; + case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break; + case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break; + case BLOCK_8X8: break; + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + vpx_clear_system_state(); + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 8 * (mi_row & 7); + const int sb_offset_col = 8 * (mi_col & 7); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. 
+ const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + + assert(feature_idx == FEATURES); + nn_predict(features, nn_config, score); + if (score[0] > thresh) return PARTITION_SPLIT; + if (score[0] < -thresh) return PARTITION_NONE; + return -1; + } +} +#undef FEATURES +#undef LABELS + static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -3859,6 +4840,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; + const int use_ml_based_partitioning = + sf->partition_search_type == ML_BASED_PARTITION; + (void)*tp_orig; // Avoid checking for rectangular partitions for speed >= 6. @@ -3889,6 +4873,18 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, partition_vert_allowed &= force_vert_split; } + if (use_ml_based_partitioning) { + if (partition_none_allowed || do_split) do_rect = 0; + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } + } + + if (!partition_none_allowed && !do_split) do_rect = 1; + ctx->pred_pixel_ready = !(partition_vert_allowed || partition_horz_allowed || do_split); @@ -3902,26 +4898,25 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, ctx->skip = x->skip; if (this_rdc.rate != INT_MAX) { - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; - int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; - - dist_breakout_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - rate_breakout_thr *= num_pels_log2_lookup[bsize]; - best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && - this_rdc.dist < dist_breakout_thr) { - do_split = 0; - do_rect = 0; + if (!use_ml_based_partitioning) { + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { + do_split = 0; + do_rect = 0; + } 
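ml_predict_var_paritioning() above reduces a neural-net score to a three-way answer: confidently split, confidently keep the block whole, or undecided (-1), in which case the normal search runs with both options open. The dead band is wider at speed <= 5 (thresh = 1.25), where a wrong forced decision costs more quality. A sketch of just that decision step, with illustrative names:

typedef enum { PART_KEEP, PART_SPLIT, PART_UNDECIDED } part_hint_t;

static part_hint_t partition_hint(float score, float thresh) {
  if (score > thresh) return PART_SPLIT;  /* force the 4-way split */
  if (score < -thresh) return PART_KEEP;  /* force PARTITION_NONE */
  return PART_UNDECIDED;                  /* let the RD search decide */
}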
} } } @@ -3969,7 +4964,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->horizontal[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->horizontal[0]); @@ -4013,7 +5008,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->vertical[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->vertical[0]); @@ -4173,7 +5168,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, @@ -4203,7 +5199,6 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, rd_cost->dist += this_rdc.dist; } break; - default: assert(0 && "Invalid partition type."); break; } } @@ -4292,7 +5287,8 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, output_enabled, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, @@ -4313,13 +5309,110 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, dummy_cost, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock. +static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + MACROBLOCKD *xd = &x->e_mbd; + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad = UINT_MAX; + + assert(yv12 != NULL); + + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. 
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NONE; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} + static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { @@ -4350,6 +5443,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; BLOCK_SIZE bsize = BLOCK_64X64; int seg_skip = 0; + int i; (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); @@ -4359,7 +5453,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } x->source_variance = UINT_MAX; - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } vp9_rd_cost_init(&dummy_rdc); x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; @@ -4367,6 +5464,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->skip_low_source_sad = 0; x->lowvar_highsumdiff = 0; x->content_state_sb = 0; + x->zero_temp_sad_source = 0; x->sb_use_mv_part = 0; x->sb_mvcol_part = 0; x->sb_mvrow_part = 0; @@ -4406,6 +5504,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; + case ML_BASED_PARTITION: + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + 
x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + x->sb_pickmode_part = 1; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); + break; case SOURCE_VAR_BASED_PARTITION: set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col); nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, @@ -4417,14 +5524,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; - case REFERENCE_PARTITION: + default: + assert(partition_search_type == REFERENCE_PARTITION); x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) @@ -4440,7 +5548,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, // TODO(marpan): Seems like nonrd_select_partition does not support // 4x4 partition. Since 4x4 is used on key frame, use this switch // for now. - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); else @@ -4449,7 +5557,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } break; - default: assert(0); break; } // Update ref_frame usage for inter frame if this group is ARF group.
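One choice inside get_estimated_pred() above deserves a note: when picking the reference for the superblock-level prediction, GOLDEN only wins if its SAD beats seven eighths of LAST's (with the short_circuit_low_temp_var speed feature on), biasing partition estimation toward the temporally closer reference. The comparison in isolation, with an illustrative name:

static int golden_beats_last(unsigned int y_sad_last,
                             unsigned int y_sad_golden,
                             int short_circuit_low_temp_var) {
  /* Mirrors y_sad_thr = (y_sad * 7) >> 3 in the hunk above: GOLDEN must
   * be clearly better, not merely equal, to displace LAST. */
  const unsigned int thr =
      short_circuit_low_temp_var ? (y_sad_last * 7) >> 3 : y_sad_last;
  return y_sad_golden < thr;
}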
@@ -4516,16 +5623,12 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" - " or VPX_BITS_12"); - return -1; } } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, @@ -4620,8 +5723,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); - CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * - sizeof(*cpi->tile_data))); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; for (tile_row = 0; tile_row < tile_rows; ++tile_row) @@ -4632,6 +5736,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; +#if CONFIG_CONSISTENT_RECODE + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; +#endif tile_data->mode_map[i][j] = j; } } @@ -4645,6 +5752,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; TileInfo *tile_info = &this_tile->tile_info; + if (cpi->sf.adaptive_rd_thresh_row_mt && + this_tile->row_base_thresh_freq_fact == NULL) + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; @@ -4675,8 +5785,10 @@ void vp9_encode_sb_row(VP9_COMP *cpi, ThreadData *td, int tile_row, if (cpi->sf.use_nonrd_pick_mode) encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); +#if !CONFIG_REALTIME_ONLY else encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); +#endif cpi->tplist[tile_row][tile_col][tile_sb_row].stop = tok; cpi->tplist[tile_row][tile_col][tile_sb_row].count = @@ -4729,16 +5841,117 @@ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats, } #endif +static int compare_kmeans_data(const void *a, const void *b) { + if (((const KMEANS_DATA *)a)->value > ((const KMEANS_DATA *)b)->value) { + return 1; + } else if (((const KMEANS_DATA *)a)->value < + ((const KMEANS_DATA *)b)->value) { + return -1; + } else { + return 0; + } +} + +static void compute_boundary_ls(const double *ctr_ls, int k, + double *boundary_ls) { + // boundary_ls[j] is the upper bound of data centered at ctr_ls[j] + int j; + for (j = 0; j < k - 1; ++j) { + boundary_ls[j] = (ctr_ls[j] + ctr_ls[j + 1]) / 2.; + } + boundary_ls[k - 1] = DBL_MAX; +} + +int vp9_get_group_idx(double value, double *boundary_ls, int k) { + int group_idx = 0; + while (value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + return group_idx; +} + +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + KMEANS_DATA *arr, int size) { + int i, j; + int itr; + int group_idx; + double sum[MAX_KMEANS_GROUPS]; + int count[MAX_KMEANS_GROUPS]; + + vpx_clear_system_state(); + + assert(k >= 2 && k <= MAX_KMEANS_GROUPS); + + qsort(arr, size, sizeof(*arr), compare_kmeans_data); + + // initialize the center points + for (j = 0; j < k; ++j) { + ctr_ls[j] 
= arr[(size * (2 * j + 1)) / (2 * k)].value; + } + + for (itr = 0; itr < 10; ++itr) { + compute_boundary_ls(ctr_ls, k, boundary_ls); + for (i = 0; i < MAX_KMEANS_GROUPS; ++i) { + sum[i] = 0; + count[i] = 0; + } + + // Both the data and centers are sorted in ascending order. + // As each data point is processed in order, its corresponding group index + // can only increase. So we only need to reset the group index to zero here. + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + // place samples into clusters + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + sum[group_idx] += arr[i].value; + ++count[group_idx]; + } + + for (group_idx = 0; group_idx < k; ++group_idx) { + if (count[group_idx] > 0) + ctr_ls[group_idx] = sum[group_idx] / count[group_idx]; + + sum[group_idx] = 0; + count[group_idx] = 0; + } + } + + // compute group_idx, boundary_ls and count_ls + for (j = 0; j < k; ++j) { + count_ls[j] = 0; + } + compute_boundary_ls(ctr_ls, k, boundary_ls); + group_idx = 0; + for (i = 0; i < size; ++i) { + while (arr[i].value >= boundary_ls[group_idx]) { + ++group_idx; + if (group_idx == k - 1) { + break; + } + } + arr[i].group_idx = group_idx; + ++count_ls[group_idx]; + } +} + static void encode_frame_internal(VP9_COMP *cpi) { SPEED_FEATURES *const sf = &cpi->sf; ThreadData *const td = &cpi->td; MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + const int gf_group_index = cpi->twopass.gf_group.index; xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; - vp9_zero(*td->counts); vp9_zero(cpi->td.rd_counts); @@ -4756,8 +5969,12 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; - +#if CONFIG_CONSISTENT_RECODE + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; +#endif if (xd->lossless) x->optimize = 0; + x->sharpness = cpi->oxcf.sharpness; + x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); cm->tx_mode = select_tx_mode(cpi, xd); @@ -4799,8 +6016,33 @@ static void encode_frame_internal(VP9_COMP *cpi) { if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) source_var_based_partition_search_method(cpi); + } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE && + cpi->sf.enable_tpl_model) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + if (tpl_frame->is_valid) + cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + // Frame segmentation + if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); + { struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); @@ -4881,9 +6123,52 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } +#if CONFIG_CONSISTENT_RECODE +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = + tile_data->thresh_freq_fact_prev[i][j]; + } + } + } + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} +#endif + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_CONSISTENT_RECODE + restore_encode_params(cpi); +#endif + +#if CONFIG_MISMATCH_DEBUG + mismatch_reset_frame(MAX_MB_PLANE); +#endif + // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this @@ -4891,16 +6176,11 @@ void vp9_encode_frame(VP9_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
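The comment above, and the hunk just below, encode the same rule: compound (two-reference) prediction needs the fixed reference to sit on the opposite temporal side of the two variable references, which shows up as a sign-bias mismatch. A minimal sketch of the inline check the hunk below factors out into vp9_compound_reference_allowed(), assuming indices 0..2 mean LAST, GOLDEN, ALTREF:

static int compound_refs_allowed(const int sign_bias[3]) {
  const int last = sign_bias[0];
  const int golden = sign_bias[1];
  const int altref = sign_bias[2];
  /* ALTREF must differ from both others; if all three share a sign bias,
   * every reference points the same way in time and there is no
   * fixed/variable split to build a compound prediction from. */
  return altref != last && altref != golden;
}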
if (!frame_is_intra_only(cm)) { - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { - cpi->allow_comp_inter_inter = 0; - } else { + if (vp9_compound_reference_allowed(cm)) { cpi->allow_comp_inter_inter = 1; - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; + vp9_setup_compound_reference_mode(cm); + } else { + cpi->allow_comp_inter_inter = 0; } } @@ -5064,7 +6344,8 @@ static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi, for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { int map_offset = block_index + y * cm->mi_cols + x; - if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { if (abs(mv.row) < 8 && abs(mv.col) < 8) { if (cpi->consec_zero_mv[map_offset] < 255) cpi->consec_zero_mv[map_offset]++; @@ -5131,7 +6412,27 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, VPXMAX(bsize, BLOCK_8X8)); - vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8)); +#if CONFIG_MISMATCH_DEBUG + if (output_enabled) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *pd = &xd->plane[plane]; + int pixel_c, pixel_r; + const BLOCK_SIZE plane_bsize = + get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), &xd->plane[plane]); + const int bw = get_block_width(plane_bsize); + const int bh = get_block_height(plane_bsize); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, + pd->subsampling_x, pd->subsampling_y); + + mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, plane, pixel_c, + pixel_r, bw, bh, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } + } +#endif + + vp9_encode_sb(x, VPXMAX(bsize, BLOCK_8X8), mi_row, mi_col, output_enabled); vp9_tokenize_sb(cpi, td, t, !output_enabled, seg_skip, VPXMAX(bsize, BLOCK_8X8)); } @@ -5159,7 +6460,11 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); - if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0) + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); } } diff --git a/libs/libvpx/vp9/encoder/vp9_encodeframe.h b/libs/libvpx/vp9/encoder/vp9_encodeframe.h index cf5ae3d8ac7..fd0a9c517ef 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libs/libvpx/vp9/encoder/vp9_encodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_ -#define VP9_ENCODER_VP9_ENCODEFRAME_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ #include "vpx/vpx_integer.h" @@ -45,8 +45,13 @@ void vp9_encode_sb_row(struct VP9_COMP *cpi, struct ThreadData *td, void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, int content_state); +struct KMEANS_DATA; +void vp9_kmeans(double *ctr_ls, double *boundary_ls, int *count_ls, int k, + struct KMEANS_DATA *arr, int size); +int vp9_get_group_idx(double value, double *boundary_ls, int k); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.c b/libs/libvpx/vp9/encoder/vp9_encodemb.c index f3c17f2559f..7630a811038 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libs/libvpx/vp9/encoder/vp9_encodemb.c @@ -16,6 +16,10 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#if CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + #include "vp9/common/vp9_idct.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" @@ -50,12 +54,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, + { 10, 6 }, + { 8, 5 }, }; // 'num' can be negative, but 'shift' must be non-negative. #define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ - ((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)) + (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift))) int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { @@ -76,13 +81,19 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; + const MODE_INFO *mbmi = xd->mi[0]; + const int sharpness = mb->sharpness; + const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]; const int64_t rdmult = - ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; + (sharpness == 0 ? rdadj >> 1 + : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4); + const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; int64_t rate0, rate1; int16_t t0, t1; int i, final_eob; + int count_high_values_after_eob = 0; #if CONFIG_VP9_HIGHBITDEPTH const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else @@ -200,9 +211,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int band_next = band_translate[i + 1]; const int token_next = (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; - unsigned int( - *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - token_costs + band_next; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; token_cache[rc] = vp9_pt_energy_class[t0]; ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x == 0); @@ -262,6 +273,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, assert(distortion0 <= distortion_for_zero); token_cache[rc] = vp9_pt_energy_class[t0]; } + if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++; assert(accu_error >= 0); x_prev = qcoeff[rc]; // Update based on selected quantized value. 
@@ -272,6 +284,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (best_eob_cost_cur < best_block_rd_cost) { best_block_rd_cost = best_eob_cost_cur; final_eob = i + 1; + count_high_values_after_eob = 0; if (use_x1) { before_best_eob_qc = x1; before_best_eob_dqc = dqc1; @@ -283,19 +296,31 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, } } } - assert(final_eob <= eob); - if (final_eob > 0) { - int rc; - assert(before_best_eob_qc != 0); - i = final_eob - 1; - rc = scan[i]; - qcoeff[rc] = before_best_eob_qc; - dqcoeff[rc] = before_best_eob_dqc; - } - for (i = final_eob; i < eob; i++) { - int rc = scan[i]; - qcoeff[rc] = 0; - dqcoeff[rc] = 0; + if (count_high_values_after_eob > 0) { + final_eob = eob - 1; + for (; final_eob >= 0; final_eob--) { + const int rc = scan[final_eob]; + const int x = qcoeff[rc]; + if (x) { + break; + } + } + final_eob++; + } else { + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } } mb->plane[plane].eobs[block] = final_eob; return final_eob; @@ -357,13 +382,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -383,17 +408,19 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->iscan); break; case TX_8X8: - vp9_fdct8x8_quant(src_diff, diff_stride, coeff, 64, x->skip_block, - p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_fdct8x8(src_diff, coeff, diff_stride); + vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, + scan_order->iscan); + break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } @@ -433,13 +460,13 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); } return; } @@ -461,12 +488,12 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); break; } } @@ -510,14 +537,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int 
col, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -543,19 +570,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } static void encode_block(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct encode_b_args *const args = arg; +#if CONFIG_MISMATCH_DEBUG + int mi_row = args->mi_row; + int mi_col = args->mi_col; + int output_enabled = args->output_enabled; +#endif MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *const p = &x->plane[plane]; @@ -572,7 +604,11 @@ static void encode_block(int plane, int block, int row, int col, if (x->zcoeff_blk[tx_size][block] && plane == 0) { p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } if (!x->skip_recode) { @@ -582,7 +618,11 @@ static void encode_block(int plane, int block, int row, int col, // skip forward transform p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } else { vp9_xform_quant_fp(x, plane, block, row, col, plane_bsize, tx_size); } @@ -599,7 +639,11 @@ static void encode_block(int plane, int block, int row, int col, // skip forward transform p->eobs[block] = 0; *a = *l = 0; +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } } else { vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); @@ -616,7 +660,13 @@ static void encode_block(int plane, int block, int row, int col, if (p->eobs[block]) *(args->skip) = 0; - if (x->skip_encode || p->eobs[block] == 0) return; + if (x->skip_encode || p->eobs[block] == 0) { +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else + return; +#endif + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); @@ -633,16 +683,20 @@ static void encode_block(int plane, int block, int row, int col, vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
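Back in vp9_optimize_b() above, the trellis rate multiplier is now scaled by the new x->sharpness control: sharpness 0 keeps the old rdadj >> 1, while higher sharpness shrinks lambda so the optimizer zeroes fewer coefficients. The added expression, restated on its own:

#include <stdint.h>

static int64_t trellis_rdmult(int64_t rdadj, int sharpness, int segment_id) {
  /* sharpness == 0 reproduces the previous rdadj >> 1 behavior (equal to
   * (rdadj * 8) >> 4 at segment 0); sharpness 1..7 lowers the rate
   * weight, biasing the trellis toward keeping coefficients. */
  return sharpness == 0 ? rdadj >> 1
                        : (rdadj * (8 - sharpness + segment_id)) >> 4;
}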
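A pattern worth calling out once, since it recurs through these hunks: the last real enum case becomes `default:` with an assert, instead of a dedicated case plus an unreachable `default: assert(0)`. Every control path still lands somewhere concrete, and debug builds still catch unexpected values. In sketch form (the enum here is illustrative, not the vp9 TX_SIZE type):

#include <assert.h>

typedef enum { SZ_4X4, SZ_8X8, SZ_16X16, SZ_32X32 } tx_size_e;

static const char *tx_name(tx_size_e tx_size) {
  switch (tx_size) {
    case SZ_32X32: return "32x32";
    case SZ_16X16: return "16x16";
    case SZ_8X8: return "8x8";
    default:
      /* The final legal value is handled here; the assert documents the
       * expectation without leaving a branch the compiler can flag. */
      assert(tx_size == SZ_4X4);
      return "4x4";
  }
}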
x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - default: assert(0 && "Invalid transform size"); } +#if CONFIG_MISMATCH_DEBUG + goto encode_block_end; +#else return; +#endif } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -656,14 +710,27 @@ static void encode_block(int plane, int block, int row, int col, case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - default: assert(0 && "Invalid transform size"); break; } +#if CONFIG_MISMATCH_DEBUG +encode_block_end: + if (output_enabled) { + int pixel_c, pixel_r; + int blk_w = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + int blk_h = 1 << (tx_size + TX_UNIT_SIZE_LOG2); + mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, col, row, + pd->subsampling_x, pd->subsampling_y); + mismatch_record_block_tx(dst, pd->dst.stride, plane, pixel_c, pixel_r, + blk_w, blk_h, + xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH); + } +#endif } static void encode_block_pass1(int plane, int block, int row, int col, @@ -697,12 +764,21 @@ void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) { encode_block_pass1, x); } -void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled) { MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; MODE_INFO *mi = xd->mi[0]; - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; int plane; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args arg = { x, 1, NULL, NULL, + &mi->skip, mi_row, mi_col, output_enabled }; +#else + struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; + (void)mi_row; + (void)mi_col; + (void)output_enabled; +#endif mi->skip = 1; @@ -847,7 +923,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, xd->bd); } break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); @@ -875,7 +952,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } break; - default: assert(0); return; } if (*eob) *(args->skip) = 0; return; @@ -929,7 +1005,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); @@ -954,7 +1031,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; - default: assert(0); break; } if (*eob) *(args->skip) = 0; } @@ -963,8 +1039,16 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, int enable_optimize_b) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; +#if CONFIG_MISMATCH_DEBUG + // TODO(angiebird): make mismatch_debug support intra mode + struct encode_b_args arg = { + x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0, + 0 + }; +#else struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip }; +#endif if (enable_optimize_b && x->optimize && (!x->skip_recode || 
!x->skip_optimize)) { diff --git a/libs/libvpx/vp9/encoder/vp9_encodemb.h b/libs/libvpx/vp9/encoder/vp9_encodemb.h index cf943bedfdf..1975ee73acd 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libs/libvpx/vp9/encoder/vp9_encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMB_H_ -#define VP9_ENCODER_VP9_ENCODEMB_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -24,10 +24,16 @@ struct encode_b_args { ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; +#if CONFIG_MISMATCH_DEBUG + int mi_row; + int mi_col; + int output_enabled; +#endif }; int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx); -void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize); +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + int output_enabled); void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize); void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size); @@ -48,4 +54,4 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMB_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encodemv.h b/libs/libvpx/vp9/encoder/vp9_encodemv.h index 9fc7ab8dc45..2f1be4b233f 100644 --- a/libs/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libs/libvpx/vp9/encoder/vp9_encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMV_H_ -#define VP9_ENCODER_VP9_ENCODEMV_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_ #include "vp9/encoder/vp9_encoder.h" @@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, int usehp); + const nmv_context *ctx, int usehp); void vp9_update_mv_count(ThreadData *td); @@ -35,4 +35,4 @@ void vp9_update_mv_count(ThreadData *td); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMV_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.c b/libs/libvpx/vp9/encoder/vp9_encoder.c index 2ae59dd9812..7f82a470b31 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.c +++ b/libs/libvpx/vp9/encoder/vp9_encoder.c @@ -8,9 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits.h> #include <math.h> #include <stdio.h> -#include <limits.h> +#include <stdlib.h> #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -25,31 +26,49 @@ #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_timer.h" +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_scan.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" #include "vp9/encoder/vp9_aq_360.h" #include "vp9/encoder/vp9_aq_complexity.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_bitstream.h" +#if CONFIG_INTERNAL_STATS +#include "vp9/encoder/vp9_blockiness.h" +#endif #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" -#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_mcomp.h" +#endif #include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" @@ -65,12 +84,12 @@ #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. +#define HIGH_PRECISION_MV_QTHRESH 200 #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -84,6 +103,9 @@ static FILE *yuv_skinmap_file = NULL; #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #endif +#ifdef OUTPUT_YUV_SVC_SRC +FILE *yuv_svc_src[3] = { NULL, NULL, NULL }; +#endif #if 0 FILE *framepsnr; @@ -102,6 +124,14 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif +#if CONFIG_VP9_HIGHBITDEPTH +void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif +void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); + +#if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; @@ -426,6 +456,7 @@ static int compute_context_model_diff(const VP9_COMMON *const cm) { return -diff; } +#endif // !CONFIG_REALTIME_ONLY // Test for whether to calculate metrics for the frame.
static int is_psnr_calc_enabled(VP9_COMP *cpi) { @@ -483,14 +514,10 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { *hr = 3; *hs = 5; break; - case ONETWO: - *hr = 1; - *hs = 2; - break; default: + assert(mode == ONETWO); *hr = 1; - *hs = 1; - assert(0); + *hs = 2; break; } } @@ -547,6 +574,74 @@ static void apply_active_map(VP9_COMP *cpi) { } } +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + + // TODO(jianj): Investigate why ROI is not working in speed < 5 or in + // non-realtime mode. + if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method. + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. + internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as a reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + valid_ref = 0; + } + // GOLDEN was updated in the previous encoded frame, so GOLDEN and LAST + // are the same reference. + if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + static void init_level_info(Vp9LevelInfo *level_info) { Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelSpec *const level_spec = &level_info->level_spec; @@ -557,6 +652,13 @@ static void init_level_info(Vp9LevelInfo *level_info) { level_spec->min_altref_distance = INT_MAX; } +static int check_seg_range(int seg_data[8], int range) { + return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || + abs(seg_data[2]) > range || abs(seg_data[3]) > range || + abs(seg_data[4]) > range || abs(seg_data[5]) > range || + abs(seg_data[6]) > range || abs(seg_data[7]) > range); +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { int i; const Vp9LevelSpec *this_level; @@ -583,6 +685,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ?
LEVEL_UNKNOWN : vp9_level_defs[i].level; } +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check number of rows and columns match + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return -1; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return -1; + + // Also disable segmentation if no deltas are specified. + if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return 0; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + + // Copy to ROI structure in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return 0; +} + int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { @@ -660,8 +817,17 @@ static void setup_frame(VP9_COMP *cpi) { if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame; } + // TODO(jingning): Overwrite the frame_context_idx index in multi-layer ARF + // case. Need some further investigation on if we could apply this to single + // layer ARF case as well. + if (cpi->multi_layer_arf && !cpi->use_svc) { + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cm->frame_context_idx = clamp(gf_group->layer_depth[gf_group->index] - 1, 0, + FRAME_CONTEXTS - 1); + } + if (cm->frame_type == KEY_FRAME) { - if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1; + cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; vp9_zero(cpi->interp_filter_selected); } else { @@ -713,12 +879,17 @@ static void vp9_enc_free_mi(VP9_COMMON *cm) { cm->mi_grid_base = NULL; vpx_free(cm->prev_mi_grid_base); cm->prev_mi_grid_base = NULL; + cm->mi_alloc_size = 0; } static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { // Current mip will be the prev_mip for the next frame. MODE_INFO **temp_base = cm->prev_mi_grid_base; MODE_INFO *temp = cm->prev_mip; + + // Skip update prev_mi frame in show_existing_frame mode.
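As a caller-side illustration of the ROI interface added above, here is a minimal hedged sketch; the demo_set_roi() helper and its values are hypothetical, and only the vp9_set_roi_map() signature, the range checks, and the fact that the map is copied into cpi->roi come from this patch (note also that apply_roi_map() above currently honors the map only in realtime mode at speed >= 5, per the TODO). It tags the top half of the frame as segment 1 with a quantizer delta of -10 and leaves every other segment feature unset:

/* Hypothetical usage sketch: segment 1 = top half of the frame, Q - 10.
 * rows/cols must match the encoder's mode-info grid or the call fails. */
static int demo_set_roi(VP9_COMP *cpi) {
  const unsigned int rows = (unsigned int)cpi->common.mi_rows;
  const unsigned int cols = (unsigned int)cpi->common.mi_cols;
  int delta_q[8] = { 0, -10, 0, 0, 0, 0, 0, 0 }; /* must stay within +/-63 */
  int delta_lf[8] = { 0 };                       /* must stay within +/-63 */
  int skip[8] = { 0 };                           /* 0 or 1 per segment */
  int ref_frame[8] = { -1, -1, -1, -1, -1, -1, -1, -1 }; /* -1 = unset */
  unsigned int r, c;
  int ret;
  unsigned char *map = (unsigned char *)vpx_calloc(rows * cols, 1);
  if (!map) return -1;
  for (r = 0; r < rows / 2; ++r)
    for (c = 0; c < cols; ++c) map[r * cols + c] = 1;
  ret = vp9_set_roi_map(cpi, map, rows, cols, delta_q, delta_lf, skip,
                        ref_frame);
  vpx_free(map); /* safe: vp9_set_roi_map() keeps its own copy */
  return ret;    /* 0 on success, -1 on dimension or range mismatch */
}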
+ if (cm->show_existing_frame) return; + cm->prev_mip = cm->mip; cm->mip = temp; @@ -817,9 +988,18 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + vpx_free(cpi->mi_ssim_rdmult_scaling_factors); + cpi->mi_ssim_rdmult_scaling_factors = NULL; + vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC vp9_free_postproc_buffers(cm); @@ -1121,8 +1301,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a - // target of 1/4x1/4. - if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) { + // target of 1/4x1/4. number_spatial_layers must be greater than 2. + if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, @@ -1213,15 +1394,9 @@ static void set_tile_limits(VP9_COMP *cpi) { int min_log2_tile_cols, max_log2_tile_cols; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - if (is_two_pass_svc(cpi) && (cpi->svc.encode_empty_frame_state == ENCODING || - cpi->svc.number_spatial_layers > 1)) { - cm->log2_tile_cols = 0; - cm->log2_tile_rows = 0; - } else { - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; - } + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; if (cpi->oxcf.target_level == LEVEL_AUTO) { const int level_tile_cols = @@ -1244,24 +1419,17 @@ static void update_frame_size(VP9_COMP *cpi) { cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); set_tile_limits(cpi); - - if (is_two_pass_svc(cpi)) { - if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate alt_ref_buffer"); - } } static void init_buffer_indices(VP9_COMP *cpi) { - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; + int ref_frame; + + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + cpi->ref_fb_idx[ref_frame] = ref_frame; + + cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1]; + cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; } static void init_level_constraint(LevelConstraint *lc) { @@ -1610,7 +1778,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4x4d_bits10) break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, @@ -1689,11 +1858,6 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits12) break; - - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); } } } @@ -1757,6 
+1921,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { int last_w = cpi->oxcf.width; int last_h = cpi->oxcf.height; + vp9_init_quantizer(cpi); if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; cm->color_space = oxcf->color_space; @@ -2017,10 +2182,13 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); - CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); +#if !CONFIG_REALTIME_ONLY CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); +#endif CHECK_MEM_ERROR( cm, cpi->consec_zero_mv, @@ -2062,8 +2230,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif cpi->refresh_alt_ref_frame = 0; - cpi->multi_arf_last_grp_enabled = 0; - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); @@ -2104,9 +2270,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, if (cpi->b_calculate_consistency) { CHECK_MEM_ERROR(cm, cpi->ssim_vars, - vpx_malloc(sizeof(*cpi->ssim_vars) * 4 * - cpi->common.mi_rows * cpi->common.mi_cols)); + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; + } else { + cpi->ssim_vars = NULL; } #endif @@ -2141,6 +2309,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif +#ifdef OUTPUT_YUV_SVC_SRC + yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb"); + yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb"); + yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2216,8 +2389,30 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, } #endif // !CONFIG_REALTIME_ONLY - vp9_set_speed_features_framesize_independent(cpi); - vp9_set_speed_features_framesize_dependent(cpi); + cpi->mb_wiener_var_cols = 0; + cpi->mb_wiener_var_rows = 0; + cpi->mb_wiener_variance = NULL; + + vp9_set_speed_features_framesize_independent(cpi, oxcf->speed); + vp9_set_speed_features_framesize_dependent(cpi, oxcf->speed); + + { + const int bsize = BLOCK_16X16; + const int w = num_8x8_blocks_wide_lookup[bsize]; + const int h = num_8x8_blocks_high_lookup[bsize]; + const int num_cols = (cm->mi_cols + w - 1) / w; + const int num_rows = (cm->mi_rows + h - 1) / h; + CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors, + vpx_calloc(num_rows * num_cols, + sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); + } + + cpi->kmeans_data_arr_alloc = 0; +#if CONFIG_NON_GREEDY_MV + cpi->feature_score_loc_alloc = 0; + cpi->tpl_ready = 0; +#endif // CONFIG_NON_GREEDY_MV + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; // Allocate memory to store variances for a frame. CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); @@ -2293,6 +2488,17 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_loop_filter_init(cm); + // Set up the unit scaling factor used during motion search. 
+#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.me_sf = &cpi->me_sf; + cm->error.setjmp = 0; return cpi; @@ -2307,11 +2513,15 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; - unsigned int i; + unsigned int i, frame; int t; if (!cpi) return; +#if CONFIG_INTERNAL_STATS + vpx_free(cpi->ssim_vars); +#endif + cm = &cpi->common; if (cm->current_video_frame > 0) { #if CONFIG_INTERNAL_STATS @@ -2383,7 +2593,6 @@ void vp9_remove_compressor(VP9_COMP *cpi) { fclose(f); } - #endif #if 0 @@ -2402,6 +2611,35 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_denoiser_free(&(cpi->denoiser)); #endif + if (cpi->kmeans_data_arr_alloc) { +#if CONFIG_MULTITHREAD + pthread_mutex_destroy(&cpi->kmeans_mutex); +#endif + vpx_free(cpi->kmeans_data_arr); + } + +#if CONFIG_NON_GREEDY_MV + vpx_free(cpi->feature_score_loc_arr); + vpx_free(cpi->feature_score_loc_sort); + vpx_free(cpi->feature_score_loc_heap); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int sqr_bsize; + for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]); + } + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } + for (t = 0; t < cpi->num_workers; ++t) { VPxWorker *const worker = &cpi->workers[t]; EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; @@ -2425,7 +2663,9 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); } +#if !CONFIG_REALTIME_ONLY vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); +#endif dealloc_compressor_data(cpi); @@ -2459,6 +2699,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif +#ifdef OUTPUT_YUV_SVC_SRC + fclose(yuv_svc_src[0]); + fclose(yuv_svc_src[1]); + fclose(yuv_svc_src[2]); +#endif #if 0 @@ -2707,6 +2952,7 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, } #endif // CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_REALTIME_ONLY static int scale_down(VP9_COMP *cpi, int q) { RATE_CONTROL *const rc = &cpi->rc; GF_GROUP *const gf_group = &cpi->twopass.gf_group; @@ -2754,11 +3000,14 @@ static int big_rate_miss(VP9_COMP *cpi) { // test in two pass for the first static int two_pass_first_group_inter(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - if ((cpi->oxcf.pass == 2) && - (gf_group->index == gf_group->first_inter_index)) { - return 1; + if (cpi->oxcf.pass == 2) { + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int gfg_index = gf_group->index; + + if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE; + return gf_group->update_type[gfg_index - 1] != LF_UPDATE && + gf_group->update_type[gfg_index] == LF_UPDATE; } else { return 0; } @@ -2807,10 +3056,24 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, } return force_recode; } +#endif // !CONFIG_REALTIME_ONLY -void 
vp9_update_reference_frames(VP9_COMP *cpi) { +static void update_ref_frames(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->rc.show_arf_as_gld) { + int tmp = cpi->alt_fb_idx; + cpi->alt_fb_idx = cpi->gld_fb_idx; + cpi->gld_fb_idx = tmp; + } else if (cm->show_existing_frame) { + // Pop ARF. + cpi->lst_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } // At this point the new frame has been encoded. // If any buffer copy / swapping is signaled it should be done here. @@ -2836,23 +3099,23 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; - - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; - cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; - } } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } + int arf_idx = gf_group->top_arf_idx; + + // Push new ARF into stack. + stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx, + gf_group->stack_size); + ++gf_group->stack_size; + + assert(arf_idx < REF_FRAMES); ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); + + cpi->alt_fb_idx = arf_idx; } if (cpi->refresh_golden_frame) { @@ -2877,69 +3140,39 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && - cpi->denoiser.denoising_level > kDenLowLow) { - int svc_base_is_key = 0; - int denoise_svc_second_layer = 0; - if (cpi->use_svc) { - int realloc_fail = 0; - const int svc_buf_shift = - cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 - ? cpi->denoiser.num_ref_frames - : 0; - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - svc_base_is_key = lc->is_key_frame; - denoise_svc_second_layer = - cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 - : 0; - // Check if we need to allocate extra buffers in the denoiser - // for - // refreshed frames. 
- realloc_fail = vp9_denoiser_realloc_svc( - cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, - cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, - cpi->gld_fb_idx, cpi->lst_fb_idx); - if (realloc_fail) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to re-allocate denoiser for SVC"); - } - vp9_denoiser_update_frame_info( - &cpi->denoiser, *cpi->Source, cpi->common.frame_type, - cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, - cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, - denoise_svc_second_layer); + + if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) { + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; } +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + update_ref_frames(cpi); + +#if CONFIG_VP9_TEMPORAL_DENOISING + vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) { - // Keep track of frame index for each reference frame. - SVC *const svc = &cpi->svc; - if (cm->frame_type == KEY_FRAME) { - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } else { - if (cpi->refresh_last_frame) - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - if (cpi->refresh_golden_frame) - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - if (cpi->refresh_alt_ref_frame) - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } - } + + if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; - - const int is_reference_frame = + int is_reference_frame = (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (cpi->use_svc && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + is_reference_frame = !cpi->svc.non_reference_frame; + + // Skip loop filter in show_existing_frame mode. + if (cm->show_existing_frame) { + lf->filter_level = 0; + return; + } if (xd->lossless) { lf->filter_level = 0; @@ -3066,8 +3299,8 @@ void vp9_scale_references(VP9_COMP *cpi) { if (cpi->oxcf.pass == 0 && !cpi->use_svc) { // Check for release of scaled reference. buf_idx = cpi->scaled_ref_idx[ref_frame - 1]; - buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL; - if (buf != NULL) { + if (buf_idx != INVALID_IDX) { + buf = &pool->frame_bufs[buf_idx]; --buf->ref_count; cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; } @@ -3098,22 +3331,21 @@ static void release_scaled_references(VP9_COMP *cpi) { refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height)) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } } } } else { - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < REFS_PER_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; --buf->ref_count; cpi->scaled_ref_idx[i] = INVALID_IDX; } @@ -3172,11 +3404,9 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { case VPX_BITS_10: dc_quant_devisor = 16.0; break; - case VPX_BITS_12: - dc_quant_devisor = 64.0; - break; default: - assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; break; } #else @@ -3292,7 +3522,7 @@ static void set_mv_search_params(VP9_COMP *cpi) { } static void set_size_independent_vars(VP9_COMP *cpi) { - vp9_set_speed_features_framesize_independent(cpi); + vp9_set_speed_features_framesize_independent(cpi, cpi->oxcf.speed); vp9_set_rd_speed_thresholds(cpi); vp9_set_rd_speed_thresholds_sub8x8(cpi); cpi->common.interp_filter = cpi->sf.default_interp_filter; @@ -3303,11 +3533,16 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, VP9_COMMON *const cm = &cpi->common; // Setup variables that depend on the dimensions of the frame. - vp9_set_speed_features_framesize_dependent(cpi); + vp9_set_speed_features_framesize_dependent(cpi, cpi->oxcf.speed); // Decide q and q bounds. 
*q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index); + if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) { + *q = cpi->rc.worst_quality; + cpi->rc.force_max_q = 0; + } + if (!frame_is_intra_only(cm)) { vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } @@ -3415,9 +3650,7 @@ static void set_frame_size(VP9_COMP *cpi) { #endif } - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_set_target_rate(cpi); } @@ -3464,19 +3697,76 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +#if CONFIG_CONSISTENT_RECODE +static void save_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact_prev[i][j] = + tile_data->thresh_freq_fact[i][j]; + } + } + } + } +} +#endif + +static INLINE void set_raw_source_frame(VP9_COMP *cpi) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + &cpi->common, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (cpi->oxcf.pass == 0), EIGHTTAP, 0); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif +} + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; - int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + SVC *const svc = &cpi->svc; + int q = 0, bottom_index = 0, top_index = 0; + int no_drop_scene_change = 0; const INTERP_FILTER filter_scaler = (is_one_pass_cbr_svc(cpi)) - ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP; const int phase_scaler = (is_one_pass_cbr_svc(cpi)) - ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] + ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; + if (cm->show_existing_frame) { + cpi->rc.this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return 1; + } + + svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe; + // Flag to check if it's valid to compute the source sad (used for // scene detection and for superblock content state in CBR mode). // The flag may get reset below based on SVC or resizing state.
@@ -3489,30 +3779,36 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && - cpi->svc.scaled_temp.y_width == cm->width << 1 && - cpi->svc.scaled_temp.y_height == cm->height << 1) { + svc->scaled_temp.y_width == cm->width << 1 && + svc->scaled_temp.y_height == cm->height << 1) { // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. - const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; - const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; + const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; + const int phase_scaler2 = svc->downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, + cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); - cpi->svc.scaled_one_half = 1; + svc->scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && - cpi->svc.scaled_one_half) { + svc->scaled_one_half) { // If the spatial layer is 1/2x1/2 and the scaling is already done in the // two-stage scaling, use the result directly. - cpi->Source = &cpi->svc.scaled_temp; - cpi->svc.scaled_one_half = 0; + cpi->Source = &svc->scaled_temp; + svc->scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), filter_scaler, phase_scaler); } +#ifdef OUTPUT_YUV_SVC_SRC + // Write out at most 3 spatial layers. + if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) { + vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); + } +#endif // Unfiltered raw source used in metrics calculation if the source // has been filtered. if (is_psnr_calc_enabled(cpi)) { @@ -3530,9 +3826,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } if ((cpi->use_svc && - (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || - cpi->svc.current_superframe < 1)) || + (svc->spatial_layer_id < svc->number_spatial_layers - 1 || + svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->current_superframe < 1)) || cpi->resize_pending || cpi->resize_state || cpi->external_resize || cpi->resize_state != ORIG) { cpi->compute_source_sad_onepass = 0; @@ -3562,53 +3858,102 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Last_Source->y_height != cpi->Source->y_height) cpi->compute_source_sad_onepass = 0; - if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { + if (frame_is_intra_only(cm) || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); } - vp9_update_noise_estimate(cpi); +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + vp9_denoiser_reset_on_first_frame(cpi); +#endif // Scene detection is always used for VBR mode or screen-content case. // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now // (need to check encoding time cost for doing this for speed 8). 
cpi->rc.high_source_sad = 0; - if (cpi->compute_source_sad_onepass && cm->show_frame && + cpi->rc.hybrid_intra_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 0; + if (cm->show_frame && cpi->oxcf.mode == REALTIME && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || - (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) vp9_scene_detection_onepass(cpi); + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + svc->high_source_sad_superframe = cpi->rc.high_source_sad; + svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion; + // On scene change reset temporal layer pattern to TL0. + // Note that if the base/lower spatial layers are skipped: instead of + // inserting base layer here, we force max-q for the next superframe + // with lower spatial layers: this is done in vp9_encodedframe_overshoot() + // when max-q is decided for the current layer. + // Only do this reset for bypass/flexible mode. + if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 && + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + // rc->high_source_sad will get reset so copy it to restore it. + int tmp_high_source_sad = cpi->rc.high_source_sad; + vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME); + cpi->rc.high_source_sad = tmp_high_source_sad; + } + } + + vp9_update_noise_estimate(cpi); + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, if base layer is key for svc, + // on scene change, or if superframe has layer sync. + if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) && + !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0])) + no_drop_scene_change = 1; + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + !frame_is_intra_only(cm) && !no_drop_scene_change && + !svc->superframe_has_layer_sync && + (!cpi->use_svc || + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). if (frame_is_intra_only(cm) == 0 && - !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) { + !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref)) { vp9_scale_references(cpi); } set_size_independent_vars(cpi); set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + // search method and step parameter might be changed in speed settings. + init_motion_estimation(cpi); + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); if (cpi->sf.svc_use_lowres_part && - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { - if (cpi->svc.prev_partition_svc == NULL) { + svc->spatial_layer_id == svc->number_spatial_layers - 2) { + if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, cpi->svc.prev_partition_svc, + cm, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, - sizeof(*cpi->svc.prev_partition_svc))); + sizeof(*svc->prev_partition_svc))); } } - if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + // TODO(jianj): Look into issue of skin detection with high bitdepth. 
+ if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { cpi->use_skin_detection = 1; } + // Enable post encode frame dropping for CBR on non key frame, when + // ext_use_post_encode_drop is specified by user. + cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop && + cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME; + vp9_set_quantizer(cm, q); vp9_set_variance_partition_thresholds(cpi, q, 0); @@ -3616,6 +3961,34 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } + + if (cpi->rc.last_post_encode_dropped_scene_change) { + cpi->rc.high_source_sad = 1; + svc->high_source_sad_superframe = 1; + // For now disable use_source_sad since Last_Source will not be the previous + // encoded but the dropped one. + cpi->sf.use_source_sad = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + } + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { + if (vp9_encodedframe_overshoot(cpi, -1, &q)) { + vp9_set_quantizer(cm, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); + } + } + +#if !CONFIG_REALTIME_ONLY // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -3624,24 +3997,32 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_360aq_frame_setup(cpi); } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vp9_cyclic_refresh_setup(cpi); } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { // it may be pretty bad for rate-control, // and I should handle it somehow vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else { +#endif + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vp9_cyclic_refresh_setup(cpi); + } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + apply_roi_map(cpi); + } +#if !CONFIG_REALTIME_ONLY } +#endif apply_active_map(cpi); vp9_encode_frame(cpi); - // Check if we should drop this frame because of high overshoot. - // Only for frames where high temporal-source SAD is detected. - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && - cpi->resize_state == ORIG && cm->frame_type != KEY_FRAME && - cpi->oxcf.content == VP9E_CONTENT_SCREEN && - cpi->rc.high_source_sad == 1) { + // Check if we should re-encode this frame at high Q because of high + // overshoot based on the encoded frame size. Only for frames where + // high temporal-source SAD is detected. + // For SVC: all spatial layers are checked for re-encoding. + if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { int frame_size = 0; // Get an estimate of the encoded frame size. 
save_coding_context(cpi); @@ -3657,8 +4038,12 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); // Turn-off cyclic refresh for re-encoded frame. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; vp9_disable_segmentation(&cm->seg); } apply_active_map(cpi); @@ -3668,15 +4053,17 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Update some stats from cyclic refresh, and check for golden frame update. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cm->frame_type != KEY_FRAME) + !frame_is_intra_only(cm)) vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. // update_base_skip_probs(cpi); vpx_clear_system_state(); + return 1; } +#if !CONFIG_REALTIME_ONLY #define MAX_QSTEP_ADJ 4 static int get_qstep_adj(int rate_excess, int rate_limit) { int qstep = @@ -3703,11 +4090,17 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int qrange_adj = 1; #endif + if (cm->show_existing_frame) { + rc->this_frame_target = 0; + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return; + } + set_size_independent_vars(cpi); - enable_acl = cpi->sf.allow_acl - ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) - : 0; + enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) || + (cpi->twopass.gf_group.index == 1) + : 0; do { vpx_clear_system_state(); @@ -3796,6 +4189,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_setup_in_frame_q_adj(cpi); } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (oxcf->aq_mode == PSNR_AQ) { + vp9_psnr_aq_mode_setup(&cm->seg); } vp9_encode_frame(cpi); @@ -3900,8 +4295,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Special case if the projected size is > the max allowed. if ((q == q_high) && ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - (rc->projected_frame_size >= - big_rate_miss_high_threshold(cpi)))) { + (!rc->is_src_frame_alt_ref && + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi))))) { int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, big_rate_miss_high_threshold(cpi))); double q_val_high; @@ -4006,7 +4402,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #endif // Have we been forced to adapt Q outside the expected range by an extreme // rate miss. If so adjust the active maxQ for the subsequent frames. 
- if (q > cpi->twopass.active_worst_quality) { + if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) { cpi->twopass.active_worst_quality = q; } else if (oxcf->vbr_corpus_complexity && q == q_low && rc->projected_frame_size < rc->this_frame_target) { @@ -4028,14 +4424,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_encode_frame(cpi); vpx_clear_system_state(); restore_coding_context(cpi); - vp9_pack_bitstream(cpi, dest, size); - - vp9_encode_frame(cpi); - vpx_clear_system_state(); - - restore_coding_context(cpi); } } +#endif // !CONFIG_REALTIME_ONLY static int get_ref_frame_flags(const VP9_COMP *cpi) { const int *const map = cpi->common.ref_frame_map; @@ -4131,20 +4522,21 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( } } -static void set_arf_sign_bias(VP9_COMP *cpi) { +static void set_ref_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int arf_sign_bias; + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + const int cur_frame_index = ref_buffer->frame_index; + MV_REFERENCE_FRAME ref_frame; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); - } else { - arf_sign_bias = - (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + const RefCntBuffer *const ref_cnt_buf = + get_ref_cnt_buffer(&cpi->common, buf_idx); + if (ref_cnt_buf) { + cm->ref_frame_sign_bias[ref_frame] = + cur_frame_index < ref_cnt_buf->frame_index; + } } - cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; } static int setup_interp_filter_search_mask(VP9_COMP *cpi) { @@ -4328,6 +4720,7 @@ static void spatial_denoise_frame(VP9_COMP *cpi) { } #endif // ENABLE_KF_DENOISE +#if !CONFIG_REALTIME_ONLY static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, uint8_t *dest) { if (cpi->common.seg.enabled) @@ -4351,35 +4744,290 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, vp9_enable_segmentation(&cpi->common.seg); } } +#endif -static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, - uint8_t *dest, - unsigned int *frame_flags) { - VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; - struct segmentation *const seg = &cm->seg; - TX_SIZE t; +static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); - set_ext_overrides(cpi); - vpx_clear_system_state(); + if (ref_buffer) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + ref_buffer->frame_index = + cm->current_video_frame + gf_group->arf_src_offset[gf_group->index]; + } +} -#ifdef ENABLE_KF_DENOISE - // Spatial denoise of key frame. - if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi); +// Implementation and modifications of C. Yeo, H. L. Tan, and Y. H. Tan, "On +// rate distortion optimization using SSIM," Circuits and Systems for Video +// Technology, IEEE Transactions on, vol. 23, no. 7, pp. 1170-1181, 2013. +// SSIM_VAR_SCALE defines the strength of the bias towards SSIM in RDO. 
+// Some sample values are: +// (for midres test set) +// SSIM_VAR_SCALE avg_psnr ssim ms_ssim +// 8.0 9.421 -5.537 -6.898 +// 16.0 4.703 -5.378 -6.238 +// 32.0 1.929 -4.308 -4.807 +#define SSIM_VAR_SCALE 16.0 +static void set_mb_ssim_rdmult_scaling(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + uint8_t *y_buffer = cpi->Source->y_buffer; + const int y_stride = cpi->Source->y_stride; + const int block_size = BLOCK_16X16; + + const int num_8x8_w = num_8x8_blocks_wide_lookup[block_size]; + const int num_8x8_h = num_8x8_blocks_high_lookup[block_size]; + const int num_cols = (cm->mi_cols + num_8x8_w - 1) / num_8x8_w; + const int num_rows = (cm->mi_rows + num_8x8_h - 1) / num_8x8_h; + double log_sum = 0.0; + int row, col; + +#if CONFIG_VP9_HIGHBITDEPTH + double c2; + if (xd->bd == 10) { + c2 = 941.8761; // (.03*1023)^2 + } else if (xd->bd == 12) { + c2 = 15092.1225; // (.03*4095)^2 + } else { + c2 = 58.5225; // (.03*255)^2 + } +#else + const double c2 = 58.5225; // (.03*255)^2 #endif - // Set the arf sign bias for this frame. - set_arf_sign_bias(cpi); + // Loop through each 16x16 block. + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + int mi_row, mi_col; + double var = 0.0, num_of_var = 0.0; + const int index = row * num_cols + col; + + for (mi_row = row * num_8x8_h; + mi_row < cm->mi_rows && mi_row < (row + 1) * num_8x8_h; ++mi_row) { + for (mi_col = col * num_8x8_w; + mi_col < cm->mi_cols && mi_col < (col + 1) * num_8x8_w; ++mi_col) { + struct buf_2d buf; + const int row_offset_y = mi_row << 3; + const int col_offset_y = mi_col << 3; + + buf.buf = y_buffer + row_offset_y * y_stride + col_offset_y; + buf.stride = y_stride; + + // In order to keep SSIM_VAR_SCALE on the same scale for both 8 bit + // and high bit videos, the variance needs to be divided by 2.0 or + // 64.0 separately. +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + var += + vp9_high_get_sby_variance(cpi, &buf, BLOCK_8X8, xd->bd) / 2.0; + else +#endif + var += vp9_get_sby_variance(cpi, &buf, BLOCK_8X8) / 64.0; - // Set default state for segment based loop filter update flags. - cm->lf.mode_ref_delta_update = 0; + num_of_var += 1.0; + } + } + var = var / num_of_var / SSIM_VAR_SCALE; + var = 2.0 * var + c2; + cpi->mi_ssim_rdmult_scaling_factors[index] = var; + log_sum += log(var); + } + } + log_sum = exp(log_sum / (double)(num_rows * num_cols)); - if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) - cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); + for (row = 0; row < num_rows; ++row) { + for (col = 0; col < num_cols; ++col) { + const int index = row * num_cols + col; + cpi->mi_ssim_rdmult_scaling_factors[index] /= log_sum; + } + } - // Set various flags etc to special state if it is a key frame. - if (frame_is_intra_only(cm)) { - // Reset the loop filter deltas and segmentation map. + (void)xd; +} + +// Process the wiener variance in 16x16 block basis.
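Condensed into closed form, the per-block factor computed by set_mb_ssim_rdmult_scaling() above is, for 16x16 block $i$ with bit-depth-normalized mean 8x8 variance $\bar{\sigma}_i^2$:

$$ s_i = \frac{2\,\bar{\sigma}_i^{2}}{\mathrm{SSIM\_VAR\_SCALE}} + c_2, \qquad \mathrm{factor}_i = \frac{s_i}{\big(\prod_{j=1}^{N} s_j\big)^{1/N}}, \qquad c_2 = \big(0.03\,(2^{\mathrm{bd}} - 1)\big)^2, $$

with $N$ = num_rows * num_cols. Dividing by the geometric mean makes the log factors average to zero, so flat (low-variance) blocks end up with factors below one and textured blocks above one while the frame-wide rate-distortion balance is preserved.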
+static int qsort_comp(const void *elem1, const void *elem2) { + int a = *((const int *)elem1); + int b = *((const int *)elem2); + if (a > b) return 1; + if (a < b) return -1; + return 0; +} + +static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + + if (cpi->mb_wiener_variance && cpi->mb_wiener_var_rows >= cm->mb_rows && + cpi->mb_wiener_var_cols >= cm->mb_cols) + return; + + vpx_free(cpi->mb_wiener_variance); + cpi->mb_wiener_variance = NULL; + + CHECK_MEM_ERROR( + cm, cpi->mb_wiener_variance, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); + cpi->mb_wiener_var_rows = cm->mb_rows; + cpi->mb_wiener_var_cols = cm->mb_cols; +} + +static void set_mb_wiener_variance(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + uint8_t *buffer = cpi->Source->y_buffer; + int buf_stride = cpi->Source->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + DECLARE_ALIGNED(16, uint16_t, zero_pred16[32 * 32]); + DECLARE_ALIGNED(16, uint8_t, zero_pred8[32 * 32]); + uint8_t *zero_pred; +#else + DECLARE_ALIGNED(16, uint8_t, zero_pred[32 * 32]); +#endif + + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + + int mb_row, mb_col, count = 0; + // Hard coded operating block size + const int block_size = 16; + const int coeff_count = block_size * block_size; + const TX_SIZE tx_size = TX_16X16; + +#if CONFIG_VP9_HIGHBITDEPTH + xd->cur_buf = cpi->Source; + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + zero_pred = CONVERT_TO_BYTEPTR(zero_pred16); + memset(zero_pred16, 0, sizeof(*zero_pred16) * coeff_count); + } else { + zero_pred = zero_pred8; + memset(zero_pred8, 0, sizeof(*zero_pred8) * coeff_count); + } +#else + memset(zero_pred, 0, sizeof(*zero_pred) * coeff_count); +#endif + + cpi->norm_wiener_variance = 0; + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + int idx; + int16_t median_val = 0; + uint8_t *mb_buffer = + buffer + mb_row * block_size * buf_stride + mb_col * block_size; + int64_t wiener_variance = 0; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size, + xd->bd); + highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } else { + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + } +#else + vpx_subtract_block(block_size, block_size, src_diff, block_size, + mb_buffer, buf_stride, zero_pred, block_size); + wht_fwd_txfm(src_diff, block_size, coeff, tx_size); +#endif // CONFIG_VP9_HIGHBITDEPTH + + coeff[0] = 0; + for (idx = 1; idx < coeff_count; ++idx) coeff[idx] = abs(coeff[idx]); + + qsort(coeff, coeff_count - 1, sizeof(*coeff), qsort_comp); + + // Noise level estimation + median_val = coeff[coeff_count / 2]; + + // Wiener filter + for (idx = 1; idx < coeff_count; ++idx) { + int64_t sqr_coeff = (int64_t)coeff[idx] * coeff[idx]; + int64_t tmp_coeff = (int64_t)coeff[idx]; + if (median_val) { + tmp_coeff = (sqr_coeff * coeff[idx]) / + (sqr_coeff + (int64_t)median_val * median_val); + } + wiener_variance += tmp_coeff * tmp_coeff; + } + cpi->mb_wiener_variance[mb_row * cm->mb_cols + mb_col] = + wiener_variance / coeff_count; + cpi->norm_wiener_variance += + cpi->mb_wiener_variance[mb_row 
* cm->mb_cols + mb_col]; + ++count; + } + } + + if (count) cpi->norm_wiener_variance /= count; + cpi->norm_wiener_variance = VPXMAX(1, cpi->norm_wiener_variance); +} + +static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, + uint8_t *dest, + unsigned int *frame_flags) { + VP9_COMMON *const cm = &cpi->common; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + struct segmentation *const seg = &cm->seg; + TX_SIZE t; + + // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. + // If in constrained layer drop mode (svc.framedrop_mode != LAYER_DROP) and + // base spatial layer was dropped, no need to set svc.skip_enhancement_layer, + // as whole superframe will be dropped. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + if (cpi->svc.framedrop_mode == LAYER_DROP || + cpi->svc.drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignment with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + return; + } + + set_ext_overrides(cpi); + vpx_clear_system_state(); + +#ifdef ENABLE_KF_DENOISE + // Spatial denoise of key frame. + if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi); +#endif + + if (cm->show_existing_frame == 0) { + // Update frame index + set_frame_index(cpi, cm); + + // Set the arf sign bias for this frame. + set_ref_sign_bias(cpi); + } + + // Set default state for segment based loop filter update flags. + cm->lf.mode_ref_delta_update = 0; + + if (cpi->oxcf.pass == 2 && cpi->sf.adaptive_interp_filter_search) + cpi->sf.interp_filter_search_mask = setup_interp_filter_search_mask(cpi); + + // Set various flags etc to special state if it is a key frame. + if (frame_is_intra_only(cm)) { + // Reset the loop filter deltas and segmentation map. vp9_reset_segment_features(&cm->seg); // If segmentation is enabled force a map update for key frames. @@ -4404,66 +5052,12 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->reset_frame_context = 2; } } - if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { - // Use context 0 for intra only empty frame, but the last frame context - // for other empty frames. - if (cpi->svc.encode_empty_frame_state == ENCODING) { - if (cpi->svc.encode_intra_empty_frame != 0) - cm->frame_context_idx = 0; - else - cm->frame_context_idx = FRAME_CONTEXTS - 1; - } else { - cm->frame_context_idx = - cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id; - } - cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; + if (oxcf->tuning == VP8_TUNE_SSIM) set_mb_ssim_rdmult_scaling(cpi); - // The probs will be updated based on the frame type of its previous - // frame if frame_parallel_decoding_mode is 0.
The type may vary for - // the frame after a key frame in base layer since we may drop enhancement - // layers. So set frame_parallel_decoding_mode to 1 in this case. - if (cm->frame_parallel_decoding_mode == 0) { - if (cpi->svc.number_temporal_layers == 1) { - if (cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) - cm->frame_parallel_decoding_mode = 1; - } else if (cpi->svc.spatial_layer_id == 0) { - // Find the 2nd frame in temporal base layer and 1st frame in temporal - // enhancement layers from the key frame. - int i; - for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { - if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { - cm->frame_parallel_decoding_mode = 1; - break; - } - } - } - } - } - - // For 1 pass CBR, check if we are dropping this frame. - // For spatial layers, for now only check for frame-dropping on first spatial - // layer, and if decision is to drop, we drop whole super-frame. - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && - cm->frame_type != KEY_FRAME) { - if (vp9_rc_drop_frame(cpi) || - (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) { - vp9_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - cpi->ext_refresh_frame_flags_pending = 0; - cpi->svc.rc_drop_superframe = 1; - cpi->last_frame_dropped = 1; - // TODO(marpan): Advancing the svc counters on dropped frames can break - // the referencing scheme for the fixed svc patterns defined in - // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but - // for now, don't advance the svc frame counters on dropped frame. - // if (cpi->use_svc) - // vp9_inc_frame_in_layer(cpi); - - return; - } + if (oxcf->aq_mode == PERCEPTUAL_AQ) { + init_mb_wiener_var_buffer(cpi); + set_mb_wiener_variance(cpi); } vpx_clear_system_state(); @@ -4472,18 +5066,33 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif +#if CONFIG_CONSISTENT_RECODE + // Backup to ensure consistency between recodes + save_encode_params(cpi); +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest); + if (!encode_without_recode_loop(cpi, size, dest)) return; } else { +#if !CONFIG_REALTIME_ONLY encode_with_recode_loop(cpi, size, dest); +#endif } - cpi->last_frame_dropped = 0; + // TODO(jingning): When using show existing frame mode, we assume that the + // current ARF will be directly used as the final reconstructed frame. This is + // an encoder control scheme. One could in principle explore other + // possibilities to arrange the reference frame buffer and their coding order. + if (cm->show_existing_frame) { + ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx, + cm->ref_frame_map[cpi->alt_fb_idx]); + } +#if !CONFIG_REALTIME_ONLY // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_try_disable_lookahead_aq(cpi, size, dest); +#endif #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED @@ -4527,9 +5136,33 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, // Pick the loop filter level for the frame. 
loopfilter_frame(cpi, cm); + if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); + // build the bitstream vp9_pack_bitstream(cpi, dest, size); + if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality && + cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) { + restore_coding_context(cpi); + return; + } + + cpi->last_frame_dropped = 0; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.num_encoded_top_layer++; + + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. + if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } + if (cm->seg.update_map) update_reference_segmentation_map(cpi); if (frame_is_intra_only(cm) == 0) { @@ -4537,17 +5170,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } vp9_update_reference_frames(cpi); - for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->td.counts->coef[t], - cpi->td.rd_counts.coef_counts[t]); - - if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) - vp9_adapt_coef_probs(cm); + if (!cm->show_existing_frame) { + for (t = TX_4X4; t <= TX_32X32; ++t) { + full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); + } - if (!frame_is_intra_only(cm)) { if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + vp9_adapt_coef_probs(cm); } } @@ -4567,8 +5201,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_frame_type = cm->frame_type; - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_rc_postencode_update(cpi, *size); + vp9_rc_postencode_update(cpi, *size); + + *size = VPXMAX(1, *size); #if 0 output_frame_level_debug_stats(cpi); @@ -4592,7 +5227,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_height = cm->height; // reset to normal state now that we are done. - if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + } if (cm->show_frame) { vp9_swap_mi_and_prev_mi(cm); @@ -4601,19 +5239,26 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, ++cm->current_video_frame; if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } - cm->prev_frame = cm->cur_frame; - if (cpi->use_svc) + if (cpi->use_svc) { cpi->svc .layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id] .last_frame_type = cm->frame_type; + // Reset layer_sync back to 0 for next frame. 
+ cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0; + } cpi->force_update_segmentation = 0; +#if !CONFIG_REALTIME_ONLY if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi); +#endif + + cpi->svc.previous_frame_is_intra_only = cm->intra_only; + cpi->svc.set_intra_only_frame = 0; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -4636,10 +5281,12 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if CONFIG_MISMATCH_DEBUG + mismatch_move_frame_idx_w(); +#endif encode_frame_to_data_rate(cpi, size, dest, frame_flags); - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_twopass_postencode_update(cpi); + vp9_twopass_postencode_update(cpi); } #endif // !CONFIG_REALTIME_ONLY @@ -4649,6 +5296,8 @@ static void init_ref_frame_bufs(VP9_COMMON *cm) { cm->new_fb_idx = INVALID_IDX; for (i = 0; i < REF_FRAMES; ++i) { cm->ref_frame_map[i] = INVALID_IDX; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { pool->frame_bufs[i].ref_count = 0; } } @@ -4702,6 +5351,12 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, check_initial_width(cpi, subsampling_x, subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_HIGHBITDEPTH + // Disable denoiser for high bitdepth since vp9_denoiser_filter only works for + // 8 bits. + if (cm->bit_depth > 8) cpi->oxcf.noise_sensitivity = 0; +#endif + #if CONFIG_VP9_TEMPORAL_DENOISING setup_denoiser_buffer(cpi); #endif @@ -4822,10 +5477,6 @@ static void check_src_altref(VP9_COMP *cpi, } #if CONFIG_INTERNAL_STATS -extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, - const uint8_t *img2, int img2_pitch, int width, - int height); - static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { s->stat[Y] += y; @@ -5065,6 +5716,1455 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + // TODO(jingning): To be used later for gf frame type parsing. 
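+  // The stack helpers used throughout this function (stack_init /
+  // stack_push / stack_pop) are assumed to be a plain array-backed LIFO
+  // with the top kept at index 0, roughly as sketched below (the actual
+  // definitions live elsewhere); they track which ARF encloses the current
+  // frame at each pyramid level.
+#if 0
+  static INLINE void stack_init(int *st, int n) {
+    int i;
+    for (i = 0; i < n; ++i) st[i] = -1;  // -1 marks an empty slot
+  }
+  static INLINE void stack_push(int *st, int item, int size) {
+    int i;
+    for (i = size; i > 0; --i) st[i] = st[i - 1];  // shift entries down
+    st[0] = item;                                  // newest ARF on top
+  }
+  static INLINE int stack_pop(int *st, int size) {
+    int i;
+    const int item = st[0];
+    for (i = 0; i < size - 1; ++i) st[i] = st[i + 1];
+    return item;  // returns -1 if the stack was empty
+  }
+#endif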
+ (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. 
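+  // For a plain single-ARF group the tpl group built above is laid out
+  // roughly as
+  //   tpl index:   0    1    2 .. n-1   n    n+1  n+2
+  //   role:       GLD  ARF   P frames  OVL   ext  ext
+  // and the two "ext" frames appended below reuse pframe_qindex and
+  // LF_UPDATE, so the backward mc-flow propagation still has statistics
+  // just past the group boundary.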
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction( + VP9_COMP *cpi, ThreadData *td, int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv, int rf_idx) { +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + int frame_idx, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + int mi_row, int mi_col, MV *mv) { +#endif // CONFIG_NON_GREEDY_MV + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; +#if CONFIG_NON_GREEDY_MV + // lambda is used to adjust the importance of motion vector consitency. + // TODO(angiebird): Figure out lambda's proper value. + const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; +#endif + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + +#if CONFIG_NON_GREEDY_MV + (void)search_method; + (void)sadpb; + vp9_prepare_nb_full_mvs(&cpi->tpl_stats[frame_idx], mi_row, mi_col, rf_idx, + bsize, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, &best_ref_mv1_full, step_param, lambda, 1, + &cpi->fn_ptr[bsize], nb_full_mvs, NB_MVS_NUM, mv); +#else + (void)frame_idx; + (void)mi_row; + (void)mi_col; + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); +#endif + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
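+  // This makes the search two-stage: the full-pel stage above (NSTEP
+  // diamond search, or the neighbor-consistent variant under
+  // CONFIG_NON_GREEDY_MV, where lambda penalizes full-pel MVs that stray
+  // from nb_full_mvs), followed by the fractional-pel refinement below.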
+ // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * 
tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, + &eob, scan_order->scan, scan_order->iscan); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
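+  // In both bit-depth paths the Hadamard transform is a cheap stand-in for
+  // the DCT: callers transform the residual and take satd = sum(|coeff|)
+  // as a proxy for transform-domain cost. For the 32x32 tpl blocks used by
+  // this pass that amounts to:
+  //   wht_fwd_txfm(src_diff, 32, coeff, TX_32X32);
+  //   cost = vpx_satd(coeff, 32 * 32);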
+ switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *sse) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? 
&mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int_mv mv; + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + mv.as_int = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col)->as_int; +#else + motion_compensated_prediction( + cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize, + mi_row, mi_col, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, 
best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = *get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, 
BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, tpl_frame, rf_idx, bsize, mi_row, + mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + int rf_idx, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, tpl_frame, rf_idx, bsize, + mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, tpl_frame, rf_idx, + bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, tpl_frame, rf_idx, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, BLOCK_SIZE bsize, + int mi_row, int mi_col, int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = get_mv_dist(mv_mode, cpi, xd, gf_picture, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, tpl_frame, rf_idx, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagnal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = + find_best_ref_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode(NEW_MV_MODE, cpi, x, gf_picture, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, + &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. 
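+  // The pre-check region is the triangular set of blocks on the first
+  // kMvPreCheckLines = 5 anti-diagonals at and below/right of
+  // (mi_row, mi_col), hence kMvPreCheckSize = 1 + 2 + 3 + 4 + 5 = 15.
+  // Visit order (r increasing downward, c increasing rightward):
+  //    0  1  3  6 10
+  //    2  4  7 11
+  //    5  8 12
+  //    9 13
+  //   14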
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = + find_best_ref_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, + rf_idx, bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, frame_idx, tpl_frame, rf_idx, bsize, + mi_row, mi_col); + } + } +} + +static double get_feature_score(uint8_t *buf, ptrdiff_t stride, int rows, + int cols) { + double IxIx = 0; + double IxIy = 0; + double IyIy = 0; + double score; + int r, c; + vpx_clear_system_state(); + for (r = 0; r + 1 < rows; ++r) { + for (c = 0; c + 1 < cols; ++c) { + int diff_x = buf[r * stride + c] - buf[r * stride + c + 1]; + int diff_y = buf[r * stride + c] - buf[(r + 1) * stride + c]; + IxIx += diff_x * diff_x; + IxIy += diff_x * diff_y; + IyIy += diff_y * diff_y; + } + } + IxIx /= (rows - 1) * (cols - 1); + IxIy /= (rows - 1) * (cols - 1); + IyIy /= (rows - 1) * (cols - 1); + score = (IxIx * IyIy - IxIy * IxIy + 0.0001) / (IxIx + IyIy + 0.0001); + return score; +} + +static int compare_feature_score(const void *a, const void *b) { + const FEATURE_SCORE_LOC *aa = *(FEATURE_SCORE_LOC *const *)a; + const FEATURE_SCORE_LOC *bb = *(FEATURE_SCORE_LOC *const *)b; + if (aa->feature_score < bb->feature_score) { + return 1; + } else if (aa->feature_score > bb->feature_score) { + return -1; + } else { + return 0; + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, int frame_idx, + YV12_BUFFER_CONFIG **ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + 
mi_col]; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + int rf_idx; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int_mv *mv = get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + if (ref_frame[rf_idx] == NULL) { + tpl_stats->ready[rf_idx] = 0; + continue; + } else { + tpl_stats->ready[rf_idx] = 1; + } + motion_compensated_prediction( + cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize, + mi_row, mi_col, &mv->as_mv, rf_idx); + } +} + +#define CHANGE_MV_SEARCH_ORDER 1 +#define USE_PQSORT 1 + +#if CHANGE_MV_SEARCH_ORDER +#if USE_PQSORT +static void max_heap_pop(FEATURE_SCORE_LOC **heap, int *size, + FEATURE_SCORE_LOC **output) { + if (*size > 0) { + *output = heap[0]; + --*size; + if (*size > 0) { + int p, l, r; + heap[0] = heap[*size]; + p = 0; + l = 2 * p + 1; + r = 2 * p + 2; + while (l < *size) { + FEATURE_SCORE_LOC *tmp; + int c = l; + if (r < *size && heap[r]->feature_score > heap[l]->feature_score) { + c = r; + } + if (heap[p]->feature_score >= heap[c]->feature_score) { + break; + } + tmp = heap[p]; + heap[p] = heap[c]; + heap[c] = tmp; + p = c; + l = 2 * p + 1; + r = 2 * p + 2; + } + } + } else { + assert(0); + } +} + +static void max_heap_push(FEATURE_SCORE_LOC **heap, int *size, + FEATURE_SCORE_LOC *input) { + int c, p; + FEATURE_SCORE_LOC *tmp; + input->visited = 1; + heap[*size] = input; + ++*size; + c = *size - 1; + p = c >> 1; + while (c > 0 && heap[c]->feature_score > heap[p]->feature_score) { + tmp = heap[p]; + heap[p] = heap[c]; + heap[c] = tmp; + c = p; + p >>= 1; + } +} + +static void add_nb_blocks_to_heap(VP9_COMP *cpi, const TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *heap_size) { + const int mi_unit = num_8x8_blocks_wide_lookup[bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int i; + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0] * mi_unit; + int c = dirs[i][1] * mi_unit; + if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 && + mi_col + c < tpl_frame->mi_cols) { + FEATURE_SCORE_LOC *fs_loc = + &cpi->feature_score_loc_arr[(mi_row + r) * tpl_frame->stride + + (mi_col + c)]; + if (fs_loc->visited == 0) { + max_heap_push(cpi->feature_score_loc_heap, heap_size, fs_loc); + } + } + } +} +#endif // USE_PQSORT +#endif // CHANGE_MV_SEARCH_ORDER + +static void build_motion_field(VP9_COMP *cpi, MACROBLOCKD *xd, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[3], + BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int fs_loc_sort_size; + int fs_loc_heap_size; + int mi_row, mi_col; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + fs_loc_sort_size = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + 
FEATURE_SCORE_LOC *fs_loc = + &cpi->feature_score_loc_arr[mi_row * tpl_frame->stride + mi_col]; + tpl_stats->feature_score = get_feature_score( + xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh); + fs_loc->visited = 0; + fs_loc->feature_score = tpl_stats->feature_score; + fs_loc->mi_row = mi_row; + fs_loc->mi_col = mi_col; + cpi->feature_score_loc_sort[fs_loc_sort_size] = fs_loc; + ++fs_loc_sort_size; + } + } + + qsort(cpi->feature_score_loc_sort, fs_loc_sort_size, + sizeof(*cpi->feature_score_loc_sort), compare_feature_score); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + tpl_stats->ready[rf_idx] = 0; + } + } + } + +#if CHANGE_MV_SEARCH_ORDER +#if !USE_PQSORT + for (i = 0; i < fs_loc_sort_size; ++i) { + FEATURE_SCORE_LOC *fs_loc = cpi->feature_score_loc_sort[i]; + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row, + fs_loc->mi_col); + } +#else // !USE_PQSORT + fs_loc_heap_size = 0; + max_heap_push(cpi->feature_score_loc_heap, &fs_loc_heap_size, + cpi->feature_score_loc_sort[0]); + + while (fs_loc_heap_size > 0) { + FEATURE_SCORE_LOC *fs_loc; + max_heap_pop(cpi->feature_score_loc_heap, &fs_loc_heap_size, &fs_loc); + + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row, + fs_loc->mi_col); + + add_nb_blocks_to_heap(cpi, tpl_frame, bsize, fs_loc->mi_row, fs_loc->mi_col, + &fs_loc_heap_size); + } +#endif // !USE_PQSORT +#else // CHANGE_MV_SEARCH_ORDER + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, mi_row, mi_col); + } + } +#endif // CHANGE_MV_SEARCH_ORDER +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[3] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; +#if CONFIG_NON_GREEDY_MV + int square_block_idx; + int rf_idx; +#endif + + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( 
+ &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < 3; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, xd, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + predict_mv_mode_arr(cpi, x, gf_picture, frame_idx, tpl_frame, rf_idx, + bsize); + } + } +#endif + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &sse); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + 
"rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = + *get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +static void init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int sqr_bsize; + int rf_idx; + + // TODO(angiebird): This probably needs further modifications to support + // frame scaling later on. + if (cpi->feature_score_loc_alloc == 0) { + // The smallest block size of motion field is 4x4, but the mi_unit is 8x8, + // therefore the number of units is "mi_rows * mi_cols * 4" here. + CHECK_MEM_ERROR( + cm, cpi->feature_score_loc_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->feature_score_loc_arr))); + CHECK_MEM_ERROR(cm, cpi->feature_score_loc_sort, + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->feature_score_loc_sort))); + CHECK_MEM_ERROR(cm, cpi->feature_score_loc_heap, + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->feature_score_loc_heap))); + + cpi->feature_score_loc_alloc = 1; + } + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + cm, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. 
+ for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize], + vpx_calloc( + mi_rows * mi_cols * 4, + sizeof( + *cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]))); + } + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +static void setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + cpi->tpl_bsize = BLOCK_32X32; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV +} + int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, int64_t *time_end, int flush) { @@ -5077,17 +7177,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; int arf_src_index; + const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_two_pass_svc(cpi)) { -#if CONFIG_SPATIAL_SVC - vp9_svc_start_frame(cpi); - // Use a small empty frame instead of a real frame - if (cpi->svc.encode_empty_frame_state == ENCODING) - source = &cpi->svc.empty_frame; -#endif - if (oxcf->pass == 2) vp9_restore_layer_context(cpi); - } else if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_cbr_svc(cpi)) { vp9_one_pass_cbr_svc_start_layer(cpi); } @@ -5098,10 +7191,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Is multi-arf enabled. // Note that at the moment multi_arf is only configured for 2 pass VBR and // will not work properly with svc. 
- if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf > 1)) - cpi->multi_arf_allowed = 1; + // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf" + // is greater than or equal to 2. + if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2)) + cpi->multi_layer_arf = 1; else - cpi->multi_arf_allowed = 0; + cpi->multi_layer_arf = 0; // Normal defaults cm->reset_frame_context = 0; @@ -5115,9 +7210,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); - // Skip alt frame if we encode the empty frame - if (is_two_pass_svc(cpi) && source != NULL) arf_src_index = 0; - if (arf_src_index) { for (i = 0; i <= arf_src_index; ++i) { struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i); @@ -5132,25 +7224,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } + // Clear arf index stack before group of pictures processing starts. + if (gf_group_index == 1) { + stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2); + cpi->twopass.gf_group.stack_size = 0; + } + if (arf_src_index) { assert(arf_src_index <= rc->frames_to_key); - if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cpi->alt_ref_source = source; -#if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) { - int i; - // Reference a hidden frame from a lower layer - for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { - if (oxcf->ss_enable_auto_arf[i]) { - cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx; - break; - } - } - } - cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; -#endif #if !CONFIG_REALTIME_ONLY if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { @@ -5192,7 +7276,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. - if (cpi->use_svc) + if (cpi->use_svc || cpi->svc.set_intra_only_frame) source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else source = vp9_lookahead_pop(cpi->lookahead, flush); @@ -5202,8 +7286,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - if ((source->flags & VPX_EFLAG_FORCE_KF) && - cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { + if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc && + cpi->svc.spatial_layer_id > 0) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); } @@ -5227,7 +7311,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, *time_stamp = source->ts_start; *time_end = source->ts_end; *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? 
FRAMEFLAGS_KEY : 0; - } else { *size = 0; #if !CONFIG_REALTIME_ONLY @@ -5249,7 +7332,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // adjust frame rates based on timestamps given if (cm->show_frame) { - adjust_frame_rate(cpi, source); + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.duration[cpi->svc.spatial_layer_id] > 0) + vp9_svc_adjust_frame_rate(cpi); + else + adjust_frame_rate(cpi, source); } if (is_one_pass_cbr_svc(cpi)) { @@ -5268,24 +7355,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - if (!cpi->use_svc && cpi->multi_arf_allowed) { - if (cm->frame_type == KEY_FRAME) { - init_buffer_indices(cpi); - } else if (oxcf->pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; - } - } - // Start with a 0 size frame. *size = 0; cpi->frame_flags = *frame_flags; #if !CONFIG_REALTIME_ONLY - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { set_frame_size(cpi); @@ -5297,9 +7373,39 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, level_rc_framerate(cpi, arf_src_index); if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { - for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + } + + if (cpi->kmeans_data_arr_alloc == 0) { + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_MULTITHREAD + pthread_mutex_init(&cpi->kmeans_mutex, NULL); +#endif + CHECK_MEM_ERROR( + cm, cpi->kmeans_data_arr, + vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); + cpi->kmeans_data_stride = mi_cols; + cpi->kmeans_data_arr_alloc = 1; + } + + if (gf_group_index == 1 && + cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && + cpi->sf.enable_tpl_model) { + init_tpl_buffer(cpi); + vp9_estimate_qp_gop(cpi); + setup_tpl_stats(cpi); } +#if CONFIG_BITSTREAM_DEBUG + assert(cpi->oxcf.max_threads == 0 && + "bitstream debug tool does not support multithreading"); + bitstream_queue_record_write(); +#endif +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG + bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame); +#endif + cpi->td.mb.fp_src_pred = 0; #if CONFIG_REALTIME_ONLY if (cpi->use_svc) { @@ -5309,7 +7415,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, Pass0Encode(cpi, size, dest, frame_flags); } #else // !CONFIG_REALTIME_ONLY - if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + if (oxcf->pass == 1 && !cpi->use_svc) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) @@ -5324,7 +7430,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif // CONFIG_VP9_HIGHBITDEPTH cpi->td.mb.inv_txfm_add = lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); - } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + } else if (oxcf->pass == 2 && !cpi->use_svc) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -5334,6 +7440,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_REALTIME_ONLY + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -5416,7 +7524,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, ppflags.post_proc_flag = VP9D_DEBLOCK; ppflags.deblocking_level = 0; // not used in vp9_post_proc_frame() ppflags.noise_level = 0; // not used in vp9_post_proc_frame() - vp9_post_proc_frame(cm, pp, &ppflags); + vp9_post_proc_frame(cm, pp, &ppflags, + cpi->un_scaled_source->y_width); } #endif vpx_clear_system_state(); @@ -5462,11 +7571,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->summedp_quality += frame_ssim2 * weight; cpi->summedp_weights += weight; #if 0 - { + if (cm->show_frame) { FILE *f = fopen("q_used.stt", "a"); fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, - frame_psnr2, frame_ssim2); + cpi->common.current_video_frame, psnr2.psnr[1], + psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2); fclose(f); } #endif @@ -5525,21 +7634,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_two_pass_svc(cpi)) { - if (cpi->svc.encode_empty_frame_state == ENCODING) { - cpi->svc.encode_empty_frame_state = ENCODED; - cpi->svc.encode_intra_empty_frame = 0; - } - - if (cm->show_frame) { - ++cpi->svc.spatial_layer_to_encode; - if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) - cpi->svc.spatial_layer_to_encode = 0; - - // May need the empty frame after an visible frame. - cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE; - } - } else if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_cbr_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) @@ -5563,7 +7658,7 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } else { int ret; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(cm, dest, flags); + ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width); #else if (cm->frame_to_show) { *dest = *cm->frame_to_show; diff --git a/libs/libvpx/vp9/encoder/vp9_encoder.h b/libs/libvpx/vp9/encoder/vp9_encoder.h index d723d93cbc1..f157fdfc5e8 100644 --- a/libs/libvpx/vp9/encoder/vp9_encoder.h +++ b/libs/libvpx/vp9/encoder/vp9_encoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ENCODER_H_ -#define VP9_ENCODER_VP9_ENCODER_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_ +#define VPX_VP9_ENCODER_VP9_ENCODER_H_ #include @@ -29,7 +29,9 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" +#if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" +#endif #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" @@ -119,9 +121,11 @@ typedef enum { COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, EQUATOR360_AQ = 4, + PERCEPTUAL_AQ = 5, + PSNR_AQ = 6, // AQ based on lookahead temporal // variance (only valid for altref frames) - LOOKAHEAD_AQ = 5, + LOOKAHEAD_AQ = 7, AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; @@ -248,6 +252,8 @@ typedef struct VP9EncoderConfig { int tile_columns; int tile_rows; + int enable_tpl_model; + int max_threads; unsigned int target_level; @@ -278,11 +284,102 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +typedef struct TplDepStats { + int64_t intra_cost; + int64_t inter_cost; + int64_t mc_flow; + int64_t mc_dep_cost; + int64_t mc_ref_cost; + + int ref_frame_index; + int_mv mv; + +#if CONFIG_NON_GREEDY_MV + int ready[3]; + int64_t sse_arr[3]; + double feature_score; +#endif +} TplDepStats; + +#if CONFIG_NON_GREEDY_MV +#define SQUARE_BLOCK_SIZES 4 + +#define ZERO_MV_MODE 0 +#define NEW_MV_MODE 1 +#define NEAREST_MV_MODE 2 +#define NEAR_MV_MODE 3 +#define MAX_MV_MODE 4 +#endif + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_qindex; +#if CONFIG_NON_GREEDY_MV + int lambda; + int_mv *pyramid_mv_arr[3][SQUARE_BLOCK_SIZES]; + int *mv_mode_arr[3]; + double *rd_diff_arr[3]; +#endif +} TplDepFrame; + +#if CONFIG_NON_GREEDY_MV +static INLINE int get_square_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4) { + return 0; + } + if (bsize == BLOCK_8X8) { + return 1; + } + if (bsize == BLOCK_16X16) { + return 2; + } + if (bsize == BLOCK_32X32) { + return 3; + } + assert(0 && "ERROR: non-square block size"); + return -1; +} + +static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) { + if (square_block_idx == 0) { + return BLOCK_4X4; + } + if (square_block_idx == 1) { + return BLOCK_8X8; + } + if (square_block_idx == 2) { + return BLOCK_16X16; + } + if (square_block_idx == 3) { + return BLOCK_32X32; + } + assert(0 && "ERROR: invalid square_block_idx"); + return BLOCK_INVALID; +} + +static INLINE int_mv *get_pyramid_mv(const TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + return &tpl_frame->pyramid_mv_arr[rf_idx][get_square_block_idx(bsize)] + [mi_row * tpl_frame->stride + mi_col]; +} +#endif + +#define TPL_DEP_COST_SCALE_LOG2 4 + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int mode_map[BLOCK_SIZES][MAX_MODES]; +#if CONFIG_CONSISTENT_RECODE + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; +#endif + int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; @@ -450,6 +547,31 @@ typedef struct ARNRFilterData { struct scale_factors sf; } ARNRFilterData; +typedef struct EncFrameBuf { + int mem_valid; + int released; + YV12_BUFFER_CONFIG frame; +} EncFrameBuf; + +// Maximum operating frame buffer size needed for a GOP using ARF reference. +#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) +#if CONFIG_NON_GREEDY_MV +typedef struct FEATURE_SCORE_LOC { + int visited; + double feature_score; + int mi_row; + int mi_col; +} FEATURE_SCORE_LOC; +#endif + +#define MAX_KMEANS_GROUPS 8 + +typedef struct KMEANS_DATA { + double value; + int pos; + int group_idx; +} KMEANS_DATA; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -473,17 +595,43 @@ typedef struct VP9_COMP { #endif YV12_BUFFER_CONFIG *raw_source_frame; + BLOCK_SIZE tpl_bsize; + TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; + EncFrameBuf enc_frame_buf[REF_FRAMES]; +#if CONFIG_MULTITHREAD + pthread_mutex_t kmeans_mutex; +#endif + int kmeans_data_arr_alloc; + KMEANS_DATA *kmeans_data_arr; + int kmeans_data_size; + int kmeans_data_stride; + double kmeans_ctr_ls[MAX_KMEANS_GROUPS]; + double kmeans_boundary_ls[MAX_KMEANS_GROUPS]; + int kmeans_count_ls[MAX_KMEANS_GROUPS]; + int kmeans_ctr_num; +#if CONFIG_NON_GREEDY_MV + int tpl_ready; + int feature_score_loc_alloc; + FEATURE_SCORE_LOC *feature_score_loc_arr; + FEATURE_SCORE_LOC **feature_score_loc_sort; + FEATURE_SCORE_LOC **feature_score_loc_heap; + int_mv *select_mv_arr; +#endif + TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. // For a still frame, this flag is set to 1 to skip partition search. 
int partition_search_skippable_frame; - int scaled_ref_idx[MAX_REF_FRAMES]; + int scaled_ref_idx[REFS_PER_FRAME]; int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; + int ref_fb_idx[REF_FRAMES]; + int refresh_last_frame; int refresh_golden_frame; int refresh_alt_ref_frame; @@ -496,10 +644,15 @@ typedef struct VP9_COMP { int ext_refresh_frame_context_pending; int ext_refresh_frame_context; + int64_t norm_wiener_variance; + int64_t *mb_wiener_variance; + int mb_wiener_var_rows; + int mb_wiener_var_cols; + double *mi_ssim_rdmult_scaling_factors; + YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tile_tok[4][1 << 6]; - uint32_t tok_count[4][1 << 6]; TOKENLIST *tplist[4][1 << 6]; // Ambient reconstruction err target for force key frames @@ -521,7 +674,7 @@ typedef struct VP9_COMP { RATE_CONTROL rc; double framerate; - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]; + int interp_filter_selected[REF_FRAMES][SWITCHABLE]; struct vpx_codec_pkt_list *output_pkt_list; @@ -555,6 +708,7 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; + struct scale_factors me_sf; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -645,10 +799,8 @@ typedef struct VP9_COMP { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; - - int multi_arf_allowed; - int multi_arf_enabled; - int multi_arf_last_grp_enabled; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; @@ -723,6 +875,9 @@ typedef struct VP9_COMP { uint8_t *count_arf_frame_usage; uint8_t *count_lastgolden_frame_usage; + + int multi_layer_arf; + vpx_roi_map_t roi; } VP9_COMP; void vp9_initialize_enc(void); @@ -737,7 +892,7 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); // frame is made and not just a copy of the pointer.. 
int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); + int64_t end_time); int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, @@ -758,9 +913,11 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, int vp9_update_entropy(VP9_COMP *cpi, int update); -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); -int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); @@ -770,6 +927,27 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, void vp9_set_svc(VP9_COMP *cpi, int use_svc); +static INLINE int stack_pop(int *stack, int stack_size) { + int idx; + const int r = stack[0]; + for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx]; + + return r; +} + +static INLINE int stack_top(const int *stack) { return stack[0]; } + +static INLINE void stack_push(int *stack, int new_item, int stack_size) { + int idx; + for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1]; + stack[0] = new_item; +} + +static INLINE void stack_init(int *stack, int length) { + int idx; + for (idx = 0; idx < length; ++idx) stack[idx] = -1; +} + int vp9_get_quantizer(struct VP9_COMP *cpi); static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { @@ -795,9 +973,13 @@ static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi, return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; } +static INLINE RefCntBuffer *get_ref_cnt_buffer(VP9_COMMON *cm, int fb_idx) { + return fb_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[fb_idx] : NULL; +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( - VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMP *const cpi, MV_REFERENCE_FRAME ref_frame) { + const VP9_COMMON *const cm = &cpi->common; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); return buf_idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[buf_idx].buf : NULL; @@ -858,19 +1040,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { - return cpi->use_svc && cpi->oxcf.pass != 0; -} - static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } #if CONFIG_VP9_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && - cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); } #endif @@ -878,12 +1055,10 @@ static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && - (cpi->oxcf.enable_auto_arf && - (!is_two_pass_svc(cpi) || - cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); + cpi->oxcf.enable_auto_arf; } -static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, +static INLINE void set_ref_ptrs(const VP9_COMMON *const cm, MACROBLOCKD *xd, MV_REFERENCE_FRAME ref0, MV_REFERENCE_FRAME ref1) { xd->block_refs[0] = @@ -938,6 +1113,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]); + void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_set_row_mt(VP9_COMP *cpi); @@ -948,4 +1127,4 @@ void vp9_set_row_mt(VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODER_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODER_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.c b/libs/libvpx/vp9/encoder/vp9_ethread.c index 0bd2e21451a..e7f8a537d47 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.c +++ b/libs/libvpx/vp9/encoder/vp9_ethread.c @@ -270,19 +270,19 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, - vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows)); - if (row_mt_sync->mutex_) { + CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); + if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + pthread_mutex_init(&row_mt_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond_, - vpx_malloc(sizeof(*row_mt_sync->cond_) * rows)); - if (row_mt_sync->cond_) { + CHECK_MEM_ERROR(cm, row_mt_sync->cond, + vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); + if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&row_mt_sync->cond_[i], NULL); + pthread_cond_init(&row_mt_sync->cond[i], NULL); } } } @@ -301,17 +301,17 @@ void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { #if CONFIG_MULTITHREAD int i; - if (row_mt_sync->mutex_ != NULL) { + if (row_mt_sync->mutex != NULL) { for (i = 0; i < row_mt_sync->rows; ++i) { - pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + pthread_mutex_destroy(&row_mt_sync->mutex[i]); } - vpx_free(row_mt_sync->mutex_); + vpx_free(row_mt_sync->mutex); } - if (row_mt_sync->cond_ != NULL) { + if (row_mt_sync->cond != NULL) { for (i = 0; i < 
row_mt_sync->rows; ++i) { - pthread_cond_destroy(&row_mt_sync->cond_[i]); + pthread_cond_destroy(&row_mt_sync->cond[i]); } - vpx_free(row_mt_sync->cond_); + vpx_free(row_mt_sync->cond); } #endif // CONFIG_MULTITHREAD vpx_free(row_mt_sync->cur_col); @@ -327,11 +327,11 @@ void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { const int nsync = row_mt_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1]; pthread_mutex_lock(mutex); while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { - pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -365,12 +365,12 @@ void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, } if (sig) { - pthread_mutex_lock(&row_mt_sync->mutex_[r]); + pthread_mutex_lock(&row_mt_sync->mutex[r]); row_mt_sync->cur_col[r] = cur; - pthread_cond_signal(&row_mt_sync->cond_[r]); - pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + pthread_cond_signal(&row_mt_sync->cond[r]); + pthread_mutex_unlock(&row_mt_sync->mutex[r]); } #else (void)row_mt_sync; @@ -390,8 +390,9 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, } #if !CONFIG_REALTIME_ONLY -static int first_pass_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int first_pass_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -470,8 +471,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt, + num_workers); first_tile_col = &cpi->tile_data[0]; for (i = 1; i < tile_cols; i++) { @@ -480,8 +481,9 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) { } } -static int temporal_filter_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int temporal_filter_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -508,8 +510,8 @@ static int temporal_filter_worker_hook(EncWorkerData *const thread_data, tile_col = proc_job->tile_col_id; tile_row = proc_job->tile_row_id; this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; - mb_col_start = (this_tile->tile_info.mi_col_start) >> 1; - mb_col_end = (this_tile->tile_info.mi_col_end + 1) >> 1; + mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT; + mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT; mb_row = proc_job->vert_unit_row_num; vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, @@ -553,13 +555,14 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt, + num_workers); } #endif // !CONFIG_REALTIME_ONLY -static int enc_row_mt_worker_hook(EncWorkerData *const 
thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int enc_row_mt_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -648,8 +651,8 @@ void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt, + num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; diff --git a/libs/libvpx/vp9/encoder/vp9_ethread.h b/libs/libvpx/vp9/encoder/vp9_ethread.h index a396e621d7d..cda0293bcf0 100644 --- a/libs/libvpx/vp9/encoder/vp9_ethread.h +++ b/libs/libvpx/vp9/encoder/vp9_ethread.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ETHREAD_H_ -#define VP9_ENCODER_VP9_ETHREAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#define VPX_VP9_ENCODER_VP9_ETHREAD_H_ #ifdef __cplusplus extern "C" { @@ -33,8 +33,8 @@ typedef struct EncWorkerData { // Encoder row synchronization typedef struct VP9RowMTSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the sb/mb block index in each row. int *cur_col; @@ -69,4 +69,4 @@ void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ETHREAD_H_ +#endif // VPX_VP9_ENCODER_VP9_ETHREAD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_extend.h b/libs/libvpx/vp9/encoder/vp9_extend.h index c0dd7571596..4ba7fc95e3d 100644 --- a/libs/libvpx/vp9/encoder/vp9_extend.h +++ b/libs/libvpx/vp9/encoder/vp9_extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_EXTEND_H_ -#define VP9_ENCODER_VP9_EXTEND_H_ +#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_ +#define VPX_VP9_ENCODER_VP9_EXTEND_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -28,4 +28,4 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_EXTEND_H_ +#endif // VPX_VP9_ENCODER_VP9_EXTEND_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.c b/libs/libvpx/vp9/encoder/vp9_firstpass.c index fb6b132a5b4..e0acf563b83 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.c @@ -44,15 +44,11 @@ #define COMPLEXITY_STATS_OUTPUT 0 #define FIRST_PASS_Q 10.0 -#define INTRA_MODE_PENALTY 1024 -#define MIN_ARF_GF_BOOST 240 +#define NORMAL_BOOST 100 +#define MIN_ARF_GF_BOOST 250 #define MIN_DECAY_FACTOR 0.01 #define NEW_MV_MODE_PENALTY 32 #define DARK_THRESH 64 -#define DEFAULT_GRP_WEIGHT 1.0 -#define RC_FACTOR_MIN 0.75 -#define RC_FACTOR_MAX 1.75 -#define SECTION_NOISE_DEF 250.0 #define LOW_I_THRESH 24000 #define NCOUNT_INTRA_THRESH 8192 @@ -105,7 +101,7 @@ static void output_stats(FIRSTPASS_STATS *stats, fprintf(fpfile, "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf" "%12.4lf" "\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, @@ -316,16 +312,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (is_two_pass_svc(cpi)) { - int i; - for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { - output_stats(&cpi->svc.layer_context[i].twopass.total_stats, - cpi->output_pkt_list); - } - } else { - output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); - } - + output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); vpx_free(cpi->twopass.fp_mb_float_stats); cpi->twopass.fp_mb_float_stats = NULL; } @@ -503,11 +490,10 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; case VPX_BITS_10: ret_val = thresh << 4; break; - case VPX_BITS_12: ret_val = thresh << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; } } #else @@ -529,11 +515,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; - case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; } } #else @@ -550,11 +535,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; - case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; } } #else @@ -564,7 +548,7 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { } #define FP_DN_THRESH 8 -#define FP_MAX_DN_THRESH 16 
+#define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 // Baseline Kernal weights for first pass noise metric @@ -731,9 +715,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, - fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -825,6 +808,8 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, fp_acc_data->image_data_start_row); } +#define NZ_MOTION_PENALTY 128 +#define INTRA_MODE_PENALTY 1024 void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, FIRSTPASS_DATA *fp_acc_data, TileDataEnc *tile_data, MV *best_ref_mv, @@ -834,6 +819,8 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile = tile_data->tile_info; + const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1); + const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1); struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; @@ -850,40 +837,19 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; MODE_INFO mi_above, mi_left; double mb_intra_factor; double mb_brightness_factor; double mb_neutral_count; + int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); - - if (lc != NULL) { - // Use either last frame or alt frame for motion search. - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - } - } else { - gld_yv12 = NULL; - } - } + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); - xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + - (tile.mi_col_start >> 1); - xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1); + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start; + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start; for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][1]; @@ -897,10 +863,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); // Reset above block coeffs. 
- recon_yoffset = - (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16; - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) + - (tile.mi_col_start >> 1) * uv_mb_height; + recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16; + recon_uvoffset = + (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height; // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. @@ -908,8 +873,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->mv_limits.row_max = ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1); - ++mb_col, c++) { + for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) { int this_error; int this_intra_error; const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); @@ -955,7 +919,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->skip_encode = 0; x->fp_src_pred = 0; // Do intra prediction based on source pixels for tile boundaries - if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) { + if (mb_col == mb_col_start && mb_col != 0) { xd->left_mi = &mi_left; x->fp_src_pred = 1; } @@ -1002,12 +966,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, switch (cm->bit_depth) { case VPX_BITS_8: break; case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1073,30 +1035,34 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; + if (cm->current_video_frame > 0) { + int tmp_err, motion_error, this_motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; struct buf_2d unscaled_last_source_buf_2d; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + this_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8); } else { motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; } #else motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; #endif // CONFIG_VP9_HIGHBITDEPTH // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. + // reconstructed frame if this error is very small. 
unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer + recon_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; @@ -1113,12 +1079,20 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, &unscaled_last_source_buf_2d); #endif // CONFIG_VP9_HIGHBITDEPTH - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { + if (raw_motion_error > NZ_MOTION_PENALTY) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + this_motion_error = + vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0); + // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. if (!is_zero_mv(best_ref_mv)) { @@ -1128,13 +1102,13 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (tmp_err < motion_error) { motion_error = tmp_err; mv = tmp_mv; + this_motion_error = + vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0); } } // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { + if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { // Assume 0,0 motion with no mv overhead. int gf_motion_error; @@ -1280,7 +1254,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } } #endif - // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1306,17 +1279,16 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, else if (mv.col < 0) --(fp_acc_data->sum_in_vectors); } - fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + } + if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error + } else { fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; } } else { // Intra < inter error - int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); - if (motion_error < scaled_low_intra_thresh) { + if (this_motion_error < scaled_low_intra_thresh) { fp_acc_data->intra_count_low += 1.0; } else { fp_acc_data->intra_count_high += 1.0; @@ -1335,7 +1307,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, recon_uvoffset += uv_mb_height; // Accumulate row level stats to the corresponding tile stats - if (cpi->row_mt && mb_col == (tile.mi_col_end >> 1) - 1) + if (cpi->row_mt && mb_col == mb_col_end - 1) accumulate_fp_mb_row_stat(tile_data, fp_acc_data); (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, @@ -1372,9 +1344,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? 
&cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; BufferPool *const pool = cm->buffer_pool; FIRSTPASS_DATA fp_temp_data; @@ -1386,7 +1355,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -1397,50 +1366,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - if (lc != NULL) { - twopass = &lc->twopass; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - cpi->ref_frame_flags = VP9_LAST_FLAG; - - if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < - REF_FRAMES) { - cpi->gld_fb_idx = - cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; - cpi->ref_frame_flags |= VP9_GOLD_FLAG; - cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); - } else { - cpi->refresh_golden_frame = 0; - } - - if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0; - - vp9_scale_references(cpi); - - // Use either last frame or alt frame for motion search. - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - } - } else { - gld_yv12 = NULL; - } - - set_ref_ptrs(cm, xd, - (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE, - (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE); - - cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0, EIGHTTAP, 0); - } - vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); vp9_setup_src_planes(x, cpi->Source, 0, 0); @@ -1524,18 +1449,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vpx_extend_frame_borders(new_yv12); - if (lc != NULL) { - vp9_update_reference_frames(cpi); - } else { - // The frame we just compressed now becomes the last frame. - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], - cm->new_fb_idx); - } + // The frame we just compressed now becomes the last frame. + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); // Special case for the first frame. Copy into the GF buffer as a second // reference. 
- if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX && - lc == NULL) { + if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->ref_frame_map[cpi->lst_fb_idx]); } @@ -1560,9 +1480,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } -static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { - 0.65, 0.70, 0.75, 0.85, 0.90, 0.90, 0.90, 1.00, 1.25 -}; +static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, + 0.85, 0.90, 0.90, + 0.90, 1.00, 1.25 }; static double calc_correction_factor(double err_per_mb, double err_divisor, int q) { @@ -1583,7 +1503,26 @@ static double calc_correction_factor(double err_per_mb, double err_divisor, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -#define ERR_DIVISOR 115.0 +static double wq_err_divisor(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 115.0; + } else if (screen_area < 1280 * 720) { + return 125.0; + } else if (screen_area <= 1920 * 1080) { + return 130.0; + } else if (screen_area < 3840 * 2160) { + return 150.0; + } + + // Fall through to here only for 4K and above. + return 200.0; +} + #define NOISE_FACTOR_MIN 0.9 #define NOISE_FACTOR_MAX 1.1 static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, @@ -1643,7 +1582,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = - calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q); + calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, @@ -1690,14 +1629,9 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, } void vp9_init_second_pass(VP9_COMP *cpi) { - SVC *const svc = &cpi->svc; VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = - is_two_pass_svc ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; double frame_rate; FIRSTPASS_STATS *stats; @@ -1774,18 +1708,9 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // encoded in the second pass is a guess. However, the sum duration is not. // It is calculated based on the actual durations of all frames from the // first pass. - - if (is_two_pass_svc) { - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - vp9_new_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + vp9_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); // This variable monitors how far behind the second ref update is lagging. 
twopass->sr_update_lag = 1; @@ -1913,10 +1838,12 @@ static int detect_flash(const TWO_PASS *twopass, int offset) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // compared to pcnt_inter. + // usage or a second ref coded error notably lower than the last + // frame coded error. return next_frame != NULL && - next_frame->pcnt_second_ref > next_frame->pcnt_inter && - next_frame->pcnt_second_ref >= 0.5; + ((next_frame->sr_coded_error < next_frame->coded_error) || + ((next_frame->pcnt_second_ref > next_frame->pcnt_inter) && + (next_frame->pcnt_second_ref >= 0.5))); } // Update the motion related elements to the GF arf boost calculation. @@ -1971,7 +1898,20 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); } -#define KF_BASELINE_ERR_PER_MB 12500.0 +static double kf_err_per_mb(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area < 1280 * 720) { + return 2000.0; + } else if (screen_area < 1920 * 1080) { + return 500.0; + } + return 250.0; +} + static double calc_kf_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, @@ -1984,7 +1924,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) / + frame_boost = (kf_err_per_mb(cpi) * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. @@ -1997,8 +1937,11 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // Q correction and scalling - frame_boost = frame_boost * boost_q_correction; + // Q correction and scaling + // The 40.0 value here is an experimentally derived baseline minimum. + // This value is in line with the minimum per frame boost in the alt_ref + // boost calculation. + frame_boost = ((frame_boost + 40.0) * boost_q_correction); return VPXMIN(frame_boost, max_boost * boost_q_correction); } @@ -2105,10 +2048,15 @@ static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin, // Calculate the total bits to allocate in this GF/ARF group. static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, double gf_group_err) { + VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const TWO_PASS *const twopass = &cpi->twopass; const int max_bits = frame_max_bits(rc, &cpi->oxcf); int64_t total_group_bits; + const int is_key_frame = frame_is_intra_only(cm); + const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; // Calculate the bits to be allocated to the group as a whole. if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) { @@ -2126,8 +2074,8 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi, : total_group_bits; // Clip based on user supplied data rate variability limit.
- if (total_group_bits > (int64_t)max_bits * rc->baseline_gf_interval) - total_group_bits = (int64_t)max_bits * rc->baseline_gf_interval; + if (total_group_bits > (int64_t)max_bits * gop_frames) + total_group_bits = (int64_t)max_bits * gop_frames; return total_group_bits; } @@ -2140,7 +2088,7 @@ static int calculate_boost_bits(int frame_count, int boost, // return 0 for invalid inputs (could arise e.g. through rounding errors) if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; - allocation_chunks = (frame_count * 100) + boost; + allocation_chunks = (frame_count * NORMAL_BOOST) + boost; // Prevent overflow. if (boost > 1023) { @@ -2154,18 +2102,6 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. -static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; -} - // Used in corpus vbr: Calculates the total normalized group complexity score // for a given number of frames starting at the current position in the stats // file. @@ -2185,11 +2121,129 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, ++s; ++i; } - assert(i == frame_count); return score_total; } +static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group, + int *index_counter, int depth, int start, int end) { + TWO_PASS *twopass = &cpi->twopass; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FIRSTPASS_STATS fpf_frame; + const int mid = (start + end + 1) >> 1; + const int min_frame_interval = 2; + int idx; + + // Process regular P frames + if ((end - start < min_frame_interval) || + (depth > gf_group->allowed_max_layer_depth)) { + for (idx = start; idx <= end; ++idx) { + gf_group->update_type[*index_counter] = LF_UPDATE; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = idx; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + gf_group->gfu_boost[*index_counter] = NORMAL_BOOST; + ++(*index_counter); + } + gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth); + return; + } + + assert(abs(mid - start) >= 1 && abs(mid - end) >= 1); + + // Process ARF frame + gf_group->layer_depth[*index_counter] = depth; + gf_group->update_type[*index_counter] = ARF_UPDATE; + gf_group->arf_src_offset[*index_counter] = mid - start; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = GF_ARF_LOW; + + for (idx = 0; idx <= mid; ++idx) + if (EOF == input_stats(twopass, &fpf_frame)) break; + + gf_group->gfu_boost[*index_counter] = + VPXMAX(MIN_ARF_GF_BOOST, + calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth); + + reset_fpf_position(twopass, start_pos); + + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1); + + gf_group->update_type[*index_counter] = USE_BUF_FRAME; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end); 
+} + +static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group, + int frame_index, + int source_alt_ref_active) { + if (source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; + gf_group->gfu_boost[frame_index] = NORMAL_BOOST; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 0; + } +} + +static void define_gf_group_structure(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frame_index = 0; + int key_frame = cpi->common.frame_type == KEY_FRAME; + int layer_depth = 1; + int gop_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + + gf_group->frame_start = cpi->common.current_video_frame; + gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval; + gf_group->max_layer_depth = 0; + gf_group->allowed_max_layer_depth = 0; + + // For key frames the frame target rate is already set and it + // is also the golden frame. + // === [frame_index == 0] === + if (!key_frame) + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active); + + ++frame_index; + + // === [frame_index == 1] === + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = layer_depth; + gf_group->arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + gf_group->max_layer_depth = 1; + ++frame_index; + ++layer_depth; + gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf; + } + + find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames); + + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending); + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + + // Set the frame ops number. + gf_group->gf_group_size = frame_index; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -2198,17 +2252,12 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS frame_stats; int i; - int frame_index = 1; + int frame_index = 0; int target_frame_size; int key_frame; const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; - int mid_boost_bits = 0; int mid_frame_idx; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; - int alt_frame_index = frame_index; - int has_temporal_layers = - is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; int last_frame_reduction = 0; @@ -2216,81 +2265,97 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, double tot_norm_frame_score = 1.0; double this_frame_score = 1.0; - // Only encode alt reference frame in temporal base layer. 
- if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; - - key_frame = - cpi->common.frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi); + // Define the GF structure and specify + int gop_frames = gf_group->gf_group_size; - get_arf_buffer_indices(arf_buffer_indices); + key_frame = cpi->common.frame_type == KEY_FRAME; // For key frames the frame target rate is already set and it // is also the golden frame. + // === [frame_index == 0] === if (!key_frame) { - if (rc->source_alt_ref_active) { - gf_group->update_type[0] = OVERLAY_UPDATE; - gf_group->rf_level[0] = INTER_NORMAL; - gf_group->bit_allocation[0] = 0; - } else { - gf_group->update_type[0] = GF_UPDATE; - gf_group->rf_level[0] = GF_ARF_STD; - gf_group->bit_allocation[0] = gf_arf_bits; - } - gf_group->arf_update_idx[0] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; - - // Step over the golden frame / overlay frame - if (EOF == input_stats(twopass, &frame_stats)) return; + gf_group->bit_allocation[frame_index] = + rc->source_alt_ref_active ? 0 : gf_arf_bits; } // Deduct the boost bits for arf (or gf if it is not a key frame) // from the group total. if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + ++frame_index; + + // === [frame_index == 1] === // Store the bits to spend on the ARF if there is one. if (rc->source_alt_ref_pending) { - gf_group->update_type[alt_frame_index] = ARF_UPDATE; - gf_group->rf_level[alt_frame_index] = GF_ARF_STD; - gf_group->bit_allocation[alt_frame_index] = gf_arf_bits; - - if (has_temporal_layers) - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - - cpi->svc.number_temporal_layers); - else - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - 1); - - gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[alt_frame_index] = - arf_buffer_indices[cpi->multi_arf_last_grp_enabled && - rc->source_alt_ref_active]; - if (!has_temporal_layers) ++frame_index; - - if (cpi->multi_arf_enabled) { - // Set aside a slot for a level 1 arf. - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = - (unsigned char)((rc->baseline_gf_interval >> 1) - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - ++frame_index; - } - } + gf_group->bit_allocation[frame_index] = gf_arf_bits; - // Note index of the first normal inter frame int eh group (not gf kf arf) - gf_group->first_inter_index = frame_index; + ++frame_index; + } // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; - normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); + normal_frames = (rc->baseline_gf_interval - 1); if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); else normal_frame_bits = (int)total_group_bits; + gf_group->gfu_boost[1] = rc->gfu_boost; + + if (cpi->multi_layer_arf) { + int idx; + int arf_depth_bits[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_count[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_boost[MAX_ARF_LAYERS] = { 0 }; + int total_arfs = 1; // Account for the base layer ARF. 
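The hunk continues below: per-depth ARF boosts are accumulated, then each layer depth is carved a share of the group bits via calculate_boost_bits(), whose weighting was defined earlier in this patch (allocation_chunks = frame_count * NORMAL_BOOST + boost). A minimal standalone C sketch of that proportional split, with illustrative numbers and a hypothetical helper name (the real function additionally guards against overflow when boost exceeds 1023):

    #include <stdio.h>

    /* Every frame carries NORMAL_BOOST (100) weight chunks; the boosted
       frame or layer additionally receives `boost` chunks of the total. */
    static long boost_bits(int frame_count, int boost, long total_group_bits) {
      long allocation_chunks;
      if (!boost || total_group_bits <= 0 || frame_count < 0) return 0;
      allocation_chunks = (long)frame_count * 100 + boost;
      return total_group_bits * boost / allocation_chunks;
    }

    int main(void) {
      /* Illustrative GOP: 16 frames, boost 400, 1000000 group bits ->
         400 / (16 * 100 + 400) of the total = 200000 bits. */
      printf("%ld\n", boost_bits(16, 400, 1000000));
      return 0;
    }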
+ + for (idx = 0; idx < gop_frames; ++idx) { + if (gf_group->update_type[idx] == ARF_UPDATE) { + arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx]; + ++arf_depth_count[gf_group->layer_depth[idx]]; + } + } + + for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) { + if (arf_depth_boost[idx] == 0) break; + arf_depth_bits[idx] = calculate_boost_bits( + rc->baseline_gf_interval - total_arfs - arf_depth_count[idx], + arf_depth_boost[idx], total_group_bits); + + total_group_bits -= arf_depth_bits[idx]; + total_arfs += arf_depth_count[idx]; + } + + // offset the base layer arf + normal_frames -= (total_arfs - 1); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + target_frame_size = normal_frame_bits; + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + // The first layer ARF has its bit allocation assigned. + for (idx = frame_index; idx < gop_frames; ++idx) { + switch (gf_group->update_type[idx]) { + case ARF_UPDATE: + gf_group->bit_allocation[idx] = + (int)(((int64_t)arf_depth_bits[gf_group->layer_depth[idx]] * + gf_group->gfu_boost[idx]) / + arf_depth_boost[gf_group->layer_depth[idx]]); + break; + case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break; + default: gf_group->bit_allocation[idx] = target_frame_size; break; + } + } + gf_group->bit_allocation[idx] = 0; + + return; + } + if (oxcf->vbr_corpus_complexity) { av_score = get_distribution_av_err(cpi, twopass); tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); @@ -2298,13 +2363,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Allocate bits to the other frames in the group. for (i = 0; i < normal_frames; ++i) { - int arf_idx = 0; if (EOF == input_stats(twopass, &frame_stats)) break; - - if (has_temporal_layers && frame_index == alt_frame_index) { - ++frame_index; - } - if (oxcf->vbr_corpus_complexity) { this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, &frame_stats, av_score); @@ -2318,21 +2377,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, target_frame_size -= last_frame_reduction; } - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - mid_boost_bits += (target_frame_size >> 4); - target_frame_size -= (target_frame_size >> 4); - - if (frame_index <= mid_frame_idx) arf_idx = 1; - } - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; - target_frame_size = clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); - gf_group->update_type[frame_index] = LF_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->bit_allocation[frame_index] = target_frame_size; ++frame_index; } @@ -2344,27 +2391,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to // vp9_rc_get_second_pass_params() the data will be undefined. - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - - // Final setup for second arf and its overlay. 
- if (cpi->multi_arf_enabled) { - gf_group->bit_allocation[2] = - gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; - gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; - gf_group->bit_allocation[mid_frame_idx] = 0; - } - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - // Note whether multi-arf was enabled this group for next time. - cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } // Adjusts the ARNF filter for a GF group. @@ -2376,15 +2402,19 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, twopass->arnr_strength_adjustment = 0; - if ((section_zeromv < 0.10) || (section_noise <= (SECTION_NOISE_DEF * 0.75))) + if (section_noise < 150) { twopass->arnr_strength_adjustment -= 1; + if (section_noise < 75) twopass->arnr_strength_adjustment -= 1; + } else if (section_noise > 250) + twopass->arnr_strength_adjustment += 1; + if (section_zeromv > 0.50) twopass->arnr_strength_adjustment += 1; } // Analyse and define a gf/arf group. -#define ARF_DECAY_BREAKOUT 0.10 #define ARF_ABS_ZOOM_THRESH 4.0 +#define MAX_GF_BOOST 5400 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2425,6 +2455,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int gf_arf_bits; const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + int is_alt_ref_flash = 0; + + double gop_intra_factor = 1.0; + int gop_frames; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. @@ -2465,36 +2499,51 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { { int int_max_q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, cpi->common.bit_depth)); - int int_lbq = (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, - cpi->common.bit_depth)); + int q_term = (cm->current_video_frame == 0) + ? int_max_q / 32 + : (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, + cpi->common.bit_depth) / + 6); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); active_min_gf_interval = VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf); - if (cpi->multi_arf_allowed) { - active_max_gf_interval = rc->max_gf_interval; - } else { - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= - // rc->max_gf_interval + arf_active_or_kf. - if (active_max_gf_interval < active_min_gf_interval) { - active_max_gf_interval = active_min_gf_interval; - } else { - active_max_gf_interval = VPXMIN(active_max_gf_interval, - rc->max_gf_interval + arf_active_or_kf); - } + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = 11 + arf_active_or_kf + VPXMIN(5, q_term); + + // Force max GF interval to be odd. 
+ active_max_gf_interval = active_max_gf_interval | 0x01; - // Would the active max drop us out just before the near the next kf? - if ((active_max_gf_interval <= rc->frames_to_key) && - (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) - active_max_gf_interval = rc->frames_to_key / 2; + // We have: active_min_gf_interval <= + // rc->max_gf_interval + arf_active_or_kf. + if (active_max_gf_interval < active_min_gf_interval) { + active_max_gf_interval = active_min_gf_interval; + } else { + active_max_gf_interval = VPXMIN(active_max_gf_interval, + rc->max_gf_interval + arf_active_or_kf); } + + // Would the active max drop us out just before or near the next kf? + if ((active_max_gf_interval <= rc->frames_to_key) && + (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) + active_max_gf_interval = rc->frames_to_key / 2; + } + active_max_gf_interval = + VPXMAX(active_max_gf_interval, active_min_gf_interval); + + if (cpi->multi_layer_arf) { + int layers = 0; + int max_layers = VPXMIN(MAX_ARF_LAYERS, cpi->oxcf.enable_auto_arf); + + // Adapt the intra_error factor to active_max_gf_interval limit. + for (i = active_max_gf_interval; i > 0; i >>= 1) ++layers; + + layers = VPXMIN(max_layers, layers); + gop_intra_factor += (layers * 0.25); } i = 0; @@ -2523,15 +2572,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + // Monitor for static sections. + if ((rc->frames_since_key + i - 1) > 1) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } + // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, @@ -2551,18 +2602,27 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - if ( - // Break at active_max_gf_interval unless almost totally static. - ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // Break at maximum of active_max_gf_interval unless almost totally static. + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if (((i >= active_max_gf_interval) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || ( // Don't break out with a very short interval.
(i >= active_min_gf_interval) && // If possible dont break very close to a kf - ((rc->frames_to_key - i) >= rc->min_gf_interval) && + ((rc->frames_to_key - i) >= rc->min_gf_interval) && (i & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (sr_accumulator > next_frame.intra_error)))) { + (sr_accumulator > gop_intra_factor * next_frame.intra_error)))) { break; } @@ -2573,8 +2633,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { + if ((zero_motion_accumulator < 0.995) && allow_alt_ref && + (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2582,18 +2643,23 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Calculate the boost for alt ref. rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) - ? 1 - : 0; } else { - rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); + reset_fpf_position(twopass, start_pos); + rc->gfu_boost = VPXMIN(MAX_GF_BOOST, calc_arf_boost(cpi, (i - 1), 0)); rc->source_alt_ref_pending = 0; } +#define LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR 0.2 + rc->arf_active_best_quality_adjustment_factor = 1.0; + if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf) && + rc->frames_to_key <= rc->arf_active_best_quality_adjustment_window) { + rc->arf_active_best_quality_adjustment_factor = + LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR + + (1.0 - LAST_ALR_ACTIVE_BEST_QUALITY_ADJUSTMENT_FACTOR) * + (rc->frames_to_key - i) / + VPXMAX(1, (rc->arf_active_best_quality_adjustment_window - i)); + } + #ifdef AGGRESSIVE_VBR // Limit maximum boost based on interval length. rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 140); @@ -2601,53 +2667,47 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); + // Cap the ARF boost when perceptual quality AQ mode is enabled. This is + // designed to improve the perceptual quality of high value content and to + // make consistent quality across consecutive frames. It will hurt objective + // quality. + if (oxcf->aq_mode == PERCEPTUAL_AQ) + rc->gfu_boost = VPXMIN(rc->gfu_boost, MIN_ARF_GF_BOOST); - // Only encode alt reference frame in temporal base layer. 
So - // baseline_gf_interval should be multiple of a temporal layer group - // (typically the frame distance between two base layer frames) - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_gf_interval = (rc->baseline_gf_interval + count) & (~count); - int j; - for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); - gf_group_raw_error += this_frame->coded_error; - gf_group_noise += this_frame->frame_noise_energy; - gf_group_skip_pct += this_frame->intra_skip_pct; - gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; - gf_group_inter += this_frame->pcnt_inter; - gf_group_motion += this_frame->pcnt_motion; - } - rc->baseline_gf_interval = new_gf_interval; - } - - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->baseline_gf_interval = i - rc->source_alt_ref_pending; // Reset the file position. reset_fpf_position(twopass, start_pos); + if (rc->source_alt_ref_pending) + is_alt_ref_flash = detect_flash(twopass, rc->baseline_gf_interval); + // Calculate the bits to be allocated to the gf/arf group as a whole gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err); + gop_frames = + rc->baseline_gf_interval + rc->source_alt_ref_pending - arf_active_or_kf; + + // Store the average noise level measured for the group + // TODO(any): Experiment with removal of else condition (gop_frames = 0) so + // that consumption of group noise energy is based on the previous gf group + if (gop_frames > 0) + twopass->gf_group.group_noise_energy = (int)(gf_group_noise / gop_frames); + else + twopass->gf_group.group_noise_energy = 0; + // Calculate an estimate of the maxq needed for the group. // We are more aggressive about correcting for sections // where there could be significant overshoot than for easier // sections where we do not wish to risk creating an overshoot // of the allocated bit budget.
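For reference, the gop_frames accounting introduced above counts a pending ARF into the group while excluding a leading key frame or already-active ARF, and the group noise average is guarded against an empty group. A minimal sketch with hypothetical accumulator values:

#include <stdio.h>

int main(void) {
  const int baseline_gf_interval = 16;  /* hypothetical group length */
  const int source_alt_ref_pending = 1; /* an ARF belongs to this group */
  const int arf_active_or_kf = 1;       /* group opens on a kf/active arf */
  const double gf_group_noise = 3200.0; /* hypothetical accumulated noise */

  const int gop_frames =
      baseline_gf_interval + source_alt_ref_pending - arf_active_or_kf;
  const int group_noise_energy =
      (gop_frames > 0) ? (int)(gf_group_noise / gop_frames) : 0;

  printf("gop_frames=%d group_noise_energy=%d\n", gop_frames,
         group_noise_energy);
  return 0;
}

The maxq estimate that follows divides the remaining group accumulators by the same gop_frames count.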
if ((cpi->oxcf.rc_mode != VPX_Q) && (rc->baseline_gf_interval > 1)) { - const int vbr_group_bits_per_frame = - (int)(gf_group_bits / rc->baseline_gf_interval); - const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval; - const double group_av_noise = gf_group_noise / rc->baseline_gf_interval; - const double group_av_skip_pct = - gf_group_skip_pct / rc->baseline_gf_interval; - const double group_av_inactive_zone = - ((gf_group_inactive_zone_rows * 2) / - (rc->baseline_gf_interval * (double)cm->mb_rows)); + const int vbr_group_bits_per_frame = (int)(gf_group_bits / gop_frames); + const double group_av_err = gf_group_raw_error / gop_frames; + const double group_av_noise = gf_group_noise / gop_frames; + const double group_av_skip_pct = gf_group_skip_pct / gop_frames; + const double group_av_inactive_zone = ((gf_group_inactive_zone_rows * 2) / + (gop_frames * (double)cm->mb_rows)); int tmp_q = get_twopass_worst_quality( cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone), group_av_noise, vbr_group_bits_per_frame); @@ -2663,20 +2723,23 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Context Adjustment of ARNR filter strength if (rc->baseline_gf_interval > 1) { - adjust_group_arnr_filter(cpi, (gf_group_noise / rc->baseline_gf_interval), - (gf_group_inter / rc->baseline_gf_interval), - (gf_group_motion / rc->baseline_gf_interval)); + adjust_group_arnr_filter(cpi, (gf_group_noise / gop_frames), + (gf_group_inter / gop_frames), + (gf_group_motion / gop_frames)); } else { twopass->arnr_strength_adjustment = 0; } // Calculate the extra bits to be used for boosted frame(s) - gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, - gf_group_bits); + gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1), + rc->gfu_boost, gf_group_bits); // Adjust KF group bits and error remaining. twopass->kf_group_error_left -= gf_group_err; + // Decide GOP structure. + define_gf_group_structure(cpi); + // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2684,10 +2747,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { reset_fpf_position(twopass, start_pos); // Calculate a section intra ratio used in setting max loop filter. - if (cpi->common.frame_type != KEY_FRAME) { - twopass->section_intra_rating = calculate_section_intra_ratio( - start_pos, twopass->stats_in_end, rc->baseline_gf_interval); - } + twopass->section_intra_rating = calculate_section_intra_ratio( + start_pos, twopass->stats_in_end, rc->baseline_gf_interval); if (oxcf->resize_mode == RESIZE_DYNAMIC) { // Default to starting GF groups at normal frame size. @@ -2698,19 +2759,82 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; #endif + rc->preserve_arf_as_gld = rc->preserve_next_arf_as_gld; + rc->preserve_next_arf_as_gld = 0; + // If alt ref frame is flash do not set preserve_arf_as_gld + if (!is_lossless_requested(&cpi->oxcf) && !cpi->use_svc && + cpi->oxcf.aq_mode == NO_AQ && cpi->multi_layer_arf && !is_alt_ref_flash) + rc->preserve_next_arf_as_gld = 1; +} + +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// Clean slide transitions we expect a sharp single frame spike in error. +#define ERROR_SPIKE 5.0 + +// Slide show transition detection. 
+// Tests for the case where there is very low error on either side of the current frame +// but much higher just for this frame. This can help detect key frames in +// slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. +// It will not help if the transition is a fade or other multi-frame effect. +static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); +} + +// This test looks for anomalous changes in the nature of the intra signal +// related to the previous and next frame as an indicator for coding a key +// frame. This test serves to detect some additional scene cuts, +// especially in lowish motion and low contrast sections, that are missed +// by the other tests. +static int intra_step_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + double last_ii_ratio; + double this_ii_ratio; + double next_ii_ratio; + double last_pcnt_intra = 1.0 - last_frame->pcnt_inter; + double this_pcnt_intra = 1.0 - this_frame->pcnt_inter; + double next_pcnt_intra = 1.0 - next_frame->pcnt_inter; + double mod_this_intra = this_pcnt_intra + this_frame->pcnt_neutral; + + // Calculate the ii ratio for this frame, the last frame and the next frame. + last_ii_ratio = + last_frame->intra_error / DOUBLE_DIVIDE_CHECK(last_frame->coded_error); + this_ii_ratio = + this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + next_ii_ratio = + next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); + + // Return true if the intra/inter ratio for the current frame is + // low but better in the next and previous frame and the relative usage of + // intra in the current frame is markedly higher than in the last and next frame. + if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && + (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && + (this_pcnt_intra > (3 * next_pcnt_intra)) && + ((this_pcnt_intra > 0.075) || (mod_this_intra > 0.85))) { + return 1; + // Very low inter intra ratio (i.e. not much gain from inter coding), most + // blocks neutral on coding method and better inter prediction on either side + } else if ((this_ii_ratio < 1.25) && (mod_this_intra > 0.85) && + (this_ii_ratio < last_ii_ratio * 0.9) && + (this_ii_ratio < next_ii_ratio * 0.9)) { + return 1; + } else { + return 0; + } } -// Threshold for use of the lagging second reference frame. High second ref -// usage may point to a transient event like a flash or occlusion rather than -// a real scene cut. -#define SECOND_REF_USEAGE_THRESH 0.1 // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 -// Minimum ratio between the % of intra coding and inter coding in the first -// pass after discounting neutral blocks (discounting neutral blocks in this -// way helps catch scene cuts in clips with very flat areas or letter box -// format clips with image padding. -#define INTRA_VS_INTER_THRESH 2.0 +// Threshold for use of the lagging second reference frame. Scene cuts do not +// usually have a high second ref usage.
+#define SECOND_REF_USEAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -2718,12 +2842,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Maximum threshold for the relative ratio of intra error score vs best // inter error score. #define KF_II_ERR_THRESHOLD 2.5 -// In real scene cuts there is almost always a sharp change in the intra -// or inter error score. -#define ERR_CHANGE_THRESHOLD 0.4 -// For real scene cuts we expect an improvment in the intra inter error -// ratio in the next frame. -#define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 #define II_FACTOR 12.5 // Test for very low intra complexity which could cause false key frames @@ -2735,29 +2853,22 @@ static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *next_frame) { int is_viable_kf = 0; double pcnt_intra = 1.0 - this_frame->pcnt_inter; - double modified_pcnt_inter = - this_frame->pcnt_inter - this_frame->pcnt_neutral; // Does the frame satisfy the primary criteria of a key frame? // See above for an explanation of the test criteria. // If so, then examine how well it predicts subsequent frames. - if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && - (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + if (!detect_flash(twopass, -1) && !detect_flash(twopass, 0) && + (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - ((pcnt_intra > MIN_INTRA_LEVEL) && - (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + (slide_transition(this_frame, last_frame, next_frame)) || + (intra_step_transition(this_frame, last_frame, next_frame)) || + (((this_frame->coded_error > (next_frame->coded_error * 1.2)) && + (this_frame->coded_error > (last_frame->coded_error * 1.2))) && + (pcnt_intra > MIN_INTRA_LEVEL) && + ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < - KF_II_ERR_THRESHOLD) && - ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - ERR_CHANGE_THRESHOLD) || - (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - ERR_CHANGE_THRESHOLD) || - ((next_frame->intra_error / - DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > - II_IMPROVEMENT_THRESHOLD))))) { + KF_II_ERR_THRESHOLD)))) { int i; const FIRSTPASS_STATS *start_pos = twopass->stats_in; FIRSTPASS_STATS local_next_frame = *next_frame; @@ -2814,7 +2925,10 @@ static int test_candidate_kf(TWO_PASS *twopass, #define FRAMES_TO_CHECK_DECAY 8 #define MIN_KF_TOT_BOOST 300 -#define KF_BOOST_SCAN_MAX_FRAMES 32 +#define DEFAULT_SCAN_FRAMES_FOR_KF_BOOST 32 +#define MAX_SCAN_FRAMES_FOR_KF_BOOST 48 +#define MIN_SCAN_FRAMES_FOR_KF_BOOST 32 +#define KF_ABS_ZOOM_THRESH 6.0 #ifdef AGGRESSIVE_VBR #define KF_MAX_FRAME_BOOST 80.0 @@ -2835,17 +2949,27 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; + int64_t max_kf_bits; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; + double zero_motion_sum = 0.0; + double zero_motion_avg; + double motion_compensable_sum = 0.0; + double motion_compensable_avg; + int num_frames = 0; + int kf_boost_scan_frames = DEFAULT_SCAN_FRAMES_FOR_KF_BOOST; double boost_score = 0.0; 
double kf_mod_err = 0.0; + double kf_raw_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double sr_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; // Reset the GF group data structures. vp9_zero(*gf_group); @@ -2856,7 +2980,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clear the alt ref active flag and last group multi arf flags as they // can never be set for a key frame. rc->source_alt_ref_active = 0; - cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; @@ -2866,6 +2989,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0.0; // Group modified error score. + kf_raw_err = this_frame->intra_error; kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); @@ -2950,18 +3074,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->next_key_frame_forced = 0; } - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_frame_to_key = (rc->frames_to_key + count) & (~count); - int j; - for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); - } - rc->frames_to_key = new_frame_to_key; - } - // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. @@ -2998,16 +3110,46 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // how many bits to spend on it. boost_score = 0.0; + for (i = 0; i < VPXMIN(MAX_SCAN_FRAMES_FOR_KF_BOOST, (rc->frames_to_key - 1)); + ++i) { + if (EOF == input_stats(twopass, &next_frame)) break; + + zero_motion_sum += next_frame.pcnt_inter - next_frame.pcnt_motion; + motion_compensable_sum += + 1 - (double)next_frame.coded_error / next_frame.intra_error; + num_frames++; + } + + if (num_frames >= MIN_SCAN_FRAMES_FOR_KF_BOOST) { + zero_motion_avg = zero_motion_sum / num_frames; + motion_compensable_avg = motion_compensable_sum / num_frames; + kf_boost_scan_frames = (int)(VPXMAX(64 * zero_motion_avg - 16, + 160 * motion_compensable_avg - 112)); + kf_boost_scan_frames = + VPXMAX(VPXMIN(kf_boost_scan_frames, MAX_SCAN_FRAMES_FOR_KF_BOOST), + MIN_SCAN_FRAMES_FOR_KF_BOOST); + } + reset_fpf_position(twopass, start_position); + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + // The zero motion test here ensures that if we mark a kf group as static + // it is static throughout, not just for the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= kf_boost_scan_frames) || (zero_motion_accumulator >= 0.99)) { double frame_boost; double zm_factor; // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + // For the first frame in a kf group the second ref indicator is invalid.
+ if (i > 0) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } // Factor 0.75-1.25 based on how much of frame is static. zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -3021,7 +3163,15 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { KF_MAX_FRAME_BOOST * zm_factor); boost_score += frame_boost; - if (frame_boost < 25.00) break; + + // Measure of zoom. Large zoom tends to indicate reduced boost. + abs_mv_in_out_accumulator += + fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion); + + if ((frame_boost < 25.00) || + (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) || + (sr_accumulator > (kf_raw_err * 1.50))) + break; } else { break; } @@ -3033,17 +3183,30 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); // Calculate a section intra ratio used in setting max loop filter. - twopass->section_intra_rating = calculate_section_intra_ratio( + twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + // Special case for static / slide show content but don't apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = MAX_KF_TOT_BOOST; + } else { + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + } // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, twopass->kf_group_bits); + // Based on the spatial complexity, increase the bits allocated to the key frame. + kf_bits += + (int)((twopass->kf_group_bits - kf_bits) * (kf_mod_err / kf_group_err)); + max_kf_bits = + twopass->kf_group_bits - (rc->frames_to_key - 1) * FRAME_OVERHEAD_BITS; + max_kf_bits = lclamp(max_kf_bits, 0, INT_MAX); + kf_bits = VPXMIN(kf_bits, (int)max_kf_bits); twopass->kf_group_bits -= kf_bits; @@ -3064,51 +3227,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to normal-sized frame on keyframes. cpi->rc.next_frame_size_selector = UNSCALED; } -} - -// Define the reference buffers that will be updated post encode.
-static void configure_buffer_updates(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - - cpi->rc.is_src_frame_alt_ref = 0; - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; - break; - case LF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - case GF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - break; - case OVERLAY_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - cpi->rc.is_src_frame_alt_ref = 1; - break; - case ARF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - default: assert(0); break; - } - if (is_two_pass_svc(cpi)) { - if (cpi->svc.temporal_layer_id > 0) { - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - } - if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) - cpi->refresh_golden_frame = 0; - if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0; - } +#define ARF_ACTIVE_BEST_QUALITY_ADJUSTMENT_WINDOW_SIZE 64 + // TODO(ravi.chaudhary@ittiam.com): Experiment without the below min + // condition. This might be helpful for small key frame intervals. + rc->arf_active_best_quality_adjustment_window = + VPXMIN(ARF_ACTIVE_BEST_QUALITY_ADJUSTMENT_WINDOW_SIZE, rc->frames_to_key); } static int is_skippable_frame(const VP9_COMP *cpi) { @@ -3116,10 +3239,7 @@ static int is_skippable_frame(const VP9_COMP *cpi) { // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned // according to the variance - const SVC *const svc = &cpi->svc; - const TWO_PASS *const twopass = - is_two_pass_svc(cpi) ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + const TWO_PASS *const twopass = &cpi->twopass; return (!frame_is_intra_only(&cpi->common) && twopass->stats_in - 2 > twopass->stats_in_start && @@ -3140,41 +3260,38 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS this_frame; - int target_rate; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : 0; - if (!twopass->stats_in) return; // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; - configure_buffer_updates(cpi); + + vp9_zero(this_frame); + this_frame = + cpi->twopass.stats_in_start[cm->current_video_frame + + gf_group->arf_src_offset[gf_group->index]]; + + vp9_configure_buffer_updates(cpi, gf_group->index); + target_rate = gf_group->bit_allocation[gf_group->index]; target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; cm->frame_type = INTER_FRAME; - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - } - } - // Do the firstpass stats indicate that this frame is skippable for the // partition search? 
if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { + !cpi->use_svc) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } + // The multiplication by 256 reverses a scaling factor of (>> 8) + // applied when combining MB error values for the frame. + twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0); + twopass->mb_smooth_pct = this_frame.intra_smooth_pct; + return; } @@ -3182,12 +3299,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; - } else if (cm->current_video_frame == 0 || - (lc != NULL && lc->current_video_frame_in_layer == 0)) { + } else if (cm->current_video_frame == 0) { const int frames_left = - (int)(twopass->total_stats.count - - ((lc != NULL) ? lc->current_video_frame_in_layer - : cm->current_video_frame)); + (int)(twopass->total_stats.count - cm->current_video_frame); // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -3236,59 +3350,36 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; } - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = (cm->frame_type == KEY_FRAME); - if (lc->is_key_frame) { - cpi->ref_frame_flags &= - (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - lc->frames_from_key_frame = 0; - // Encode an intra only empty frame since we have a key frame. - cpi->svc.encode_intra_empty_frame = 1; - } - } else { - cm->frame_type = INTER_FRAME; - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) { - cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - lc->frames_from_key_frame = 0; - } - } - } - // Define a new GF/ARF group. (Should always enter here for key frames). if (rc->frames_till_gf_update_due == 0) { define_gf_group(cpi, &this_frame); rc->frames_till_gf_update_due = rc->baseline_gf_interval; - if (lc != NULL) cpi->refresh_golden_frame = 1; #if ARF_STATS_OUTPUT { FILE *fpfile; fpfile = fopen("arf.stt", "a"); ++arf_count; - fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->frames_till_gf_update_due, + rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); fclose(fpfile); } #endif } - configure_buffer_updates(cpi); + vp9_configure_buffer_updates(cpi, gf_group->index); // Do the firstpass stats indicate that this frame is skippable for the // partition search? if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { + !cpi->use_svc) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } - target_rate = gf_group->bit_allocation[gf_group->index]; - rc->base_frame_target = target_rate; + rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. 
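The mb_av_energy expression in the hunk above reverses the first-pass (>> 8) scaling before taking a log. A quick standalone check of the same formula with a made-up intra_error value:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double intra_error = 5000.0; /* hypothetical first-pass stat */
  /* Multiply by 256 to undo the (>> 8) applied when MB errors were
     combined, then compress the range with a log, as above. */
  const double mb_av_energy = log((intra_error * 256.0) + 1.0);
  printf("mb_av_energy = %.4f\n", mb_av_energy);
  return 0;
}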
@@ -3329,8 +3420,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { rc->rate_error_estimate = 0; } - if (cpi->common.frame_type != KEY_FRAME && - !vp9_is_upper_layer_key_frame(cpi)) { + if (cpi->common.frame_type != KEY_FRAME) { twopass->kf_group_bits -= bits_used; twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; } @@ -3350,7 +3440,8 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { // Extend min or Max Q range to account for imbalance from the base // value when using AQ. - if (cpi->oxcf.aq_mode != NO_AQ) { + if (cpi->oxcf.aq_mode != NO_AQ && cpi->oxcf.aq_mode != PSNR_AQ && + cpi->oxcf.aq_mode != PERCEPTUAL_AQ) { if (cm->seg.aq_av_offset < 0) { // The balance of the AQ map tends towards lowering the average Q. aq_extend_min = 0; diff --git a/libs/libvpx/vp9/encoder/vp9_firstpass.h b/libs/libvpx/vp9/encoder/vp9_firstpass.h index 000ecd77926..a0a96e6ef63 100644 --- a/libs/libvpx/vp9/encoder/vp9_firstpass.h +++ b/libs/libvpx/vp9/encoder/vp9_firstpass.h @@ -8,8 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ -#define VP9_ENCODER_VP9_FIRSTPASS_H_ +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ + +#include <assert.h> #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -39,7 +41,10 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif -#define INVALID_ROW -1 +#define INVALID_ROW (-1) + +#define MAX_ARF_LAYERS 6 +#define SECTION_NOISE_DEF 250.0 typedef struct { double frame_mb_intra_factor; @@ -107,7 +112,9 @@ typedef enum { GF_UPDATE = 2, ARF_UPDATE = 3, OVERLAY_UPDATE = 4, - FRAME_UPDATE_TYPES = 5 + MID_OVERLAY_UPDATE = 5, + USE_BUF_FRAME = 6, // Use show existing frame, no ref buffer update + FRAME_UPDATE_TYPES = 7 } FRAME_UPDATE_TYPE; #define FC_ANIMATION_THRESH 0.15 @@ -119,17 +126,29 @@ typedef enum { typedef struct { unsigned char index; - unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; - FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; - int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; + RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2]; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2]; + + int frame_start; + int frame_end; + // TODO(jingning): The array size of arf_stack could be reduced.
+ int arf_index_stack[MAX_LAG_BUFFERS * 2]; + int top_arf_idx; + int stack_size; + int gf_group_size; + int max_layer_depth; + int allowed_max_layer_depth; + int group_noise_energy; } GF_GROUP; typedef struct { unsigned int section_intra_rating; + unsigned int key_frame_section_intra_rating; FIRSTPASS_STATS total_stats; FIRSTPASS_STATS this_frame_stats; const FIRSTPASS_STATS *stats_in; @@ -182,7 +201,6 @@ struct ThreadData; struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); -void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); @@ -194,7 +212,6 @@ void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); -void vp9_twopass_postencode_update(struct VP9_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); @@ -206,4 +223,4 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_FIRSTPASS_H_ +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_job_queue.h b/libs/libvpx/vp9/encoder/vp9_job_queue.h index 89c08f207a4..ad09c11198a 100644 --- a/libs/libvpx/vp9/encoder/vp9_job_queue.h +++ b/libs/libvpx/vp9/encoder/vp9_job_queue.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_ -#define VP9_ENCODER_VP9_JOB_QUEUE_H_ +#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ typedef enum { FIRST_PASS_JOB, @@ -43,4 +43,4 @@ typedef struct { int num_jobs_acquired; } JobQueueHandle; -#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_ +#endif // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_lookahead.h b/libs/libvpx/vp9/encoder/vp9_lookahead.h index 88be0ffcd58..c627bede239 100644 --- a/libs/libvpx/vp9/encoder/vp9_lookahead.h +++ b/libs/libvpx/vp9/encoder/vp9_lookahead.h @@ -8,17 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_ -#define VP9_ENCODER_VP9_LOOKAHEAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ +#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_encoder.h" #include "vpx/vpx_integer.h" -#if CONFIG_SPATIAL_SVC -#include "vpx/vp8cx.h" -#endif - #ifdef __cplusplus extern "C" { #endif @@ -115,4 +111,4 @@ unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_LOOKAHEAD_H_ +#endif // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.c b/libs/libvpx/vp9/encoder/vp9_mbgraph.c index 46d626def17..831c79c1753 100644 --- a/libs/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.c @@ -57,11 +57,12 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, { uint32_t distortion; uint32_t sse; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. 
cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + &v_fn_ptr, 0, mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + 0, USE_2_TAPS); } xd->mi[0]->mode = NEWMV; diff --git a/libs/libvpx/vp9/encoder/vp9_mbgraph.h b/libs/libvpx/vp9/encoder/vp9_mbgraph.h index df2fb98efa6..7b629861d51 100644 --- a/libs/libvpx/vp9/encoder/vp9_mbgraph.h +++ b/libs/libvpx/vp9/encoder/vp9_mbgraph.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MBGRAPH_H_ -#define VP9_ENCODER_VP9_MBGRAPH_H_ +#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_ +#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_ #ifdef __cplusplus extern "C" { @@ -25,7 +25,9 @@ typedef struct { } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct VP9_COMP; @@ -35,4 +37,4 @@ void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MBGRAPH_H_ +#endif // VPX_VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.c b/libs/libvpx/vp9/encoder/vp9_mcomp.c index 44f01be25a0..d1688f9938f 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.c @@ -29,11 +29,6 @@ // #define NEW_DIAMOND_SEARCH -static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, - const MV *mv) { - return &buf->buf[mv->row * buf->stride + mv->col]; -} - void vp9_set_mv_search_range(MvLimits *mv_limits, const MV *mv) { int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); @@ -263,27 +258,6 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of -// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten -// later in the same way. 
-#define SECOND_LEVEL_CHECKS_BEST \ - { \ - unsigned int second; \ - int br0 = br; \ - int bc0 = bc; \ - assert(tr == br || tc == bc); \ - if (tr == br && tc != bc) { \ - kc = bc - tc; \ - } else if (tr != br && tc == bc) { \ - kr = br - tr; \ - } \ - CHECK_BETTER(second, br0 + kr, bc0); \ - CHECK_BETTER(second, br0, bc0 + kc); \ - if (br0 != br || bc0 != bc) { \ - CHECK_BETTER(second, br0 + kr, bc0 + kc); \ - } \ - } - #define SETUP_SUBPEL_SEARCH \ const uint8_t *const z = x->plane[0].src.buf; \ const int src_stride = x->plane[0].src.stride; \ @@ -329,8 +303,8 @@ static unsigned int setup_center_error( if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, CONVERT_TO_SHORTPTR(y + offset), y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { @@ -388,14 +362,12 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { *ir = (int)divide_and_round(x1 * b, y1); } -uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, - const MV *ref_mv, int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - uint32_t *distortion, uint32_t *sse1, - const uint8_t *second_pred, int w, int h) { +uint32_t vp9_skip_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -418,6 +390,7 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)sse; (void)thismse; (void)cost_list; + (void)use_accurate_subpel_search; return besterr; } @@ -427,7 +400,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -439,6 +412,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; + (void)use_accurate_subpel_search; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && @@ -492,8 +466,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -552,8 +528,10 @@ uint32_t 
vp9_find_best_sub_pixel_tree_pruned( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -638,12 +616,119 @@ static const MV search_step_table[12] = { }; /* clang-format on */ +static int accurate_sub_pel_search( + const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf, + const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src_address, const int src_stride, + const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred, + int w, int h, uint32_t *sse) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride, + pred16, w, this_mv, sf, w, h, 0, kernel, + MV_PRECISION_Q3, 0, 0, xd->bd); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, pred16, w); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address, + src_stride, sse); + } else { + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse); + } + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + } + if (besterr >= UINT_MAX) return UINT_MAX; + return (int)besterr; +#else + int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + (void)xd; + + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// TODO(yunqing): this part can be further refactored. 
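Stripped of the macro plumbing, the CHECK_BETTER1 variants that follow do the same bookkeeping as CHECK_BETTER: the total cost is the prediction error from accurate_sub_pel_search() plus the motion-vector rate cost, and the running minimum is kept. A standalone sketch with hypothetical candidate costs:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Hypothetical candidates: distortion vs. mv signalling cost. */
  const uint32_t thismse[3] = { 900, 850, 870 };
  const uint32_t mv_rate[3] = { 40, 120, 60 };
  uint32_t besterr = UINT32_MAX;
  int i, best = -1;

  for (i = 0; i < 3; ++i) {
    const uint32_t v = thismse[i] + mv_rate[i]; /* error + rate term */
    if (v < besterr) { /* keep the running minimum */
      besterr = v;
      best = i;
    }
  }
  printf("best candidate %d, total cost %u\n", best, besterr);
  return 0;
}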
+#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#endif + uint32_t vp9_find_best_sub_pixel_tree( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { const uint8_t *const z = x->plane[0].src.buf; const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; @@ -671,6 +756,17 @@ uint32_t vp9_find_best_sub_pixel_tree( int kr, kc; MvLimits subpel_mv_limits; + // TODO(yunqing): need to add 4-tap filter optimization to speed up the + // encoder. + const InterpKernel *kernel = + (use_accurate_subpel_search > 0) + ? ((use_accurate_subpel_search == USE_4_TAPS) + ? vp9_filter_kernels[FOURTAP] + : ((use_accurate_subpel_search == USE_8_TAPS) + ? 
vp9_filter_kernels[EIGHTTAP] + : vp9_filter_kernels[EIGHTTAP_SHARP])) + : vp9_filter_kernels[BILINEAR]; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); minc = subpel_mv_limits.col_min; maxc = subpel_mv_limits.col_max; @@ -695,16 +791,25 @@ uint32_t vp9_find_best_sub_pixel_tree( tr = br + search_step[idx].row; tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv; this_mv.row = tr; this_mv.col = tc; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -726,14 +831,21 @@ uint32_t vp9_find_best_sub_pixel_tree( tc = bc + kc; tr = br + kr; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv = { tr, tc }; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse, second_pred); + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, y_stride, + second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -755,10 +867,48 @@ uint32_t vp9_find_best_sub_pixel_tree( bc = tc; } - if (iters_per_step > 1 && best_idx != -1) SECOND_LEVEL_CHECKS_BEST; + if (iters_per_step > 0 && best_idx != -1) { + unsigned int second; + const int br0 = br; + const int bc0 = bc; + assert(tr == br || tc == bc); + + if (tr == br && tc != bc) { + kc = bc - tc; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0, bc0 + kc); + } else { + CHECK_BETTER(second, br0, bc0 + kc); + } + } + } else if (tr != br && tc == bc) { + kr = br - tr; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + } else { + CHECK_BETTER(second, br0 + kr, bc0); + } + } + } - tr = br; - tc = bc; + if (iters_per_step > 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + CHECK_BETTER1(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER1(second, br0 + kr, bc0 + kc); + } + } else { + CHECK_BETTER(second, br0 + kr, bc0); + CHECK_BETTER(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { 
+ CHECK_BETTER(second, br0 + kr, bc0 + kc); + } + } + } + } search_step += 4; hstep >>= 1; @@ -780,6 +930,7 @@ uint32_t vp9_find_best_sub_pixel_tree( } #undef CHECK_BETTER +#undef CHECK_BETTER1 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -1490,7 +1641,7 @@ static int fast_dia_search(const MACROBLOCK *x, MV *ref_mv, int search_param, // Exhuastive motion search around a given centre position with a given // step size. -static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, +static int exhaustive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, int range, int step, int sad_per_bit, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { @@ -1576,6 +1727,510 @@ static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, return best_sad; } +#define MIN_RANGE 7 +#define MAX_RANGE 256 +#define MIN_INTERVAL 1 +#if CONFIG_NON_GREEDY_MV + +#define LOG2_TABLE_SIZE 1024 +static const int log2_table[LOG2_TABLE_SIZE] = { + 0, // This is a dummy value + 0, 1048576, 1661954, 2097152, 2434718, 2710530, 2943725, + 3145728, 3323907, 3483294, 3627477, 3759106, 3880192, 3992301, + 4096672, 4194304, 4286015, 4372483, 4454275, 4531870, 4605679, + 4676053, 4743299, 4807682, 4869436, 4928768, 4985861, 5040877, + 5093962, 5145248, 5194851, 5242880, 5289431, 5334591, 5378443, + 5421059, 5462508, 5502851, 5542146, 5580446, 5617800, 5654255, + 5689851, 5724629, 5758625, 5791875, 5824409, 5856258, 5887450, + 5918012, 5947969, 5977344, 6006160, 6034437, 6062195, 6089453, + 6116228, 6142538, 6168398, 6193824, 6218829, 6243427, 6267632, + 6291456, 6314910, 6338007, 6360756, 6383167, 6405252, 6427019, + 6448477, 6469635, 6490501, 6511084, 6531390, 6551427, 6571202, + 6590722, 6609993, 6629022, 6647815, 6666376, 6684713, 6702831, + 6720734, 6738427, 6755916, 6773205, 6790299, 6807201, 6823917, + 6840451, 6856805, 6872985, 6888993, 6904834, 6920510, 6936026, + 6951384, 6966588, 6981641, 6996545, 7011304, 7025920, 7040397, + 7054736, 7068940, 7083013, 7096956, 7110771, 7124461, 7138029, + 7151476, 7164804, 7178017, 7191114, 7204100, 7216974, 7229740, + 7242400, 7254954, 7267405, 7279754, 7292003, 7304154, 7316208, + 7328167, 7340032, 7351805, 7363486, 7375079, 7386583, 7398000, + 7409332, 7420579, 7431743, 7442826, 7453828, 7464751, 7475595, + 7486362, 7497053, 7507669, 7518211, 7528680, 7539077, 7549404, + 7559660, 7569847, 7579966, 7590017, 7600003, 7609923, 7619778, + 7629569, 7639298, 7648964, 7658569, 7668114, 7677598, 7687023, + 7696391, 7705700, 7714952, 7724149, 7733289, 7742375, 7751407, + 7760385, 7769310, 7778182, 7787003, 7795773, 7804492, 7813161, + 7821781, 7830352, 7838875, 7847350, 7855777, 7864158, 7872493, + 7880782, 7889027, 7897226, 7905381, 7913492, 7921561, 7929586, + 7937569, 7945510, 7953410, 7961268, 7969086, 7976864, 7984602, + 7992301, 7999960, 8007581, 8015164, 8022709, 8030217, 8037687, + 8045121, 8052519, 8059880, 8067206, 8074496, 8081752, 8088973, + 8096159, 8103312, 8110431, 8117516, 8124569, 8131589, 8138576, + 8145532, 8152455, 8159347, 8166208, 8173037, 8179836, 8186605, + 8193343, 8200052, 8206731, 8213380, 8220001, 8226593, 8233156, + 8239690, 8246197, 8252676, 8259127, 8265550, 8271947, 8278316, + 8284659, 8290976, 8297266, 8303530, 8309768, 8315981, 8322168, + 8328330, 8334467, 8340579, 8346667, 8352730, 8358769, 8364784, + 8370775, 8376743, 8382687, 8388608, 8394506, 8400381, 8406233, + 8412062, 8417870, 8423655, 8429418, 8435159, 8440878, 8446576, + 8452252, 8457908, 8463542, 
8469155, 8474748, 8480319, 8485871, + 8491402, 8496913, 8502404, 8507875, 8513327, 8518759, 8524171, + 8529564, 8534938, 8540293, 8545629, 8550947, 8556245, 8561525, + 8566787, 8572031, 8577256, 8582464, 8587653, 8592825, 8597980, + 8603116, 8608236, 8613338, 8618423, 8623491, 8628542, 8633576, + 8638593, 8643594, 8648579, 8653547, 8658499, 8663434, 8668354, + 8673258, 8678145, 8683017, 8687874, 8692715, 8697540, 8702350, + 8707145, 8711925, 8716690, 8721439, 8726174, 8730894, 8735599, + 8740290, 8744967, 8749628, 8754276, 8758909, 8763528, 8768134, + 8772725, 8777302, 8781865, 8786415, 8790951, 8795474, 8799983, + 8804478, 8808961, 8813430, 8817886, 8822328, 8826758, 8831175, + 8835579, 8839970, 8844349, 8848715, 8853068, 8857409, 8861737, + 8866053, 8870357, 8874649, 8878928, 8883195, 8887451, 8891694, + 8895926, 8900145, 8904353, 8908550, 8912734, 8916908, 8921069, + 8925220, 8929358, 8933486, 8937603, 8941708, 8945802, 8949885, + 8953957, 8958018, 8962068, 8966108, 8970137, 8974155, 8978162, + 8982159, 8986145, 8990121, 8994086, 8998041, 9001986, 9005920, + 9009844, 9013758, 9017662, 9021556, 9025440, 9029314, 9033178, + 9037032, 9040877, 9044711, 9048536, 9052352, 9056157, 9059953, + 9063740, 9067517, 9071285, 9075044, 9078793, 9082533, 9086263, + 9089985, 9093697, 9097400, 9101095, 9104780, 9108456, 9112123, + 9115782, 9119431, 9123072, 9126704, 9130328, 9133943, 9137549, + 9141146, 9144735, 9148316, 9151888, 9155452, 9159007, 9162554, + 9166092, 9169623, 9173145, 9176659, 9180165, 9183663, 9187152, + 9190634, 9194108, 9197573, 9201031, 9204481, 9207923, 9211357, + 9214784, 9218202, 9221613, 9225017, 9228412, 9231800, 9235181, + 9238554, 9241919, 9245277, 9248628, 9251971, 9255307, 9258635, + 9261956, 9265270, 9268577, 9271876, 9275169, 9278454, 9281732, + 9285002, 9288266, 9291523, 9294773, 9298016, 9301252, 9304481, + 9307703, 9310918, 9314126, 9317328, 9320523, 9323711, 9326892, + 9330067, 9333235, 9336397, 9339552, 9342700, 9345842, 9348977, + 9352106, 9355228, 9358344, 9361454, 9364557, 9367654, 9370744, + 9373828, 9376906, 9379978, 9383043, 9386102, 9389155, 9392202, + 9395243, 9398278, 9401306, 9404329, 9407345, 9410356, 9413360, + 9416359, 9419351, 9422338, 9425319, 9428294, 9431263, 9434226, + 9437184, 9440136, 9443082, 9446022, 9448957, 9451886, 9454809, + 9457726, 9460638, 9463545, 9466446, 9469341, 9472231, 9475115, + 9477994, 9480867, 9483735, 9486597, 9489454, 9492306, 9495152, + 9497993, 9500828, 9503659, 9506484, 9509303, 9512118, 9514927, + 9517731, 9520530, 9523324, 9526112, 9528895, 9531674, 9534447, + 9537215, 9539978, 9542736, 9545489, 9548237, 9550980, 9553718, + 9556451, 9559179, 9561903, 9564621, 9567335, 9570043, 9572747, + 9575446, 9578140, 9580830, 9583514, 9586194, 9588869, 9591540, + 9594205, 9596866, 9599523, 9602174, 9604821, 9607464, 9610101, + 9612735, 9615363, 9617987, 9620607, 9623222, 9625832, 9628438, + 9631040, 9633637, 9636229, 9638818, 9641401, 9643981, 9646556, + 9649126, 9651692, 9654254, 9656812, 9659365, 9661914, 9664459, + 9666999, 9669535, 9672067, 9674594, 9677118, 9679637, 9682152, + 9684663, 9687169, 9689672, 9692170, 9694665, 9697155, 9699641, + 9702123, 9704601, 9707075, 9709545, 9712010, 9714472, 9716930, + 9719384, 9721834, 9724279, 9726721, 9729159, 9731593, 9734024, + 9736450, 9738872, 9741291, 9743705, 9746116, 9748523, 9750926, + 9753326, 9755721, 9758113, 9760501, 9762885, 9765266, 9767642, + 9770015, 9772385, 9774750, 9777112, 9779470, 9781825, 9784175, + 9786523, 9788866, 9791206, 9793543, 9795875, 9798204, 9800530, + 
9802852, 9805170, 9807485, 9809797, 9812104, 9814409, 9816710, + 9819007, 9821301, 9823591, 9825878, 9828161, 9830441, 9832718, + 9834991, 9837261, 9839527, 9841790, 9844050, 9846306, 9848559, + 9850808, 9853054, 9855297, 9857537, 9859773, 9862006, 9864235, + 9866462, 9868685, 9870904, 9873121, 9875334, 9877544, 9879751, + 9881955, 9884155, 9886352, 9888546, 9890737, 9892925, 9895109, + 9897291, 9899469, 9901644, 9903816, 9905985, 9908150, 9910313, + 9912473, 9914629, 9916783, 9918933, 9921080, 9923225, 9925366, + 9927504, 9929639, 9931771, 9933900, 9936027, 9938150, 9940270, + 9942387, 9944502, 9946613, 9948721, 9950827, 9952929, 9955029, + 9957126, 9959219, 9961310, 9963398, 9965484, 9967566, 9969645, + 9971722, 9973796, 9975866, 9977934, 9980000, 9982062, 9984122, + 9986179, 9988233, 9990284, 9992332, 9994378, 9996421, 9998461, + 10000498, 10002533, 10004565, 10006594, 10008621, 10010644, 10012665, + 10014684, 10016700, 10018713, 10020723, 10022731, 10024736, 10026738, + 10028738, 10030735, 10032729, 10034721, 10036710, 10038697, 10040681, + 10042662, 10044641, 10046617, 10048591, 10050562, 10052530, 10054496, + 10056459, 10058420, 10060379, 10062334, 10064287, 10066238, 10068186, + 10070132, 10072075, 10074016, 10075954, 10077890, 10079823, 10081754, + 10083682, 10085608, 10087532, 10089453, 10091371, 10093287, 10095201, + 10097112, 10099021, 10100928, 10102832, 10104733, 10106633, 10108529, + 10110424, 10112316, 10114206, 10116093, 10117978, 10119861, 10121742, + 10123620, 10125495, 10127369, 10129240, 10131109, 10132975, 10134839, + 10136701, 10138561, 10140418, 10142273, 10144126, 10145976, 10147825, + 10149671, 10151514, 10153356, 10155195, 10157032, 10158867, 10160699, + 10162530, 10164358, 10166184, 10168007, 10169829, 10171648, 10173465, + 10175280, 10177093, 10178904, 10180712, 10182519, 10184323, 10186125, + 10187925, 10189722, 10191518, 10193311, 10195103, 10196892, 10198679, + 10200464, 10202247, 10204028, 10205806, 10207583, 10209357, 10211130, + 10212900, 10214668, 10216435, 10218199, 10219961, 10221721, 10223479, + 10225235, 10226989, 10228741, 10230491, 10232239, 10233985, 10235728, + 10237470, 10239210, 10240948, 10242684, 10244417, 10246149, 10247879, + 10249607, 10251333, 10253057, 10254779, 10256499, 10258217, 10259933, + 10261647, 10263360, 10265070, 10266778, 10268485, 10270189, 10271892, + 10273593, 10275292, 10276988, 10278683, 10280376, 10282068, 10283757, + 10285444, 10287130, 10288814, 10290495, 10292175, 10293853, 10295530, + 10297204, 10298876, 10300547, 10302216, 10303883, 10305548, 10307211, + 10308873, 10310532, 10312190, 10313846, 10315501, 10317153, 10318804, + 10320452, 10322099, 10323745, 10325388, 10327030, 10328670, 10330308, + 10331944, 10333578, 10335211, 10336842, 10338472, 10340099, 10341725, + 10343349, 10344971, 10346592, 10348210, 10349828, 10351443, 10353057, + 10354668, 10356279, 10357887, 10359494, 10361099, 10362702, 10364304, + 10365904, 10367502, 10369099, 10370694, 10372287, 10373879, 10375468, + 10377057, 10378643, 10380228, 10381811, 10383393, 10384973, 10386551, + 10388128, 10389703, 10391276, 10392848, 10394418, 10395986, 10397553, + 10399118, 10400682, 10402244, 10403804, 10405363, 10406920, 10408476, + 10410030, 10411582, 10413133, 10414682, 10416230, 10417776, 10419320, + 10420863, 10422404, 10423944, 10425482, 10427019, 10428554, 10430087, + 10431619, 10433149, 10434678, 10436206, 10437731, 10439256, 10440778, + 10442299, 10443819, 10445337, 10446854, 10448369, 10449882, 10451394, + 10452905, 10454414, 10455921, 10457427, 
10458932, 10460435, 10461936, + 10463436, 10464935, 10466432, 10467927, 10469422, 10470914, 10472405, + 10473895, 10475383, 10476870, 10478355, 10479839, 10481322, 10482802, + 10484282, +}; + +#define LOG2_PRECISION 20 +static int64_t log2_approximation(int64_t v) { + assert(v > 0); + if (v < LOG2_TABLE_SIZE) { + return log2_table[v]; + } else { + // use linear approximation when v >= 2^10 + const int slope = + 1477; // slope = 1 / (log(2) * 1024) * (1 << LOG2_PRECISION) + assert(LOG2_TABLE_SIZE == 1 << 10); + + return slope * (v - LOG2_TABLE_SIZE) + (10 << LOG2_PRECISION); + } +} + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, + int mv_num) { + int i; + int update = 0; + int64_t best_cost = 0; + vpx_clear_system_state(); + for (i = 0; i < mv_num; ++i) { + if (nb_mvs[i].as_int != INVALID_MV) { + MV nb_mv = nb_mvs[i].as_mv; + const int64_t row_diff = abs(mv->row - nb_mv.row); + const int64_t col_diff = abs(mv->col - nb_mv.col); + const int64_t cost = + log2_approximation(1 + row_diff * row_diff + col_diff * col_diff); + if (update == 0) { + best_cost = cost; + update = 1; + } else { + best_cost = cost < best_cost ? cost : best_cost; + } + } + } + return best_cost; +} + +static int64_t exhaustive_mesh_search_new(const MACROBLOCK *x, MV *best_mv, + int range, int step, + const vp9_variance_fn_ptr_t *fn_ptr, + const MV *center_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + MV fcenter_mv = { center_mv->row, center_mv->col }; + int64_t best_sad; + int r, c, i; + int start_col, end_col, start_row, end_row; + int col_step = (step > 1) ? step : 4; + + assert(step >= 1); + + clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *best_mv = fcenter_mv; + best_sad = + ((int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &fcenter_mv), + in_what->stride) + << LOG2_PRECISION) + + lambda * vp9_nb_mvs_inconsistency(&fcenter_mv, nb_full_mvs, full_mv_num); + start_row = VPXMAX(-range, x->mv_limits.row_min - fcenter_mv.row); + start_col = VPXMAX(-range, x->mv_limits.col_min - fcenter_mv.col); + end_row = VPXMIN(range, x->mv_limits.row_max - fcenter_mv.row); + end_col = VPXMIN(range, x->mv_limits.col_max - fcenter_mv.col); + + for (r = start_row; r <= end_row; r += step) { + for (c = start_col; c <= end_col; c += col_step) { + // Step > 1 means we are not checking every location in this pass. 
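+      // In either branch below, the raw SAD is compared against best_sad
+      // before the lambda-weighted neighbour term is added, so the more
+      // expensive vp9_nb_mvs_inconsistency() call is made only for
+      // candidates whose SAD alone already beats the current best.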
+ if (step > 1) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c }; + int64_t sad = + (int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += + lambda * vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } else { + // 4 sads in a single call if we are checking every location + if (c + 3 <= end_col) { + unsigned int sads[4]; + const uint8_t *addrs[4]; + for (i = 0; i < 4; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + addrs[i] = get_buf_from_mv(in_what, &mv); + } + fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads); + + for (i = 0; i < 4; ++i) { + int64_t sad = (int64_t)sads[i] << LOG2_PRECISION; + if (sad < best_sad) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + sad += lambda * + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } else { + for (i = 0; i < end_col - c; ++i) { + const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i }; + int64_t sad = (int64_t)fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), + in_what->stride) + << LOG2_PRECISION; + if (sad < best_sad) { + sad += lambda * + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + if (sad < best_sad) { + best_sad = sad; + *best_mv = mv; + } + } + } + } + } + } + } + + return best_sad; +} + +static int64_t full_pixel_exhaustive_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *centre_mv_full, + const vp9_variance_fn_ptr_t *fn_ptr, + MV *dst_mv, int lambda, + const int_mv *nb_full_mvs, + int full_mv_num) { + const SPEED_FEATURES *const sf = &cpi->sf; + MV temp_mv = { centre_mv_full->row, centre_mv_full->col }; + int64_t bestsme; + int i; + int interval = sf->mesh_patterns[0].interval; + int range = sf->mesh_patterns[0].range; + int baseline_interval_divisor; + const MV dummy_mv = { 0, 0 }; + + // Trap illegal values for interval and range for this function. + if ((range < MIN_RANGE) || (range > MAX_RANGE) || (interval < MIN_INTERVAL) || + (interval > range)) { + printf("ERROR: invalid range\n"); + assert(0); + } + + baseline_interval_divisor = range / interval; + + // Check size of proposed first range against magnitude of the centre + // value used as a starting point. + range = VPXMAX(range, (5 * VPXMAX(abs(temp_mv.row), abs(temp_mv.col))) / 4); + range = VPXMIN(range, MAX_RANGE); + interval = VPXMAX(interval, range / baseline_interval_divisor); + + // initial search + bestsme = + exhaustive_mesh_search_new(x, &temp_mv, range, interval, fn_ptr, &temp_mv, + lambda, nb_full_mvs, full_mv_num); + + if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { + // Progressive searches with range and step size decreasing each time + // till we reach a step size of 1. Then break out. 
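+    // Each pass re-centres on the best MV found so far: temp_mv is passed
+    // both as the search centre and as the result of every call.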
+    for (i = 1; i < MAX_MESH_STEP; ++i) {
+      // First pass with coarser step and longer range
+      bestsme = exhaustive_mesh_search_new(
+          x, &temp_mv, sf->mesh_patterns[i].range,
+          sf->mesh_patterns[i].interval, fn_ptr, &temp_mv, lambda, nb_full_mvs,
+          full_mv_num);
+
+      if (sf->mesh_patterns[i].interval == 1) break;
+    }
+  }
+
+  bestsme = vp9_get_mvpred_var(x, &temp_mv, &dummy_mv, fn_ptr, 0);
+  *dst_mv = temp_mv;
+
+  return bestsme;
+}
+
+static double diamond_search_sad_new(const MACROBLOCK *x,
+                                     const search_site_config *cfg,
+                                     const MV *init_full_mv, MV *best_full_mv,
+                                     int search_param, int lambda, int *num00,
+                                     const vp9_variance_fn_ptr_t *fn_ptr,
+                                     const int_mv *nb_full_mvs,
+                                     int full_mv_num) {
+  int i, j, step;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *what = x->plane[0].src.buf;
+  const int what_stride = x->plane[0].src.stride;
+  const uint8_t *in_what;
+  const int in_what_stride = xd->plane[0].pre[0].stride;
+  const uint8_t *best_address;
+
+  double bestsad;
+  int best_site = -1;
+  int last_site = -1;
+
+  // search_param determines the length of the initial step and hence the number
+  // of iterations.
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel...
+  // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step];
+  const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step];
+  const int tot_steps = cfg->total_steps - search_param;
+  vpx_clear_system_state();
+
+  *best_full_mv = *init_full_mv;
+  clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  *num00 = 0;
+
+  // Work out the start point for the search
+  in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride +
+            best_full_mv->col;
+  best_address = in_what;
+
+  // Check the starting position
+  {
+    const double mv_dist =
+        fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+    const double mv_cost =
+        vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num) /
+        (double)(1 << LOG2_PRECISION);
+    bestsad = mv_dist + lambda * mv_cost;
+  }
+
+  i = 0;
+
+  for (step = 0; step < tot_steps; step++) {
+    int all_in = 1, t;
+
+    // All_in is true if every one of the points we are checking is within
+    // the bounds of the image.
+    all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min);
+    all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max);
+    all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min);
+    all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max);
+
+    // If all the pixels are within the bounds we don't check whether the
+    // search point is valid in this loop, otherwise we check each point
+    // for validity.
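+    // The in-bounds case scores four diamond points per sdx4df() call;
+    // the fallback case validates and scores each point individually.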
+ if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + if (sad_array[t] < bestsad) { + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + const double mv_dist = sad_array[t]; + const double mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + double thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[i] + best_address; + const double mv_dist = + fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + if (mv_dist < bestsad) { + const double mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + double thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + best_site = i; + } + } + } + i++; + } + } + if (best_site != last_site) { + best_full_mv->row += ss_mv[best_site].row; + best_full_mv->col += ss_mv[best_site].col; + best_address += ss_os[best_site]; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +void vp9_prepare_nb_full_mvs(const TplDepFrame *tpl_frame, int mi_row, + int mi_col, int rf_idx, BLOCK_SIZE bsize, + int_mv *nb_full_mvs) { + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int i; + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0] * mi_height; + int c = dirs[i][1] * mi_width; + if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 && + mi_col + c < tpl_frame->mi_cols) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[(mi_row + r) * tpl_frame->stride + mi_col + c]; + int_mv *mv = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row + r, mi_col + c); + if (tpl_ptr->ready[rf_idx]) { + nb_full_mvs[i].as_mv = get_full_mv(&mv->as_mv); + } else { + nb_full_mvs[i].as_int = INVALID_MV; + } + } else { + nb_full_mvs[i].as_int = INVALID_MV; + } + } +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, @@ -1785,12 +2440,15 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { } static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, const MV *ref_mv) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; @@ -1812,6 +2470,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + 
MvLimits subpel_mv_limits; if (scaled_ref_frame) { int i; @@ -1876,7 +2535,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, { const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, }; cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); @@ -1911,6 +2573,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; @@ -1919,11 +2585,78 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, return best_sad; } +#if CONFIG_NON_GREEDY_MV // Runs sequence of diamond searches in smaller steps for RD. /* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond refining search */ -static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, +double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int lambda, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv) { + int n, num00 = 0; + double thissme; + double bestsme; + const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param; + const MV center_mv = { 0, 0 }; + vpx_clear_system_state(); + bestsme = + diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, best_mv, step_param, + lambda, &n, fn_ptr, nb_full_mvs, full_mv_num); + + bestsme = vp9_get_mvpred_var(x, best_mv, ¢er_mv, fn_ptr, 0); + + // If there won't be more n-step search, check to see if refining search is + // needed. + if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + if (num00) { + num00--; + } else { + MV temp_mv; + thissme = diamond_search_sad_new(x, &cpi->ss_cfg, mvp_full, &temp_mv, + step_param + n, lambda, &num00, fn_ptr, + nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV temp_mv = *best_mv; + thissme = vp9_refining_search_sad_new(x, &temp_mv, lambda, search_range, + fn_ptr, nb_full_mvs, full_mv_num); + thissme = vp9_get_mvpred_var(x, &temp_mv, ¢er_mv, fn_ptr, 0); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + } + } + + bestsme = (double)full_pixel_exhaustive_new(cpi, x, best_mv, fn_ptr, best_mv, + lambda, nb_full_mvs, full_mv_num); + return bestsme; +} +#endif // CONFIG_NON_GREEDY_MV + +// Runs sequence of diamond searches in smaller steps for RD. 
+/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +static int full_pixel_diamond(const VP9_COMP *const cpi, + const MACROBLOCK *const x, MV *mvp_full, int step_param, int sadpb, int further_steps, int do_refine, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, @@ -1983,13 +2716,11 @@ static int full_pixel_diamond(const VP9_COMP *cpi, MACROBLOCK *x, MV *mvp_full, return bestsme; } -#define MIN_RANGE 7 -#define MAX_RANGE 256 -#define MIN_INTERVAL 1 // Runs an limited range exhaustive mesh search using a pattern set // according to the encode speed profile. -static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, - MV *centre_mv_full, int sadpb, int *cost_list, +static int full_pixel_exhaustive(const VP9_COMP *const cpi, + const MACROBLOCK *const x, MV *centre_mv_full, + int sadpb, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { const SPEED_FEATURES *const sf = &cpi->sf; @@ -2015,7 +2746,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, interval = VPXMAX(interval, range / baseline_interval_divisor); // initial search - bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, + bestsme = exhaustive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval, sadpb, fn_ptr, &temp_mv); if ((interval > MIN_INTERVAL) && (range > MIN_RANGE)) { @@ -2023,7 +2754,7 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, // till we reach a step size of 1. Then break out. for (i = 1; i < MAX_MESH_STEP; ++i) { // First pass with coarser step and longer range - bestsme = exhuastive_mesh_search( + bestsme = exhaustive_mesh_search( x, &f_ref_mv, &temp_mv, sf->mesh_patterns[i].range, sf->mesh_patterns[i].interval, sadpb, fn_ptr, &temp_mv); @@ -2042,6 +2773,90 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } +#if CONFIG_NON_GREEDY_MV +double vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv); + double best_sad; + int i, j; + vpx_clear_system_state(); + { + const double mv_dist = + fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride); + const double mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + best_sad = mv_dist + lambda * mv_cost; + } + + for (i = 0; i < search_range; i++) { + int best_site = -1; + const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) & + ((best_full_mv->row + 1) < x->mv_limits.row_max) & + ((best_full_mv->col - 1) > x->mv_limits.col_min) & + ((best_full_mv->col + 1) < x->mv_limits.col_max); + + if (all_in) { + unsigned int sads[4]; + const uint8_t *const positions[4] = { best_address - in_what->stride, + best_address - 1, best_address + 1, + best_address + in_what->stride }; + + fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + const double mv_dist = sads[j]; + const double mv_cost = + 
vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + const double thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + const double mv_dist = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + const double mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num) / + (double)(1 << LOG2_PRECISION); + const double thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_full_mv->row += neighbors[best_site].row; + best_full_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, best_full_mv); + } + } + + return best_sad; +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2167,14 +2982,16 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, return best_sad; } -int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - MV *mvp_full, int step_param, int search_method, - int error_per_bit, int *cost_list, const MV *ref_mv, - MV *tmp_mv, int var_max, int rd) { +int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, + BLOCK_SIZE bsize, MV *mvp_full, int step_param, + int search_method, int error_per_bit, int *cost_list, + const MV *ref_mv, MV *tmp_mv, int var_max, int rd) { const SPEED_FEATURES *const sf = &cpi->sf; const SEARCH_METHODS method = (SEARCH_METHODS)search_method; - vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; + const vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; + int run_exhaustive_search = 0; + if (cost_list) { cost_list[0] = INT_MAX; cost_list[1] = INT_MAX; @@ -2205,35 +3022,38 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, fn_ptr, 1, ref_mv, tmp_mv); break; case NSTEP: + case MESH: var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); - - // Should we allow a follow on exhaustive search? - if ((sf->exhaustive_searches_thresh < INT_MAX) && - !cpi->rc.is_src_frame_alt_ref) { - int64_t exhuastive_thr = sf->exhaustive_searches_thresh; - exhuastive_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - // Threshold variance for an exhaustive full search. 
- if (var > exhuastive_thr) { - int var_ex; - MV tmp_mv_ex; - var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, - cost_list, fn_ptr, ref_mv, &tmp_mv_ex); - - if (var_ex < var) { - var = var_ex; - *tmp_mv = tmp_mv_ex; - } - } - } break; - default: assert(0 && "Invalid search method."); + default: assert(0 && "Unknown search method"); + } + + if (method == NSTEP) { + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhaustive_thr = + sf->exhaustive_searches_thresh >> + (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); + if (var > exhaustive_thr) run_exhaustive_search = 1; + } + } else if (method == MESH) { + run_exhaustive_search = 1; + } + + if (run_exhaustive_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list, + fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + *tmp_mv = tmp_mv_ex; + } } - if (method != NSTEP && rd && var < var_max) + if (method != NSTEP && method != MESH && rd && var < var_max) var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); return var; @@ -2274,7 +3094,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, (void)tc; \ (void)sse; \ (void)thismse; \ - (void)cost_list; + (void)cost_list; \ + (void)use_accurate_subpel_search; // Return the maximum MV. uint32_t vp9_return_max_sub_pixel_mv( @@ -2282,7 +3103,7 @@ uint32_t vp9_return_max_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)minr; @@ -2304,7 +3125,7 @@ uint32_t vp9_return_min_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)maxr; diff --git a/libs/libvpx/vp9/encoder/vp9_mcomp.h b/libs/libvpx/vp9/encoder/vp9_mcomp.h index b8db2c35368..cafa2d15046 100644 --- a/libs/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libs/libvpx/vp9/encoder/vp9_mcomp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MCOMP_H_ -#define VP9_ENCODER_VP9_MCOMP_H_ +#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_ +#define VPX_VP9_ENCODER_VP9_MCOMP_H_ #include "vp9/encoder/vp9_block.h" #include "vpx_dsp/variance.h" @@ -38,6 +38,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, + const MV *mv) { + return &buf->buf[mv->row * buf->stride + mv->col]; +} + void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride); void vp9_init3smotion_compensation(search_site_config *cfg, int stride); @@ -59,14 +64,15 @@ struct SPEED_FEATURES; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, + int error_per_bit, int search_range, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); + int mi_row, int mi_col, + const MV *ref_mv); typedef uint32_t(fractional_mv_step_fp)( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, @@ -74,7 +80,7 @@ typedef uint32_t(fractional_mv_step_fp)( int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h); + int h, int use_accurate_subpel_search); extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; @@ -106,7 +112,11 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; -int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, +// "mvp_full" is the MV search starting point; +// "ref_mv" is the context reference MV; +// "tmp_mv" is the searched best MV. +int vp9_full_pixel_search(const struct VP9_COMP *const cpi, + const MACROBLOCK *const x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int search_method, int error_per_bit, int *cost_list, const MV *ref_mv, MV *tmp_mv, int var_max, int rd); @@ -115,8 +125,60 @@ void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, const MvLimits *umv_window_limits, const MV *ref_mv); +#if CONFIG_NON_GREEDY_MV +#define NB_MVS_NUM 4 +struct TplDepStats; +double vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + int lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num); + +double vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, int lambda, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv); + +int64_t vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, + int mv_num); +static INLINE MV get_full_mv(const MV *mv) { + MV out_mv; + out_mv.row = mv->row >> 3; + out_mv.col = mv->col >> 3; + return out_mv; +} +struct TplDepFrame; +void vp9_prepare_nb_full_mvs(const struct TplDepFrame *tpl_frame, int mi_row, + int mi_col, int rf_idx, BLOCK_SIZE bsize, + int_mv *nb_full_mvs); + +static INLINE BLOCK_SIZE get_square_block_size(BLOCK_SIZE bsize) { + BLOCK_SIZE square_bsize; + switch (bsize) { + case BLOCK_4X4: + case BLOCK_4X8: + case BLOCK_8X4: square_bsize = BLOCK_4X4; break; + case BLOCK_8X8: + case BLOCK_8X16: + case BLOCK_16X8: square_bsize = BLOCK_8X8; break; + case BLOCK_16X16: + case BLOCK_16X32: + case BLOCK_32X16: square_bsize = BLOCK_16X16; break; + case BLOCK_32X32: + case BLOCK_32X64: + case BLOCK_64X32: + case BLOCK_64X64: square_bsize = BLOCK_32X32; break; + default: + square_bsize = BLOCK_INVALID; + assert(0 && "ERROR: invalid block size"); + break; + } + return square_bsize; +} +#endif // CONFIG_NON_GREEDY_MV #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MCOMP_H_ +#endif // VPX_VP9_ENCODER_VP9_MCOMP_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.c b/libs/libvpx/vp9/encoder/vp9_multi_thread.c index da06fb151d8..c66c0354928 100644 --- a/libs/libvpx/vp9/encoder/vp9_multi_thread.c +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.c @@ -13,6 +13,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" void 
*vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, int tile_id) { @@ -50,6 +51,20 @@ void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, return job_info; } +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile) { + VP9_COMMON *const cm = &cpi->common; + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + + this_tile->row_base_thresh_freq_fact = + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; +} + void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { struct VP9Common *cm = &cpi->common; MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; @@ -59,6 +74,8 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; int jobs_per_tile_col, total_jobs; + // Allocate memory that is large enough for all row_mt stages. First pass + // uses 16x16 block size. jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); // Calculate the total number of jobs total_jobs = jobs_per_tile_col * tile_cols; @@ -83,14 +100,11 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_col]; vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); if (cpi->sf.adaptive_rd_thresh_row_mt) { - const int sb_rows = - (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; - int i; - this_tile->row_base_thresh_freq_fact = - (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, - sizeof(*(this_tile->row_base_thresh_freq_fact))); - for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) - this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); } } @@ -146,11 +160,9 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + tile_col]; - if (cpi->sf.adaptive_rd_thresh_row_mt) { - if (this_tile->row_base_thresh_freq_fact != NULL) { - vpx_free(this_tile->row_base_thresh_freq_fact); - this_tile->row_base_thresh_freq_fact = NULL; - } + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; } } } @@ -219,11 +231,19 @@ void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; JobQueue *job_queue = multi_thread_ctxt->job_queue; const int tile_cols = 1 << cm->log2_tile_cols; - int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs; + int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs; const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; int tile_col, i; - jobs_per_tile_col = (job_type != ENCODE_JOB) ? 
cm->mb_rows : sb_rows; + switch (job_type) { + case ENCODE_JOB: jobs_per_tile_col = sb_rows; break; + case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break; + case ARNR_JOB: + jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT); + break; + default: assert(0); + } + total_jobs = jobs_per_tile_col * tile_cols; multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; diff --git a/libs/libvpx/vp9/encoder/vp9_multi_thread.h b/libs/libvpx/vp9/encoder/vp9_multi_thread.h index bfc0c0ae4f6..a2276f4fe63 100644 --- a/libs/libvpx/vp9/encoder/vp9_multi_thread.h +++ b/libs/libvpx/vp9/encoder/vp9_multi_thread.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H -#define VP9_ENCODER_VP9_MULTI_THREAD_H +#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ +#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_job_queue.h" @@ -29,10 +29,13 @@ void vp9_multi_thread_tile_init(VP9_COMP *cpi); void vp9_row_mt_mem_alloc(VP9_COMP *cpi); +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile); + void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, int *tile_completion_status, int *cur_tile_id, int tile_cols); -#endif // VP9_ENCODER_VP9_MULTI_THREAD_H +#endif // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c index 276a0c78525..9696529c504 100644 --- a/libs/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -32,7 +32,7 @@ static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; - ne->level = kLowLow; + ne->level = (width * height < 1280 * 720) ? kLowLow : kLow; ne->value = 0; ne->count = 0; ne->thresh = 90; @@ -46,6 +46,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->thresh = 115; } ne->num_frames_estimate = 15; + ne->adapt_thresh = (3 * ne->thresh) >> 1; } static int enable_noise_estimation(VP9_COMP *const cpi) { @@ -97,7 +98,7 @@ NOISE_LEVEL vp9_noise_estimate_extract_level(NOISE_ESTIMATE *const ne) { } else { if (ne->value > ne->thresh) noise_level = kMedium; - else if (ne->value > ((9 * ne->thresh) >> 4)) + else if (ne->value > (ne->thresh >> 1)) noise_level = kLow; else noise_level = kLowLow; @@ -112,10 +113,6 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Estimate of noise level every frame_period frames. int frame_period = 8; int thresh_consec_zeromv = 6; - unsigned int thresh_sum_diff = 100; - unsigned int thresh_sum_spatial = (200 * 200) << 8; - unsigned int thresh_spatial_var = (32 * 32) << 8; - int min_blocks_estimate = cm->mi_rows * cm->mi_cols >> 7; int frame_counter = cm->current_video_frame; // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; @@ -124,11 +121,8 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { last_source = &cpi->denoiser.last_source; // Tune these thresholds for different resolutions when denoising is // enabled. 
- if (cm->width > 640 && cm->width < 1920) { - thresh_consec_zeromv = 4; - thresh_sum_diff = 200; - thresh_sum_spatial = (120 * 120) << 8; - thresh_spatial_var = (48 * 48) << 8; + if (cm->width > 640 && cm->width <= 1920) { + thresh_consec_zeromv = 2; } } #endif @@ -148,8 +142,10 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cm->current_video_frame > 60 && - cpi->rc.avg_frame_low_motion < (low_res ? 70 : 50)) { + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->rc.avg_frame_low_motion < (low_res ? 60 : 40)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; ne->count = 0; @@ -157,17 +153,19 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->svc.current_superframe > 1) { - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + vp9_denoiser_set_noise_level(cpi, ne->level); copy_frame(&cpi->denoiser.last_source, cpi->Source); } #endif return; } else { - int num_samples = 0; - uint64_t avg_est = 0; + unsigned int bin_size = 100; + unsigned int hist[MAX_VAR_HIST_BINS] = { 0 }; + unsigned int hist_avg[MAX_VAR_HIST_BINS]; + unsigned int max_bin = 0; + unsigned int max_bin_count = 0; + unsigned int bin_cnt; int bsize = BLOCK_16X16; - static const unsigned char const_source[16] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; // Loop over sub-sample of 16x16 blocks of frame, and for blocks that have // been encoded as zero/small mv at least x consecutive frames, compute // the variance to update estimate of noise in the source. @@ -207,8 +205,11 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Only consider blocks that are likely steady background. i.e, have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. - if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) { + // 4 sub-blocks for 16x16 block. And exclude this frame if + // high_source_sad is true (i.e., scene/content change). + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv && + !cpi->rc.high_source_sad && + !cpi->svc.high_source_sad_superframe) { int is_skin = 0; if (cpi->use_skin_detection) { is_skin = @@ -217,25 +218,15 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { } if (!is_skin) { unsigned int sse; - // Compute variance. + // Compute variance between co-located blocks from current and + // last input frames. unsigned int variance = cpi->fn_ptr[bsize].vf( src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting - // change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += low_res ? 
variance >> 4 - : variance / ((spatial_variance >> 9) + 1); - num_samples++; - } - } + unsigned int hist_index = variance / bin_size; + if (hist_index < MAX_VAR_HIST_BINS) + hist[hist_index]++; + else if (hist_index < 3 * (MAX_VAR_HIST_BINS >> 1)) + hist[MAX_VAR_HIST_BINS - 1]++; // Account for the tail } } } @@ -251,25 +242,57 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { } ne->last_w = cm->width; ne->last_h = cm->height; - // Update noise estimate if we have at a minimum number of block samples, - // and avg_est > 0 (avg_est == 0 can happen if the application inputs - // duplicate frames). - if (num_samples > min_blocks_estimate && avg_est > 0) { - // Normalize. - avg_est = avg_est / num_samples; - // Update noise estimate. - ne->value = (int)((15 * ne->value + avg_est) >> 4); + // Adjust histogram to account for effect that histogram flattens + // and shifts to zero as scene darkens. + if (hist[0] > 10 && (hist[MAX_VAR_HIST_BINS - 1] > hist[0] >> 2)) { + hist[0] = 0; + hist[1] >>= 2; + hist[2] >>= 2; + hist[3] >>= 2; + hist[4] >>= 1; + hist[5] >>= 1; + hist[6] = 3 * hist[6] >> 1; + hist[MAX_VAR_HIST_BINS - 1] >>= 1; + } + + // Average hist[] and find largest bin + for (bin_cnt = 0; bin_cnt < MAX_VAR_HIST_BINS; bin_cnt++) { + if (bin_cnt == 0) + hist_avg[bin_cnt] = (hist[0] + hist[1] + hist[2]) / 3; + else if (bin_cnt == MAX_VAR_HIST_BINS - 1) + hist_avg[bin_cnt] = hist[MAX_VAR_HIST_BINS - 1] >> 2; + else if (bin_cnt == MAX_VAR_HIST_BINS - 2) + hist_avg[bin_cnt] = (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + + (hist[bin_cnt + 1] >> 1) + 2) >> + 2; + else + hist_avg[bin_cnt] = + (hist[bin_cnt - 1] + 2 * hist[bin_cnt] + hist[bin_cnt + 1] + 2) >> + 2; + + if (hist_avg[bin_cnt] > max_bin_count) { + max_bin_count = hist_avg[bin_cnt]; + max_bin = bin_cnt; + } + } + + // Scale by 40 to work with existing thresholds + ne->value = (int)((3 * ne->value + max_bin * 40) >> 2); + // Quickly increase VNR strength when the noise level increases suddenly. + if (ne->level < kMedium && ne->value > ne->adapt_thresh) { + ne->count = ne->num_frames_estimate; + } else { ne->count++; - if (ne->count == ne->num_frames_estimate) { - // Reset counter and check noise level condition. - ne->num_frames_estimate = 30; - ne->count = 0; - ne->level = vp9_noise_estimate_extract_level(ne); + } + if (ne->count == ne->num_frames_estimate) { + // Reset counter and check noise level condition. + ne->num_frames_estimate = 30; + ne->count = 0; + ne->level = vp9_noise_estimate_extract_level(ne); #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) + vp9_denoiser_set_noise_level(cpi, ne->level); #endif - } } } #if CONFIG_VP9_TEMPORAL_DENOISING diff --git a/libs/libvpx/vp9/encoder/vp9_noise_estimate.h b/libs/libvpx/vp9/encoder/vp9_noise_estimate.h index 335cdbe6434..7fc94ff8c95 100644 --- a/libs/libvpx/vp9/encoder/vp9_noise_estimate.h +++ b/libs/libvpx/vp9/encoder/vp9_noise_estimate.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_ -#define VP9_ENCODER_NOISE_ESTIMATE_H_ +#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ +#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -23,6 +23,8 @@ extern "C" { #endif +#define MAX_VAR_HIST_BINS 20 + typedef enum noise_level { kLowLow, kLow, kMedium, kHigh } NOISE_LEVEL; typedef struct noise_estimate { @@ -30,6 +32,7 @@ typedef struct noise_estimate { NOISE_LEVEL level; int value; int thresh; + int adapt_thresh; int count; int last_w; int last_h; @@ -48,4 +51,4 @@ void vp9_update_noise_estimate(struct VP9_COMP *const cpi); } // extern "C" #endif -#endif // VP9_ENCODER_NOISE_ESTIMATE_H_ +#endif // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_partition_models.h b/libs/libvpx/vp9/encoder/vp9_partition_models.h new file mode 100644 index 00000000000..09c0e30a47d --- /dev/null +++ b/libs/libvpx/vp9/encoder/vp9_partition_models.h @@ -0,0 +1,975 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ +#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +// Neural net model config. It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. +typedef struct { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. + const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +} NN_CONFIG; + +// Partition search breakout model. 
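+// Each table below is indexed first by resolution context and then by
+// quantizer context, and stores FEATURES linear weights followed by one
+// bias term; the breakout decision is presumably a thresholded linear
+// score computed from those weights and a block's features.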
+#define FEATURES 4 +#define Q_CTX 3 +#define RESOLUTION_CTX 2 +static const float + vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.016673f, + -0.001025f, + -0.000032f, + 0.000833f, + 1.94261885f - 2.1f, + }, + { + -0.160867f, + -0.002101f, + 0.000011f, + 0.002448f, + 1.65738142f - 2.5f, + }, + { + -0.628934f, + -0.011459f, + -0.000009f, + 0.013833f, + 1.47982645f - 1.6f, + }, + }, + { + { + -0.064309f, + -0.006121f, + 0.000232f, + 0.005778f, + 0.7989465f - 5.0f, + }, + { + -0.314957f, + -0.009346f, + -0.000225f, + 0.010072f, + 2.80695581f - 5.5f, + }, + { + -0.635535f, + -0.015135f, + 0.000091f, + 0.015247f, + 2.90381241f - 5.0f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.010554f, + -0.003081f, + -0.000134f, + 0.004491f, + 1.68445992f - 3.5f, + }, + { + -0.051489f, + -0.007609f, + 0.000016f, + 0.009792f, + 1.28089404f - 2.5f, + }, + { + -0.163097f, + -0.013081f, + 0.000022f, + 0.019006f, + 1.36129403f - 3.2f, + }, + }, + { + { + -0.024629f, + -0.006492f, + -0.000254f, + 0.004895f, + 1.27919173f - 4.5f, + }, + { + -0.083936f, + -0.009827f, + -0.000200f, + 0.010399f, + 2.73731065f - 4.5f, + }, + { + -0.279052f, + -0.013334f, + 0.000289f, + 0.023203f, + 2.43595719f - 3.5f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.013154f, + -0.002404f, + -0.000977f, + 0.008450f, + 2.57404566f - 5.5f, + }, + { + -0.019146f, + -0.004018f, + 0.000064f, + 0.008187f, + 2.15043926f - 2.5f, + }, + { + -0.075755f, + -0.010858f, + 0.000030f, + 0.024505f, + 2.06848121f - 2.5f, + }, + }, + { + { + -0.007636f, + -0.002751f, + -0.000682f, + 0.005968f, + 0.19225763f - 4.5f, + }, + { + -0.047306f, + -0.009113f, + -0.000518f, + 0.016007f, + 2.61068869f - 4.0f, + }, + { + -0.069336f, + -0.010448f, + -0.001120f, + 0.023083f, + 1.47591054f - 5.5f, + }, + }, + }; + +static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX] + [FEATURES + 1] = { + { + { + -0.011807f, + -0.009873f, + -0.000931f, + 0.034768f, + 1.32254851f - 2.0f, + }, + { + -0.003861f, + -0.002701f, + 0.000100f, + 0.013876f, + 1.96755111f - 1.5f, + }, + { + -0.013522f, + -0.008677f, + -0.000562f, + 0.034468f, + 1.53440356f - 1.5f, + }, + }, + { + { + -0.003221f, + -0.002125f, + 0.000993f, + 0.012768f, + 0.03541421f - 2.0f, + }, + { + -0.006069f, + -0.007335f, + 0.000229f, + 0.026104f, + 0.17135315f - 1.5f, + }, + { + -0.039894f, + -0.011419f, + 0.000070f, + 0.061817f, + 0.6739977f - 1.5f, + }, + }, + }; +#undef FEATURES +#undef Q_CTX +#undef RESOLUTION_CTX + +// Rectangular partition search pruning model. 
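+// Each config below describes a one-hidden-layer network: FEATURES inputs,
+// NODES hidden units, and LABELS output scores, one per rectangular
+// partition choice (see the defines just below). The sketch that follows is
+// illustrative only and not part of the patch; it assumes ReLU hidden units
+// and a linear output layer, the usual convention for these models.
+#if 0
+static void nn_eval_sketch(const NN_CONFIG *cfg, const float *features,
+                           float *scores) {
+  float hidden[NN_MAX_NODES_PER_LAYER];
+  int i, k;
+  // Hidden layer: per-node weighted sum of the inputs plus a bias,
+  // clamped at zero (ReLU).
+  for (i = 0; i < cfg->num_hidden_nodes[0]; ++i) {
+    float val = cfg->bias[0][i];
+    for (k = 0; k < cfg->num_inputs; ++k)
+      val += cfg->weights[0][i * cfg->num_inputs + k] * features[k];
+    hidden[i] = val > 0.0f ? val : 0.0f;
+  }
+  // Output layer: one linear combination of the hidden activations per
+  // output label.
+  for (k = 0; k < cfg->num_outputs; ++k) {
+    float val = cfg->bias[1][k];
+    for (i = 0; i < cfg->num_hidden_nodes[0]; ++i)
+      val += cfg->weights[1][k * cfg->num_hidden_nodes[0] + i] * hidden[i];
+    scores[k] = val;
+  }
+}
+#endif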
+#define FEATURES 8 +#define LABELS 4 +#define NODES 16 +static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * NODES] = { + -0.432522f, 0.133070f, -0.169187f, 0.768340f, 0.891228f, 0.554458f, + 0.356000f, 0.403621f, 0.809165f, 0.778214f, -0.520357f, 0.301451f, + -0.386972f, -0.314402f, 0.021878f, 1.148746f, -0.462258f, -0.175524f, + -0.344589f, -0.475159f, -0.232322f, 0.471147f, -0.489948f, 0.467740f, + -0.391550f, 0.208601f, 0.054138f, 0.076859f, -0.309497f, -0.095927f, + 0.225917f, 0.011582f, -0.520730f, -0.585497f, 0.174036f, 0.072521f, + 0.120771f, -0.517234f, -0.581908f, -0.034003f, -0.694722f, -0.364368f, + 0.290584f, 0.038373f, 0.685654f, 0.394019f, 0.759667f, 1.257502f, + -0.610516f, -0.185434f, 0.211997f, -0.172458f, 0.044605f, 0.145316f, + -0.182525f, -0.147376f, 0.578742f, 0.312412f, -0.446135f, -0.389112f, + 0.454033f, 0.260490f, 0.664285f, 0.395856f, -0.231827f, 0.215228f, + 0.014856f, -0.395462f, 0.479646f, -0.391445f, -0.357788f, 0.166238f, + -0.056818f, -0.027783f, 0.060880f, -1.604710f, 0.531268f, 0.282184f, + 0.714944f, 0.093523f, -0.218312f, -0.095546f, -0.285621f, -0.190871f, + -0.448340f, -0.016611f, 0.413913f, -0.286720f, -0.158828f, -0.092635f, + -0.279551f, 0.166509f, -0.088162f, 0.446543f, -0.276830f, -0.065642f, + -0.176346f, -0.984754f, 0.338738f, 0.403809f, 0.738065f, 1.154439f, + 0.750764f, 0.770959f, -0.269403f, 0.295651f, -0.331858f, 0.367144f, + 0.279279f, 0.157419f, -0.348227f, -0.168608f, -0.956000f, -0.647136f, + 0.250516f, 0.858084f, 0.809802f, 0.492408f, 0.804841f, 0.282802f, + 0.079395f, -0.291771f, -0.024382f, -1.615880f, -0.445166f, -0.407335f, + -0.483044f, 0.141126f, +}; + +static const float vp9_rect_part_nn_bias_16_layer0[NODES] = { + 0.275384f, -0.053745f, 0.000000f, 0.000000f, -0.178103f, 0.513965f, + -0.161352f, 0.228551f, 0.000000f, 1.013712f, 0.000000f, 0.000000f, + -1.144009f, -0.000006f, -0.241727f, 2.048764f, +}; + +static const float vp9_rect_part_nn_weights_16_layer1[NODES * LABELS] = { + -1.435278f, 2.204691f, -0.410718f, 0.202708f, 0.109208f, 1.059142f, + -0.306360f, 0.845906f, 0.489654f, -1.121915f, -0.169133f, -0.003385f, + 0.660590f, -0.018711f, 1.227158f, -2.967504f, 1.407345f, -1.293243f, + -0.386921f, 0.300492f, 0.338824f, -0.083250f, -0.069454f, -1.001827f, + -0.327891f, 0.899353f, 0.367397f, -0.118601f, -0.171936f, -0.420646f, + -0.803319f, 2.029634f, 0.940268f, -0.664484f, 0.339916f, 0.315944f, + 0.157374f, -0.402482f, -0.491695f, 0.595827f, 0.015031f, 0.255887f, + -0.466327f, -0.212598f, 0.136485f, 0.033363f, -0.796921f, 1.414304f, + -0.282185f, -2.673571f, -0.280994f, 0.382658f, -0.350902f, 0.227926f, + 0.062602f, -1.000199f, 0.433731f, 1.176439f, -0.163216f, -0.229015f, + -0.640098f, -0.438852f, -0.947700f, 2.203434f, +}; + +static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = { + -0.875510f, + 0.982408f, + 0.560854f, + -0.415209f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_16_layer0, + vp9_rect_part_nn_weights_16_layer1, + }, + { + vp9_rect_part_nn_bias_16_layer0, + vp9_rect_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * NODES] = { + -0.147312f, -0.753248f, 0.540206f, 0.661415f, 0.484117f, -0.341609f, + 0.016183f, 0.064177f, 0.781580f, 0.902232f, -0.505342f, 0.325183f, + -0.231072f, -0.120107f, -0.076216f, 0.120038f, 0.403695f, -0.463301f, + -0.192158f, 0.407442f, 0.106633f, 1.072371f, 
-0.446779f, 0.467353f, + 0.318812f, -0.505996f, -0.008768f, -0.239598f, 0.085480f, 0.284640f, + -0.365045f, -0.048083f, -0.112090f, -0.067089f, 0.304138f, -0.228809f, + 0.383651f, -0.196882f, 0.477039f, -0.217978f, -0.506931f, -0.125675f, + 0.050456f, 1.086598f, 0.732128f, 0.326941f, 0.103952f, 0.121769f, + -0.154487f, -0.255514f, 0.030591f, -0.382797f, -0.019981f, -0.326570f, + 0.149691f, -0.435633f, -0.070795f, 0.167691f, 0.251413f, -0.153405f, + 0.160347f, 0.455107f, -0.968580f, -0.575879f, 0.623115f, -0.069793f, + -0.379768f, -0.965807f, -0.062057f, 0.071312f, 0.457098f, 0.350372f, + -0.460659f, -0.985393f, 0.359963f, -0.093677f, 0.404272f, -0.326896f, + -0.277752f, 0.609322f, -0.114193f, -0.230701f, 0.089208f, 0.645381f, + 0.494485f, 0.467876f, -0.166187f, 0.251044f, -0.394661f, 0.192895f, + -0.344777f, -0.041893f, -0.111163f, 0.066347f, 0.378158f, -0.455465f, + 0.339839f, -0.418207f, -0.356515f, -0.227536f, -0.211091f, -0.122945f, + 0.361772f, -0.338095f, 0.004564f, -0.398510f, 0.060876f, -2.132504f, + -0.086776f, -0.029166f, 0.039241f, 0.222534f, -0.188565f, -0.288792f, + -0.160789f, -0.123905f, 0.397916f, -0.063779f, 0.167210f, -0.445004f, + 0.056889f, 0.207280f, 0.000101f, 0.384507f, -1.721239f, -2.036402f, + -2.084403f, -2.060483f, +}; + +static const float vp9_rect_part_nn_bias_32_layer0[NODES] = { + -0.859251f, -0.109938f, 0.091838f, 0.187817f, -0.728265f, 0.253080f, + 0.000000f, -0.357195f, -0.031290f, -1.373237f, -0.761086f, 0.000000f, + -0.024504f, 1.765711f, 0.000000f, 1.505390f, +}; + +static const float vp9_rect_part_nn_weights_32_layer1[NODES * LABELS] = { + 0.680940f, 1.367178f, 0.403075f, 0.029957f, 0.500917f, 1.407776f, + -0.354002f, 0.011667f, 1.663767f, 0.959155f, 0.428323f, -0.205345f, + -0.081850f, -3.920103f, -0.243802f, -4.253933f, -0.034020f, -1.361057f, + 0.128236f, -0.138422f, -0.025790f, -0.563518f, -0.148715f, -0.344381f, + -1.677389f, -0.868332f, -0.063792f, 0.052052f, 0.359591f, 2.739808f, + -0.414304f, 3.036597f, -0.075368f, -1.019680f, 0.642501f, 0.209779f, + -0.374539f, -0.718294f, -0.116616f, -0.043212f, -1.787809f, -0.773262f, + 0.068734f, 0.508309f, 0.099334f, 1.802239f, -0.333538f, 2.708645f, + -0.447682f, -2.355555f, -0.506674f, -0.061028f, -0.310305f, -0.375475f, + 0.194572f, 0.431788f, -0.789624f, -0.031962f, 0.358353f, 0.382937f, + 0.232002f, 2.321813f, -0.037523f, 2.104652f, +}; + +static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = { + -0.693383f, + 0.773661f, + 0.426878f, + -0.070619f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_32_layer0, + vp9_rect_part_nn_weights_32_layer1, + }, + { + vp9_rect_part_nn_bias_32_layer0, + vp9_rect_part_nn_bias_32_layer1, + }, +}; +#undef NODES + +#define NODES 24 +static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * NODES] = { + 0.024671f, -0.220610f, -0.284362f, -0.069556f, -0.315700f, 0.187861f, + 0.139782f, 0.063110f, 0.796561f, 0.172868f, -0.662194f, -1.393074f, + 0.085003f, 0.393381f, 0.358477f, -0.187268f, -0.370745f, 0.218287f, + 0.027271f, -0.254089f, -0.048236f, -0.459137f, 0.253171f, 0.122598f, + -0.550107f, -0.568456f, 0.159866f, -0.246534f, 0.096384f, -0.255460f, + 0.077864f, -0.334837f, 0.026921f, -0.697252f, 0.345262f, 1.343578f, + 0.815984f, 1.118211f, 1.574016f, 0.578476f, -0.285967f, -0.508672f, + 0.118137f, 0.037695f, 1.540510f, 1.256648f, 1.163819f, 1.172027f, + 0.661551f, -0.111980f, -0.434204f, -0.894217f, 
0.570524f, 0.050292f, + -0.113680f, 0.000784f, -0.211554f, -0.369394f, 0.158306f, -0.512505f, + -0.238696f, 0.091498f, -0.448490f, -0.491268f, -0.353112f, -0.303315f, + -0.428438f, 0.127998f, -0.406790f, -0.401786f, -0.279888f, -0.384223f, + 0.026100f, 0.041621f, -0.315818f, -0.087888f, 0.353497f, 0.163123f, + -0.380128f, -0.090334f, -0.216647f, -0.117849f, -0.173502f, 0.301871f, + 0.070854f, 0.114627f, -0.050545f, -0.160381f, 0.595294f, 0.492696f, + -0.453858f, -1.154139f, 0.126000f, 0.034550f, 0.456665f, -0.236618f, + -0.112640f, 0.050759f, -0.449162f, 0.110059f, 0.147116f, 0.249358f, + -0.049894f, 0.063351f, -0.004467f, 0.057242f, -0.482015f, -0.174335f, + -0.085617f, -0.333808f, -0.358440f, -0.069006f, 0.099260f, -1.243430f, + -0.052963f, 0.112088f, -2.661115f, -2.445893f, -2.688174f, -2.624232f, + 0.030494f, 0.161311f, 0.012136f, 0.207564f, -2.776856f, -2.791940f, + -2.623962f, -2.918820f, 1.231619f, -0.376692f, -0.698078f, 0.110336f, + -0.285378f, 0.258367f, -0.180159f, -0.376608f, -0.034348f, -0.130206f, + 0.160020f, 0.852977f, 0.580573f, 1.450782f, 1.357596f, 0.787382f, + -0.544004f, -0.014795f, 0.032121f, -0.557696f, 0.159994f, -0.540908f, + 0.180380f, -0.398045f, 0.705095f, 0.515103f, -0.511521f, -1.271374f, + -0.231019f, 0.423647f, 0.064907f, -0.255338f, -0.877748f, -0.667205f, + 0.267847f, 0.135229f, 0.617844f, 1.349849f, 1.012623f, 0.730506f, + -0.078571f, 0.058401f, 0.053221f, -2.426146f, -0.098808f, -0.138508f, + -0.153299f, 0.149116f, -0.444243f, 0.301807f, 0.065066f, 0.092929f, + -0.372784f, -0.095540f, 0.192269f, 0.237894f, 0.080228f, -0.214074f, + -0.011426f, -2.352367f, -0.085394f, -0.190361f, -0.001177f, 0.089197f, +}; + +static const float vp9_rect_part_nn_bias_64_layer0[NODES] = { + 0.000000f, -0.057652f, -0.175413f, -0.175389f, -1.084097f, -1.423801f, + -0.076307f, -0.193803f, 0.000000f, -0.066474f, -0.050318f, -0.019832f, + -0.038814f, -0.144184f, 2.652451f, 2.415006f, 0.197464f, -0.729842f, + -0.173774f, 0.239171f, 0.486425f, 2.463304f, -0.175279f, 2.352637f, +}; + +static const float vp9_rect_part_nn_weights_64_layer1[NODES * LABELS] = { + -0.063237f, 1.925696f, -0.182145f, -0.226687f, 0.602941f, -0.941140f, + 0.814598f, -0.117063f, 0.282988f, 0.066369f, 0.096951f, 1.049735f, + -0.188188f, -0.281227f, -4.836746f, -5.047797f, 0.892358f, 0.417145f, + -0.279849f, 1.335945f, 0.660338f, -2.757938f, -0.115714f, -1.862183f, + -0.045980f, -1.597624f, -0.586822f, -0.615589f, -0.330537f, 1.068496f, + -0.167290f, 0.141290f, -0.112100f, 0.232761f, 0.252307f, -0.399653f, + 0.353118f, 0.241583f, 2.635241f, 4.026119f, -1.137327f, -0.052446f, + -0.139814f, -1.104256f, -0.759391f, 2.508457f, -0.526297f, 2.095348f, + -0.444473f, -1.090452f, 0.584122f, 0.468729f, -0.368865f, 1.041425f, + -1.079504f, 0.348837f, 0.390091f, 0.416191f, 0.212906f, -0.660255f, + 0.053630f, 0.209476f, 3.595525f, 2.257293f, -0.514030f, 0.074203f, + -0.375862f, -1.998307f, -0.930310f, 1.866686f, -0.247137f, 1.087789f, + 0.100186f, 0.298150f, 0.165265f, 0.050478f, 0.249167f, 0.371789f, + -0.294497f, 0.202954f, 0.037310f, 0.193159f, 0.161551f, 0.301597f, + 0.299286f, 0.185946f, 0.822976f, 2.066130f, -1.724588f, 0.055977f, + -0.330747f, -0.067747f, -0.475801f, 1.555958f, -0.025808f, -0.081516f, +}; + +static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = { + -0.090723f, + 0.894968f, + 0.844754f, + -3.496194f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + 
vp9_rect_part_nn_weights_64_layer0, + vp9_rect_part_nn_weights_64_layer1, + }, + { + vp9_rect_part_nn_bias_64_layer0, + vp9_rect_part_nn_bias_64_layer1, + }, +}; +#undef FEATURES +#undef LABELS +#undef NODES + +#define FEATURES 7 +// Partition pruning model(neural nets). +static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = { + -3.571348f, 0.014835f, -3.255393f, -0.098090f, -0.013120f, 0.000221f, + 0.056273f, 0.190179f, -0.268130f, -1.828242f, -0.010655f, 0.937244f, + -0.435120f, 0.512125f, 1.610679f, 0.190816f, -0.799075f, -0.377348f, + -0.144232f, 0.614383f, -0.980388f, 1.754150f, -0.185603f, -0.061854f, + -0.807172f, 1.240177f, 1.419531f, -0.438544f, -5.980774f, 0.139045f, + -0.032359f, -0.068887f, -1.237918f, 0.115706f, 0.003164f, 2.924212f, + 1.246838f, -0.035833f, 0.810011f, -0.805894f, 0.010966f, 0.076463f, + -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f, + -0.168961f, -3.326450f, -2.731094f, 0.002518f, 0.018840f, -1.656815f, + 0.068039f, 0.010586f, +}; + +static const float vp9_partition_nn_bias_64x64_layer0[8] = { + -3.469882f, 0.683989f, 0.194010f, 0.313782f, + -3.153335f, 2.245849f, -1.946190f, -3.740020f, +}; + +static const float vp9_partition_nn_weights_64x64_layer1[8] = { + -8.058566f, 0.108306f, -0.280620f, -0.818823f, + -6.445117f, 0.865364f, -1.127127f, -8.808660f, +}; + +static const float vp9_partition_nn_bias_64x64_layer1[1] = { + 6.46909416f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_64x64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_64x64_layer0, + vp9_partition_nn_weights_64x64_layer1, + }, + { + vp9_partition_nn_bias_64x64_layer0, + vp9_partition_nn_bias_64x64_layer1, + }, +}; + +static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = { + -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f, 0.027221f, + -0.039137f, -0.907724f, -3.151662f, 0.007106f, 0.018726f, -0.534928f, + 0.022744f, 0.000159f, -1.717189f, -3.229031f, -0.027311f, 0.269863f, + -0.400747f, -0.394366f, -0.108878f, 0.603027f, 0.455369f, -0.197170f, + 1.241746f, -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f, + -0.138347f, -0.030754f, -0.200774f, 0.453795f, 0.055625f, -3.163116f, + -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f, -0.036439f, + -0.801228f, 0.313409f, -0.159942f, 0.031267f, 0.886454f, -1.531644f, + -0.089655f, 0.037683f, -0.163441f, -0.130454f, -0.058344f, 0.060011f, + 0.275387f, 1.552226f, +}; + +static const float vp9_partition_nn_bias_32x32_layer0[8] = { + -0.838372f, -2.609089f, -0.055763f, 1.329485f, + -1.297638f, -2.636622f, -0.826909f, 1.012644f, +}; + +static const float vp9_partition_nn_weights_32x32_layer1[8] = { + -1.792632f, -7.322353f, -0.683386f, 0.676564f, + -1.488118f, -7.527719f, 1.240163f, 0.614309f, +}; + +static const float vp9_partition_nn_bias_32x32_layer1[1] = { + 4.97422546f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_32x32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_32x32_layer0, + vp9_partition_nn_weights_32x32_layer1, + }, + { + vp9_partition_nn_bias_32x32_layer0, + vp9_partition_nn_bias_32x32_layer1, + }, +}; + +static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = { + -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f, + 0.130891f, -3.096753f, 0.174968f, -0.188769f, -0.640796f, 1.305661f, + 1.700638f, -0.073806f, -4.006781f, 
-1.630999f, -0.064863f, -0.086410f, + -0.148617f, 0.172733f, -0.018619f, 2.152595f, 0.778405f, -0.156455f, + 0.612995f, -0.467878f, 0.152022f, -0.236183f, 0.339635f, -0.087119f, + -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f, -0.793705f, + -6.399260f, 0.010624f, -0.064199f, -0.650621f, 0.338087f, -0.001531f, + 1.023655f, -3.700272f, -0.055281f, -0.386884f, 0.375504f, -0.898678f, + 0.281156f, -0.314611f, 0.863354f, -0.040582f, -0.145019f, 0.029329f, + -2.197880f, -0.108733f, +}; + +static const float vp9_partition_nn_bias_16x16_layer0[8] = { + 0.411516f, -2.143737f, -3.693192f, 2.123142f, + -1.356910f, -3.561016f, -0.765045f, -2.417082f, +}; + +static const float vp9_partition_nn_weights_16x16_layer1[8] = { + -0.619755f, -2.202391f, -4.337171f, 0.611319f, + 0.377677f, -4.998723f, -1.052235f, 1.949922f, +}; + +static const float vp9_partition_nn_bias_16x16_layer1[1] = { + 3.20981717f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_16x16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_16x16_layer0, + vp9_partition_nn_weights_16x16_layer1, + }, + { + vp9_partition_nn_bias_16x16_layer0, + vp9_partition_nn_bias_16x16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 6 +static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f, + -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f, + -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f, + -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f, + 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f, + -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f, + -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f, + -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f, +}; + +static const float vp9_var_part_nn_bias_64_layer0[8] = { + -0.044396f, -0.938166f, 0.000000f, -0.916375f, + 1.242299f, 0.000000f, -0.405734f, 0.014206f, +}; + +static const float vp9_var_part_nn_weights_64_layer1[8] = { + 1.635945f, 0.979557f, 0.455315f, 1.197199f, + -2.251024f, -0.464953f, 1.378676f, -0.111927f, +}; + +static const float vp9_var_part_nn_bias_64_layer1[1] = { + -0.37972447f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_64_layer0, + vp9_var_part_nn_weights_64_layer1, + }, + { + vp9_var_part_nn_bias_64_layer0, + vp9_var_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f, + 0.323736f, -0.510199f, 2.960693f, 2.937661f, 2.888476f, 2.938315f, + -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f, + 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f, + -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f, + 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f, + -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f, + 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f, +}; + +static const float vp9_var_part_nn_bias_32_layer0[8] = { + 0.368242f, 0.736617f, 0.000000f, 0.757287f, + 0.000000f, 0.613248f, -0.776390f, 0.928497f, +}; + +static const float vp9_var_part_nn_weights_32_layer1[8] = { + 0.939884f, 
-2.420850f, -0.410489f, -0.186690f, + 0.063287f, -0.522011f, 0.484527f, -0.639625f, +}; + +static const float vp9_var_part_nn_bias_32_layer1[1] = { + -0.6455006f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_32_layer0, + vp9_var_part_nn_weights_32_layer1, + }, + { + vp9_var_part_nn_bias_32_layer0, + vp9_var_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f, + -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f, + -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f, + 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f, + 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f, + -0.463801f, 0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f, + -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f, + 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f, +}; + +static const float vp9_var_part_nn_bias_16_layer0[8] = { + 2.901234f, -1.940932f, -0.198970f, -0.406524f, + 0.059422f, -1.879207f, -0.232340f, 2.979821f, +}; + +static const float vp9_var_part_nn_weights_16_layer1[8] = { + -0.528731f, 0.375234f, -0.088422f, 0.668629f, + 0.870449f, 0.578735f, 0.546103f, -1.957207f, +}; + +static const float vp9_var_part_nn_bias_16_layer1[1] = { + -1.95769405f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_16_layer0, + vp9_var_part_nn_weights_16_layer1, + }, + { + vp9_var_part_nn_bias_16_layer0, + vp9_var_part_nn_bias_16_layer1, + }, +}; +#undef FEATURES + +#define FEATURES 12 +#define LABELS 1 +#define NODES 8 +static const float vp9_part_split_nn_weights_64_layer0[FEATURES * NODES] = { + -0.609728f, -0.409099f, -0.472449f, 0.183769f, -0.457740f, 0.081089f, + 0.171003f, 0.578696f, -0.019043f, -0.856142f, 0.557369f, -1.779424f, + -0.274044f, -0.320632f, -0.392531f, -0.359462f, -0.404106f, -0.288357f, + 0.200620f, 0.038013f, -0.430093f, 0.235083f, -0.487442f, 0.424814f, + -0.232758f, -0.442943f, 0.229397f, -0.540301f, -0.648421f, -0.649747f, + -0.171638f, 0.603824f, 0.468497f, -0.421580f, 0.178840f, -0.533838f, + -0.029471f, -0.076296f, 0.197426f, -0.187908f, -0.003950f, -0.065740f, + 0.085165f, -0.039674f, -5.640702f, 1.909538f, -1.434604f, 3.294606f, + -0.788812f, 0.196864f, 0.057012f, -0.019757f, 0.336233f, 0.075378f, + 0.081503f, 0.491864f, -1.899470f, -1.764173f, -1.888137f, -1.762343f, + 0.845542f, 0.202285f, 0.381948f, -0.150996f, 0.556893f, -0.305354f, + 0.561482f, -0.021974f, -0.703117f, 0.268638f, -0.665736f, 1.191005f, + -0.081568f, -0.115653f, 0.272029f, -0.140074f, 0.072683f, 0.092651f, + -0.472287f, -0.055790f, -0.434425f, 0.352055f, 0.048246f, 0.372865f, + 0.111499f, -0.338304f, 0.739133f, 0.156519f, -0.594644f, 0.137295f, + 0.613350f, -0.165102f, -1.003731f, 0.043070f, -0.887896f, -0.174202f, +}; + +static const float vp9_part_split_nn_bias_64_layer0[NODES] = { + 1.182714f, 0.000000f, 0.902019f, 0.953115f, + -1.372486f, -1.288740f, -0.155144f, -3.041362f, +}; + +static const float vp9_part_split_nn_weights_64_layer1[NODES * LABELS] = { + 0.841214f, 0.456016f, 0.869270f, 1.692999f, + -1.700494f, -0.911761f, 0.030111f, -1.447548f, +}; + +static const float 
vp9_part_split_nn_bias_64_layer1[LABELS] = { + 1.17782545f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_64_layer0, + vp9_part_split_nn_weights_64_layer1, + }, + { + vp9_part_split_nn_bias_64_layer0, + vp9_part_split_nn_bias_64_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_32_layer0[FEATURES * NODES] = { + -0.105488f, -0.218662f, 0.010980f, -0.226979f, 0.028076f, 0.743430f, + 0.789266f, 0.031907f, -1.464200f, 0.222336f, -1.068493f, -0.052712f, + -0.176181f, -0.102654f, -0.973932f, -0.182637f, -0.198000f, 0.335977f, + 0.271346f, 0.133005f, 1.674203f, 0.689567f, 0.657133f, 0.283524f, + 0.115529f, 0.738327f, 0.317184f, -0.179736f, 0.403691f, 0.679350f, + 0.048925f, 0.271338f, -1.538921f, -0.900737f, -1.377845f, 0.084245f, + 0.803122f, -0.107806f, 0.103045f, -0.023335f, -0.098116f, -0.127809f, + 0.037665f, -0.523225f, 1.622185f, 1.903999f, 1.358889f, 1.680785f, + 0.027743f, 0.117906f, -0.158810f, 0.057775f, 0.168257f, 0.062414f, + 0.086228f, -0.087381f, -3.066082f, 3.021855f, -4.092155f, 2.550104f, + -0.230022f, -0.207445f, -0.000347f, 0.034042f, 0.097057f, 0.220088f, + -0.228841f, -0.029405f, -1.507174f, -1.455184f, 2.624904f, 2.643355f, + 0.319912f, 0.585531f, -1.018225f, -0.699606f, 1.026490f, 0.169952f, + -0.093579f, -0.142352f, -0.107256f, 0.059598f, 0.043190f, 0.507543f, + -0.138617f, 0.030197f, 0.059574f, -0.634051f, -0.586724f, -0.148020f, + -0.334380f, 0.459547f, 1.620600f, 0.496850f, 0.639480f, -0.465715f, +}; + +static const float vp9_part_split_nn_bias_32_layer0[NODES] = { + -1.125885f, 0.753197f, -0.825808f, 0.004839f, + 0.583920f, 0.718062f, 0.976741f, 0.796188f, +}; + +static const float vp9_part_split_nn_weights_32_layer1[NODES * LABELS] = { + -0.458745f, 0.724624f, -0.479720f, -2.199872f, + 1.162661f, 1.194153f, -0.716896f, 0.824080f, +}; + +static const float vp9_part_split_nn_bias_32_layer1[LABELS] = { + 0.71644074f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_32_layer0, + vp9_part_split_nn_weights_32_layer1, + }, + { + vp9_part_split_nn_bias_32_layer0, + vp9_part_split_nn_bias_32_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_16_layer0[FEATURES * NODES] = { + -0.003629f, -0.046852f, 0.220428f, -0.033042f, 0.049365f, 0.112818f, + -0.306149f, -0.005872f, 1.066947f, -2.290226f, 2.159505f, -0.618714f, + -0.213294f, 0.451372f, -0.199459f, 0.223730f, -0.321709f, 0.063364f, + 0.148704f, -0.293371f, 0.077225f, -0.421947f, -0.515543f, -0.240975f, + -0.418516f, 1.036523f, -0.009165f, 0.032484f, 1.086549f, 0.220322f, + -0.247585f, -0.221232f, -0.225050f, 0.993051f, 0.285907f, 1.308846f, + 0.707456f, 0.335152f, 0.234556f, 0.264590f, -0.078033f, 0.542226f, + 0.057777f, 0.163471f, 0.039245f, -0.725960f, 0.963780f, -0.972001f, + 0.252237f, -0.192745f, -0.836571f, -0.460539f, -0.528713f, -0.160198f, + -0.621108f, 0.486405f, -0.221923f, 1.519426f, -0.857871f, 0.411595f, + 0.947188f, 0.203339f, 0.174526f, 0.016382f, 0.256879f, 0.049818f, + 0.057836f, -0.659096f, 0.459894f, 0.174695f, 0.379359f, 0.062530f, + -0.210201f, -0.355788f, -0.208432f, -0.401723f, -0.115373f, 0.191336f, + -0.109342f, 0.002455f, -0.078746f, -0.391871f, 0.149892f, -0.239615f, + -0.520709f, 0.118568f, -0.437975f, 0.118116f, -0.565426f, -0.206446f, + 0.113407f, 
0.558894f, 0.534627f, 1.154350f, -0.116833f, 1.723311f, +}; + +static const float vp9_part_split_nn_bias_16_layer0[NODES] = { + 0.013109f, -0.034341f, 0.679845f, -0.035781f, + -0.104183f, 0.098055f, -0.041130f, 0.160107f, +}; + +static const float vp9_part_split_nn_weights_16_layer1[NODES * LABELS] = { + 1.499564f, -0.403259f, 1.366532f, -0.469868f, + 0.482227f, -2.076697f, 0.527691f, 0.540495f, +}; + +static const float vp9_part_split_nn_bias_16_layer1[LABELS] = { + 0.01134653f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_16_layer0, + vp9_part_split_nn_weights_16_layer1, + }, + { + vp9_part_split_nn_bias_16_layer0, + vp9_part_split_nn_bias_16_layer1, + }, +}; + +static const float vp9_part_split_nn_weights_8_layer0[FEATURES * NODES] = { + -0.668875f, -0.159078f, -0.062663f, -0.483785f, -0.146814f, -0.608975f, + -0.589145f, 0.203704f, -0.051007f, -0.113769f, -0.477511f, -0.122603f, + -1.329890f, 1.403386f, 0.199636f, -0.161139f, 2.182090f, -0.014307f, + 0.015755f, -0.208468f, 0.884353f, 0.815920f, 0.632464f, 0.838225f, + 1.369483f, -0.029068f, 0.570213f, -0.573546f, 0.029617f, 0.562054f, + -0.653093f, -0.211910f, -0.661013f, -0.384418f, -0.574038f, -0.510069f, + 0.173047f, -0.274231f, -1.044008f, -0.422040f, -0.810296f, 0.144069f, + -0.406704f, 0.411230f, -0.144023f, 0.745651f, -0.595091f, 0.111787f, + 0.840651f, 0.030123f, -0.242155f, 0.101486f, -0.017889f, -0.254467f, + -0.285407f, -0.076675f, -0.549542f, -0.013544f, -0.686566f, -0.755150f, + 1.623949f, -0.286369f, 0.170976f, 0.016442f, -0.598353f, -0.038540f, + 0.202597f, -0.933582f, 0.599510f, 0.362273f, 0.577722f, 0.477603f, + 0.767097f, 0.431532f, 0.457034f, 0.223279f, 0.381349f, 0.033777f, + 0.423923f, -0.664762f, 0.385662f, 0.075744f, 0.182681f, 0.024118f, + 0.319408f, -0.528864f, 0.976537f, -0.305971f, -0.189380f, -0.241689f, + -1.318092f, 0.088647f, -0.109030f, -0.945654f, 1.082797f, 0.184564f, +}; + +static const float vp9_part_split_nn_bias_8_layer0[NODES] = { + -0.237472f, 2.051396f, 0.297062f, -0.730194f, + 0.060472f, -0.565959f, 0.560869f, -0.395448f, +}; + +static const float vp9_part_split_nn_weights_8_layer1[NODES * LABELS] = { + 0.568121f, 1.575915f, -0.544309f, 0.751595f, + -0.117911f, -1.340730f, -0.739671f, 0.661216f, +}; + +static const float vp9_part_split_nn_bias_8_layer1[LABELS] = { + -0.63375306f, +}; + +static const NN_CONFIG vp9_part_split_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + NODES, + }, // num_hidden_nodes + { + vp9_part_split_nn_weights_8_layer0, + vp9_part_split_nn_weights_8_layer1, + }, + { + vp9_part_split_nn_bias_8_layer0, + vp9_part_split_nn_bias_8_layer1, + }, +}; +#undef NODES +#undef FEATURES +#undef LABELS + +// Partition pruning model(linear). 
+static const float vp9_partition_feature_mean[24] = { + 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, + 689.413511f, 162.027012f, 1.478213f, 0.0f, + 135382.260230f, 912738.513263f, 28.845217f, 1.515230f, + 544.158492f, 131.807995f, 1.436863f, 0.0f, + 43682.377587f, 208131.711766f, 28.084737f, 1.356677f, + 138.254122f, 119.522553f, 1.252322f, 0.0f, +}; + +static const float vp9_partition_feature_std[24] = { + 673689.212982f, 5996652.516628f, 0.024449f, 1.989792f, + 985.880847f, 0.014638f, 2.001898f, 0.0f, + 208798.775332f, 1812548.443284f, 0.018693f, 1.838009f, + 396.986910f, 0.015657f, 1.332541f, 0.0f, + 55888.847031f, 448587.962714f, 0.017900f, 1.904776f, + 98.652832f, 0.016598f, 1.320992f, 0.0f, +}; + +// Error tolerance: 0.01%-0.05%-0.1% +static const float vp9_partition_linear_weights[24] = { + 0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f, + 0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f, + 0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f, + 0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.c b/libs/libvpx/vp9/encoder/vp9_picklpf.c index 1c2c55b9e4b..3a620df693c 100644 --- a/libs/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libs/libvpx/vp9/encoder/vp9_picklpf.c @@ -24,10 +24,20 @@ #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_quantize.h" +static unsigned int get_section_intra_rating(const VP9_COMP *cpi) { + unsigned int section_intra_rating; + + section_intra_rating = (cpi->common.frame_type == KEY_FRAME) + ? cpi->twopass.key_frame_section_intra_rating + : cpi->twopass.section_intra_rating; + + return section_intra_rating; +} + static int get_max_filter_level(const VP9_COMP *cpi) { if (cpi->oxcf.pass == 2) { - return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 - : MAX_LOOP_FILTER; + unsigned int section_intra_rating = get_section_intra_rating(cpi); + return section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 : MAX_LOOP_FILTER; } else { return MAX_LOOP_FILTER; } @@ -81,6 +91,7 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; // Sum squared error at each filter level int64_t ss_err[MAX_LOOP_FILTER + 1]; + unsigned int section_intra_rating = get_section_intra_rating(cpi); // Set each entry to -1 memset(ss_err, 0xFF, sizeof(ss_err)); @@ -99,8 +110,8 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, // Bias against raising loop filter in favor of lowering it. int64_t bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; - if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20)) - bias = (bias * cpi->twopass.section_intra_rating) / 20; + if ((cpi->oxcf.pass == 2) && (section_intra_rating < 20)) + bias = (bias * section_intra_rating) / 20; // yx, bias less for large block size if (cm->tx_mode != ONLY_4X4) bias >>= 1; @@ -150,7 +161,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; struct loopfilter *const lf = &cm->lf; - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 
0 : cpi->oxcf.sharpness; + lf->sharpness_level = 0; if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { lf->filter_level = 0; @@ -169,20 +180,17 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, case VPX_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; - default: - assert(0 && - "bit_depth should be VPX_BITS_8, VPX_BITS_10 " - "or VPX_BITS_12"); - return; } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); #endif // CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && + (cm->base_qindex < 200 || cm->width * cm->height > 320 * 240) && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cm->frame_type != KEY_FRAME) filt_guess = 5 * filt_guess >> 3; diff --git a/libs/libvpx/vp9/encoder/vp9_picklpf.h b/libs/libvpx/vp9/encoder/vp9_picklpf.h index cecca058b47..8881b44daaa 100644 --- a/libs/libvpx/vp9/encoder/vp9_picklpf.h +++ b/libs/libvpx/vp9/encoder/vp9_picklpf.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_PICKLPF_H_ -#define VP9_ENCODER_VP9_PICKLPF_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_ +#define VPX_VP9_ENCODER_VP9_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -26,4 +26,4 @@ void vp9_pick_filter_level(const struct yv12_buffer_config *sd, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKLPF_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.c b/libs/libvpx/vp9/encoder/vp9_pickmode.c index f2f323a282d..513b9f678c4 100644 --- a/libs/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libs/libvpx/vp9/encoder/vp9_pickmode.c @@ -41,6 +41,17 @@ typedef struct { int in_use; } PRED_BUFFER; +typedef struct { + PRED_BUFFER *best_pred; + PREDICTION_MODE best_mode; + TX_SIZE best_tx_size; + TX_SIZE best_intra_tx_size; + MV_REFERENCE_FRAME best_ref_frame; + MV_REFERENCE_FRAME best_second_ref_frame; + uint8_t best_mode_skip_txfm; + INTERP_FILTER best_pred_filter; +} BEST_PICKMODE; + static const int pos_shift_16x16[4][4] = { { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } }; @@ -222,13 +233,22 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } if (rv && search_subpel) { - int subpel_force_stop = cpi->sf.mv.subpel_force_stop; - if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; + SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); *rate_mv = 
vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -326,6 +346,82 @@ static int ac_thr_factor(const int speed, const int width, const int height, return 1; } +static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCKD *const xd, unsigned int var, + unsigned int sse, int64_t ac_thr, + unsigned int source_variance, int is_intra) { + // TODO(marpan): Tune selection for intra-modes, screen content, etc. + TX_SIZE tx_size; + unsigned int var_thresh = is_intra ? (unsigned int)ac_thr : 1; + int limit_tx = 1; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + (source_variance == 0 || var < var_thresh)) + limit_tx = 0; + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + tx_size = TX_8X8; + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && limit_tx && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16 && limit_tx) + tx_size = TX_16X16; + // For screen-content force 4X4 tx_size over 8X8, for large variance. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 && + bsize <= BLOCK_16X16 && ((var >> 5) > (unsigned int)ac_thr)) + tx_size = TX_4X4; + } else { + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + return tx_size; +} + +static void compute_intra_yprediction(PREDICTION_MODE mode, BLOCK_SIZE bsize, + MACROBLOCK *x, MACROBLOCKD *xd) { + struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[0]; + uint8_t *const src_buf_base = p->src.buf; + uint8_t *const dst_buf_base = pd->dst.buf; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; + int row, col; + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + // Keep track of the row and column of the blocks we use so that we know + // if we are in the unrestricted motion border. + for (row = 0; row < max_blocks_high; row += (1 << tx_size)) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (col = 0; col < max_blocks_wide; col += (1 << tx_size)) { + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, + x->skip_encode ? p->src.buf : pd->dst.buf, + x->skip_encode ? 
src_stride : dst_stride, + pd->dst.buf, dst_stride, col, row, 0); + } + } + p->src.buf = src_buf_base; + pd->dst.buf = dst_buf_base; +} + static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, @@ -342,7 +438,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, struct macroblockd_plane *const pd = &xd->plane[0]; const uint32_t dc_quant = pd->dequant[0]; const uint32_t ac_quant = pd->dequant[1]; - const int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t dc_thr = dc_quant * dc_quant >> 6; int64_t ac_thr = ac_quant * ac_quant >> 6; unsigned int var; int sum; @@ -386,26 +482,17 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.height, abs(sum) >> (bw + bh)); #endif - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - tx_size = TX_8X8; - else if (tx_size > TX_16X16) - tx_size = TX_16X16; - } else { - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } - - assert(tx_size >= TX_8X8); + tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, 0); + // The code below for setting skip flag assumes transform size of at least 8x8, + // so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; xd->mi[0]->tx_size = tx_size; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source && + x->source_variance == 0) + dc_thr = dc_thr << 1; + + // Evaluate if the partition block is a skippable block in Y plane. { unsigned int sse16x16[16] = { 0 }; @@ -473,33 +560,29 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. 
for (i = 1; i <= 2; i++) { - if (cpi->oxcf.speed < 8 || x->color_sensitivity[i - 1]) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); - const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); - const int uv_bw = b_width_log2_lookup[uv_bsize]; - const int uv_bh = b_height_log2_lookup[uv_bsize]; - const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + - (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); - int j = i - 1; - - vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); - flag_preduv_computed[i - 1] = 1; - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); - - if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && - (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) - skip_uv[j] = 1; - else - break; - } else { - skip_uv[i - 1] = 1; - } + struct macroblock_plane *const p = &x->plane[i]; + struct macroblockd_plane *const pd = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const int uv_bw = b_width_log2_lookup[uv_bsize]; + const int uv_bh = b_height_log2_lookup[uv_bsize]; + const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + + (uv_bh - b_height_log2_lookup[unit_size]); + const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + int j = i - 1; + + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; + var_uv[j] = cpi->fn_ptr[uv_bsize].vf( + p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + + if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && + (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) + skip_uv[j] = 1; + else + break; } // If the transform in YUV planes are skippable, the mode search checks @@ -543,7 +626,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, - unsigned int *sse_y) { + unsigned int *sse_y, int is_intra) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
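Note: the Y and UV skippability tests above reduce to the same per-plane predicate: a plane is treated as skippable when its AC energy (the variance) and its DC energy (sse minus variance) both fall below thresholds derived from the plane's dequant values (dequant^2 >> 6 for Y, dequant^2 >> (6 - sf) for UV, as in the hunks above). A minimal standalone sketch of that predicate follows; plane_is_skippable() is a hypothetical name used for illustration, not a helper added by this patch.

#include <stdint.h>

/* Illustrative sketch of the skip predicate used in the hunks above;
 * plane_is_skippable() is hypothetical and not part of the patch. */
static int plane_is_skippable(unsigned int var, unsigned int sse,
                              uint32_t ac_thr, uint32_t dc_thr) {
  const int ac_skip = (var < ac_thr) || (var == 0);         /* low AC energy */
  const int dc_skip = (sse - var < dc_thr) || (sse == var); /* low DC energy */
  return ac_skip && dc_skip;
}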
@@ -563,24 +646,8 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *var_y = var; *sse_y = sse; - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - xd->mi[0]->tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - xd->mi[0]->tx_size = TX_8X8; - else if (xd->mi[0]->tx_size > TX_16X16) - xd->mi[0]->tx_size = TX_16X16; - } else { - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr, + x->source_variance, is_intra); // Evaluate if the partition block is a skippable block in Y plane. { @@ -641,7 +708,7 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, int *skippable, int64_t *sse, BLOCK_SIZE bsize, - TX_SIZE tx_size, int rd_computed) { + TX_SIZE tx_size, int rd_computed, int is_intra) { MACROBLOCKD *xd = &x->e_mbd; const struct macroblockd_plane *pd = &xd->plane[0]; struct macroblock_plane *const p = &x->plane[0]; @@ -658,25 +725,6 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, const int bw = 4 * num_4x4_w; const int bh = 4 * num_4x4_h; -#if CONFIG_VP9_HIGHBITDEPTH - // TODO(jingning): Implement the high bit-depth Hadamard transforms and - // remove this check condition. - // TODO(marpan): Use this path (model_rd) for 8bit under certain conditions - // for now, as the vp9_quantize_fp below for highbitdepth build is slow. - if (xd->bd != 8 || - (cpi->oxcf.speed > 5 && cpi->common.frame_type != KEY_FRAME && - bsize < BLOCK_32X32)) { - unsigned int var_y, sse_y; - (void)tx_size; - if (!rd_computed) - model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, - &var_y, &sse_y); - *sse = INT_MAX; - *skippable = 0; - return; - } -#endif - if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && (bsize < BLOCK_32X32 || (cpi->use_svc && @@ -685,7 +733,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, (void)tx_size; if (!rd_computed) model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc->rate, &this_rdc->dist, - &var_y, &sse_y); + &var_y, &sse_y, is_intra); *sse = INT_MAX; *skippable = 0; return; @@ -695,9 +743,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, // The max tx_size passed in is TX_16X16. assert(tx_size != TX_32X32); - +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + p->src.stride, pd->dst.buf, pd->dst.stride, + x->e_mbd.bd); + } else { + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride); + } +#else vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); +#endif *skippable = 1; // Keep track of the row and column of the blocks we use so that we know // if we are in the unrestricted motion border. 
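Note: the NN_CONFIG weight and bias tables added in vp9_partition_models.h earlier in this patch each describe a network with a single hidden layer. A minimal sketch of the forward pass they imply is below, assuming the vp9_nn_predict() convention in libvpx (ReLU activation on hidden nodes, linear output layer); the function name and buffer size are illustrative only, not part of the patch.

/* Illustrative forward pass under the assumed vp9_nn_predict() convention. */
static void nn_forward_sketch(const float *in, int num_in, const float *w0,
                              const float *b0, int num_hid, const float *w1,
                              const float *b1, int num_out, float *out) {
  float hid[64]; /* ample for the models above (at most 24 hidden nodes) */
  int i, j;
  for (i = 0; i < num_hid; ++i) {
    /* Weights for hidden node i occupy w0[i * num_in .. i * num_in + num_in - 1]. */
    float v = b0[i];
    for (j = 0; j < num_in; ++j) v += w0[i * num_in + j] * in[j];
    hid[i] = v > 0.0f ? v : 0.0f; /* ReLU */
  }
  for (i = 0; i < num_out; ++i) {
    float v = b1[i];
    for (j = 0; j < num_hid; ++j) v += w1[i * num_hid + j] * hid[j];
    out[i] = v; /* linear output; the encoder thresholds these scores */
  }
}

For vp9_rect_part_nnconfig_16 above, for example, this corresponds to num_in = 8 (FEATURES), num_hid = 16 (NODES), and num_out = 4 (LABELS).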
@@ -726,13 +784,13 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } *skippable &= (*eob == 0); eob_cost += 1; @@ -876,6 +934,7 @@ static void encode_breakout_test( // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) @@ -981,8 +1040,8 @@ static void estimate_block_intra(int plane, int block, int row, int col, VP9_COMP *const cpi = args->cpi; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; - struct macroblock_plane *const p = &x->plane[0]; - struct macroblockd_plane *const pd = &xd->plane[0]; + struct macroblock_plane *const p = &x->plane[plane]; + struct macroblockd_plane *const pd = &xd->plane[plane]; const BLOCK_SIZE bsize_tx = txsize_to_bsize[tx_size]; uint8_t *const src_buf_base = p->src.buf; uint8_t *const dst_buf_base = pd->dst.buf; @@ -992,8 +1051,8 @@ static void estimate_block_intra(int plane, int block, int row, int col, (void)block; - p->src.buf = &src_buf_base[4 * (row * src_stride + col)]; - pd->dst.buf = &dst_buf_base[4 * (row * dst_stride + col)]; + p->src.buf = &src_buf_base[4 * (row * (int64_t)src_stride + col)]; + pd->dst.buf = &dst_buf_base[4 * (row * (int64_t)dst_stride + col)]; // Use source buffer as an approximation for the fully reconstructed buffer. vp9_predict_intra_block(xd, b_width_log2_lookup[plane_bsize], tx_size, args->mode, x->skip_encode ? p->src.buf : pd->dst.buf, @@ -1002,13 +1061,12 @@ static void estimate_block_intra(int plane, int block, int row, int col, if (plane == 0) { int64_t this_sse = INT64_MAX; - // TODO(jingning): This needs further refactoring. 
block_yrd(cpi, x, &this_rdc, &args->skippable, &this_sse, bsize_tx, - VPXMIN(tx_size, TX_16X16), 0); + VPXMIN(tx_size, TX_16X16), 0, 1); } else { unsigned int var = 0; unsigned int sse = 0; - model_rd_for_sb_uv(cpi, plane_bsize, x, xd, &this_rdc, &var, &sse, plane, + model_rd_for_sb_uv(cpi, bsize_tx, x, xd, &this_rdc, &var, &sse, plane, plane); } @@ -1292,18 +1350,16 @@ static void vp9_pickmode_ctx_den_update( VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, int ref_frame_cost[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred, - TX_SIZE best_tx_size, PREDICTION_MODE best_mode, - MV_REFERENCE_FRAME best_ref_frame, INTERP_FILTER best_pred_filter, - uint8_t best_mode_skip_txfm) { + BEST_PICKMODE *bp) { ctx_den->zero_last_cost_orig = zero_last_cost_orig; ctx_den->ref_frame_cost = ref_frame_cost; ctx_den->frame_mv = frame_mv; ctx_den->reuse_inter_pred = reuse_inter_pred; - ctx_den->best_tx_size = best_tx_size; - ctx_den->best_mode = best_mode; - ctx_den->best_ref_frame = best_ref_frame; - ctx_den->best_pred_filter = best_pred_filter; - ctx_den->best_mode_skip_txfm = best_mode_skip_txfm; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; } static void recheck_zeromv_after_denoising( @@ -1322,6 +1378,7 @@ static void recheck_zeromv_after_denoising( cpi->svc.number_spatial_layers == 1 && decision == FILTER_ZEROMV_BLOCK))) { // Check if we should pick ZEROMV on denoised signal. + VP9_COMMON *const cm = &cpi->common; int rate = 0; int64_t dist = 0; uint32_t var_y = UINT_MAX; @@ -1330,11 +1387,13 @@ static void recheck_zeromv_after_denoising( mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; mi->ref_frame[1] = NONE; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; + if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); + model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y, 0); this_rdc.rate = rate + ctx_den->ref_frame_cost[LAST_FRAME] + cpi->inter_mode_cost[x->mbmi_ext->mode_context[LAST_FRAME]] [INTER_OFFSET(ZEROMV)]; @@ -1346,6 +1405,7 @@ static void recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1416,27 +1476,217 @@ static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, return force_skip_low_temp_var; } +static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, + int mi_row, int mi_col, PRED_BUFFER *tmp, + BLOCK_SIZE bsize, int reuse_inter_pred, + PRED_BUFFER **this_mode_pred, unsigned int *var_y, + unsigned int *sse_y, int force_smooth_filter, + int *this_early_term, int *flag_preduv_computed, + int use_model_yrd_large) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + + int pf_rate[3] = { 0 }; + int64_t pf_dist[3] = { 0 }; + int curr_rate[3] = { 0 }; + unsigned int 
pf_var[3] = { 0 }; + unsigned int pf_sse[3] = { 0 }; + TX_SIZE pf_tx_size[3] = { 0 }; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = *this_mode_pred; + uint8_t skip_txfm = SKIP_TXFM_NONE; + int best_early_term = 0; + int best_flag_preduv_computed[2] = { 0 }; + INTERP_FILTER filter_start = force_smooth_filter ? EIGHTTAP_SMOOTH : EIGHTTAP; + for (filter = filter_start; filter <= EIGHTTAP_SMOOTH; ++filter) { + int64_t cost; + mi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + // For large partition blocks, extra testing is done. + if (use_model_yrd_large) + model_rd_for_sb_y_large(cpi, bsize, x, xd, &pf_rate[filter], + &pf_dist[filter], &pf_var[filter], + &pf_sse[filter], mi_row, mi_col, this_early_term, + flag_preduv_computed); + else + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], + &pf_var[filter], &pf_sse[filter], 0); + curr_rate[filter] = pf_rate[filter]; + pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); + cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); + pf_tx_size[filter] = mi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + best_early_term = *this_early_term; + best_flag_preduv_computed[0] = flag_preduv_computed[0]; + best_flag_preduv_computed[1] = flag_preduv_computed[1]; + + if (reuse_inter_pred) { + if (*this_mode_pred != current_pred) { + free_pred_buffer(*this_mode_pred); + *this_mode_pred = current_pred; + } + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + + if (reuse_inter_pred && *this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mi->interp_filter = best_filter; + mi->tx_size = pf_tx_size[best_filter]; + this_rdc->rate = curr_rate[best_filter]; + this_rdc->dist = pf_dist[best_filter]; + *var_y = pf_var[best_filter]; + *sse_y = pf_sse[best_filter]; + x->skip_txfm[0] = skip_txfm; + *this_early_term = best_early_term; + flag_preduv_computed[0] = best_flag_preduv_computed[0]; + flag_preduv_computed[1] = best_flag_preduv_computed[1]; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } +} + +static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x, + int_mv frame_mv[][MAX_REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, + unsigned int best_sse_sofar, RD_COST *best_rdc) { + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + SPEED_FEATURES *const sf = &cpi->sf; + + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == VPX_CBR) { + int tmp_sad; + uint32_t dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + 
cpi->find_fractional_mv_step( + x, &frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + } else if (svc->use_base_mv && svc->spatial_layer_id) { + if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { + const int pre_stride = xd->plane[0].pre[0].stride; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; + const uint8_t *const pre_buf = + xd->plane[0].pre[0].buf + + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + return -1; + if (base_mv_sse < (best_sse_sofar << 1)) { + // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 1)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_mode = ZEROMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_tx_size = TX_SIZES; + bp->best_intra_tx_size = TX_SIZES; + bp->best_pred_filter = EIGHTTAP; + bp->best_mode_skip_txfm = SKIP_TXFM_NONE; + bp->best_second_ref_frame = NONE; + bp->best_pred = NULL; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const SVC *const svc = &cpi->svc; + SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - PREDICTION_MODE best_mode = ZEROMV; - MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; + + BEST_PICKMODE best_pickmode; + + MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; - TX_SIZE best_tx_size = TX_SIZES; - INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv 
frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; - uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; @@ -1451,15 +1701,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (cpi->sf.adaptive_rd_thresh_row_mt) ? &(tile_data->row_base_thresh_freq_fact[thresh_freq_fact_idx]) : tile_data->thresh_freq_fact[bsize]; - +#if CONFIG_VP9_TEMPORAL_DENOISING + const int denoise_recheck_zeromv = 1; +#endif INTERP_FILTER filter_ref; - const int bsl = mi_width_log2_lookup[bsize]; - const int pred_filter_search = - cm->interp_filter == SWITCHABLE - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + int pred_filter_search = cm->interp_filter == SWITCHABLE; int const_motion[MAX_REF_FRAMES] = { 0 }; const int bh = num_4x4_blocks_high_lookup[bsize] << 2; const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; @@ -1472,7 +1718,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]); #endif struct buf_2d orig_dst = pd->dst; - PRED_BUFFER *best_pred = NULL; PRED_BUFFER *this_mode_pred = NULL; const int pixels_in_block = bh * bw; int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready; @@ -1488,22 +1733,84 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX; - unsigned int thresh_svc_skip_golden = 500; + int gf_temporal_ref = 0; + int force_test_gf_zeromv = 0; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; int denoise_svc_pickmode = 1; #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; - MV_REFERENCE_FRAME best_second_ref_frame = NONE; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; + const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + int no_scaling = 0; + int large_block = 0; + int use_model_yrd_large = 0; + unsigned int thresh_svc_skip_golden = 500; + unsigned int thresh_skip_golden = 500; + int force_smooth_filter = cpi->sf.force_smooth_interpol; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); + + init_best_pickmode(&best_pickmode); + + x->encode_breakout = seg->enabled + ? 
cpi->segment_encode_breakout[mi->segment_id] + : cpi->encode_breakout; + + x->source_variance = UINT_MAX; + if (cpi->sf.default_interp_filter == BILINEAR) { + best_pickmode.best_pred_filter = BILINEAR; + filter_gf_svc = BILINEAR; + } + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) + thresh_svc_skip_golden = 0; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; + + if (!cpi->use_svc || + (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf; + struct scale_factors *const sf_golden = + &cm->frame_refs[GOLDEN_FRAME - 1].sf; + gf_temporal_ref = 1; + // For temporal long term prediction, check that the golden reference + // is same scale as last reference, otherwise disable. + if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) || + (sf_last->y_scale_fp != sf_golden->y_scale_fp)) { + gf_temporal_ref = 0; + } else { + if (cpi->rc.avg_frame_low_motion > 70) + thresh_svc_skip_golden = 500; + else + thresh_svc_skip_golden = 0; + } + } init_ref_frame_cost(cm, xd, ref_frame_cost); - memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); if (reuse_inter_pred) { @@ -1528,16 +1835,25 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; x->skip = 0; + if (cpi->sf.cb_pred_filter_search) { + const int bsl = mi_width_log2_lookup[bsize]; + pred_filter_search = cm->interp_filter == SWITCHABLE + ? (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1 + : 0; + } // Instead of using vp9_get_pred_context_switchable_interp(xd) to assign // filter_ref, we use a less strict condition on assigning filter_ref. // This is to reduce the probability of entering the flow of not assigning // filter_ref and then skip filter search. 
- if (xd->above_mi && is_inter_block(xd->above_mi)) - filter_ref = xd->above_mi->interp_filter; - else if (xd->left_mi && is_inter_block(xd->left_mi)) - filter_ref = xd->left_mi->interp_filter; - else - filter_ref = cm->interp_filter; + filter_ref = cm->interp_filter; + if (cpi->sf.default_interp_filter != BILINEAR) { + if (xd->above_mi && is_inter_block(xd->above_mi)) + filter_ref = xd->above_mi->interp_filter; + else if (xd->left_mi && is_inter_block(xd->left_mi)) + filter_ref = xd->left_mi->interp_filter; + } // initialize mode decisions vp9_rd_cost_reset(&best_rdc); @@ -1558,23 +1874,24 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif // CONFIG_VP9_HIGHBITDEPTH x->source_variance = vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && mi->segment_id > 0 && + x->zero_temp_sad_source && x->source_variance == 0) { + mi->segment_id = 0; + vp9_init_plane_quantizers(cpi, x); + } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { - if (cpi->use_svc) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; - } + if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi); if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) vp9_denoiser_reset_frame_stats(ctx); } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref && !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { @@ -1601,14 +1918,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. 
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && - cpi->svc.spatial_layer_id > 0) { + svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[LAST_FRAME - 1] = 1; + if (vp9_is_scaled(sf)) { + svc_force_zero_mode[LAST_FRAME - 1] = 1; + inter_layer_ref = LAST_FRAME; + } } if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + if (vp9_is_scaled(sf)) { + svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + inter_layer_ref = GOLDEN_FRAME; + } } } @@ -1624,6 +1947,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1638,7 +1965,21 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) comp_modes = 2; + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN, this is to + // prevent possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). + skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, @@ -1652,16 +1993,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used // an averaging filter for downsampling (phase = 8). If so, we will test - // a nonzero motion mode on the spatial (goldeen) reference. + // a nonzero motion mode on the spatial reference. // The nonzero motion is half pixel shifted to left and top (-4, -4). - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - svc_force_zero_mode[GOLDEN_FRAME - 1] && - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && + !gf_temporal_ref) { svc_mv_col = -4; svc_mv_row = -4; flag_svc_subpel = 1; } + // For SVC with quality layers, when QP of lower layer is lower + // than current layer: force check of GF-ZEROMV before early exit + // due to skip flag. + if (svc->spatial_layer_id > 0 && no_scaling && + (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + cm->base_qindex > svc->lower_layer_qindex + 10) + force_test_gf_zeromv = 1; + + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. 
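// (Illustrative note on the assignment below: the very-high-SAD / skin /
//  speed < 7 branch treats only sizes strictly above 32x32 as large
//  (32x64, 64x32, 64x64), while the default branch also counts 32x32
//  itself, assuming the standard BLOCK_SIZE enum ordering.)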
+ large_block = (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; + use_model_yrd_large = + cpi->oxcf.rc_mode == VPX_CBR && large_block && + !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && + cm->base_qindex; + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; @@ -1675,7 +2037,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int inter_mv_mode = 0; int skip_this_mv = 0; int comp_pred = 0; - int force_gf_mv = 0; + int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; second_ref_frame = NONE; @@ -1699,8 +2061,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; - if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { - force_gf_mv = 1; + if (svc->previous_frame_is_intra_only) { + if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), // otherwise set NEWMV to (svc_mv_col, svc_mv_row). if (this_mode == NEWMV) { @@ -1713,7 +2086,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (comp_pred) { - const struct segmentation *const seg = &cm->seg; if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -1722,15 +2094,33 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; } - // For SVC, skip the golden (spatial) reference search if sse of zeromv_last - // is below threshold. - if (cpi->use_svc && ref_frame == GOLDEN_FRAME && - sse_zeromv_normalized < thresh_svc_skip_golden) + // For CBR mode: skip the golden reference search if sse of zeromv_last is + // below threshold. + if (ref_frame == GOLDEN_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + ((cpi->use_svc && sse_zeromv_normalized < thresh_svc_skip_golden) || + (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden))) continue; - if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { - continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + + // For screen content. If zero_temp_sad source is computed: skip + // non-zero motion check for stationary blocks. If the superblock is + // non-stationary then for flat blocks skip the zero last check (keep golden + // as it may be inter-layer reference). Otherwise (if zero_temp_sad_source + // is not computed) skip non-zero motion check for flat blocks. + // TODO(marpan): Compute zero_temp_sad_source per coding block. 
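// (Illustrative summary of the screen-content skips below, reading
//  x->zero_temp_sad_source as "superblock is stationary" and
//  x->source_variance == 0 as "spatially flat":
//    stationary && mv != 0                              -> skip this mode
//    not stationary && flat && mv == 0 && ref == LAST   -> skip this mode
//    zero_temp_sad_source not computed: flat && mv != 0 -> skip this mode.)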
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + if ((frame_mv[this_mode][ref_frame].as_int != 0 && + x->zero_temp_sad_source) || + (frame_mv[this_mode][ref_frame].as_int == 0 && + x->source_variance == 0 && ref_frame == LAST_FRAME && + !x->zero_temp_sad_source)) + continue; + } else if (frame_mv[this_mode][ref_frame].as_int != 0 && + x->source_variance == 0) { + continue; + } } if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; @@ -1759,14 +2149,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (const_motion[ref_frame] && this_mode == NEARMV) continue; // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. - if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1780,34 +2169,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (sf->reference_masking && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { - if (usable_ref_frame < ALTREF_FRAME) { - if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { - i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) - ref_frame_skip_mask |= (1 << ref_frame); + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); } - } else if (!cpi->rc.is_src_frame_alt_ref && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == ALTREF_FRAME)) { - int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; - int ref2 = (ref_frame == ALTREF_FRAME) ? 
LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) - ref_frame_skip_mask |= (1 << ref_frame); } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; } - if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { @@ -1820,8 +2214,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; - mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 - : rd_threshes[mode_index]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding // speed with little/no subjective quality loss. @@ -1835,92 +2230,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; - - if (this_mode == NEWMV && !force_gf_mv) { - if (ref_frame > LAST_FRAME && !cpi->use_svc && - cpi->oxcf.rc_mode == VPX_CBR) { - int tmp_sad; - uint32_t dis; - int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (bsize < BLOCK_16X16) continue; - - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); - - if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; - if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) - continue; - - frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; - rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; - frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; - - cpi->find_fractional_mv_step( - x, &frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, - 0); - } else if (svc->use_base_mv && svc->spatial_layer_id) { - if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { - const int pre_stride = xd->plane[0].pre[0].stride; - unsigned int base_mv_sse = UINT_MAX; - int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; - const uint8_t *const pre_buf = - xd->plane[0].pre[0].buf + - (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + - (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, - pre_buf, pre_stride, &base_mv_sse); - - // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, - // for SVC encoding. - if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && - frame_mv[NEWMV][ref_frame].as_mv.row == 0 && - frame_mv[NEWMV][ref_frame].as_mv.col == 0) - continue; - - // Exit NEWMV search if base_mv_sse is large. - if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) - continue; - if (base_mv_sse < (best_sse_sofar << 1)) { - // Base layer mv is good. 
- // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since - // (0, 0) mode is already tested. - unsigned int base_mv_sse_normalized = - base_mv_sse >> - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && - base_mv_sse_normalized < 400 && - frame_mv[NEWMV][ref_frame].as_mv.row == 0 && - frame_mv[NEWMV][ref_frame].as_mv.col == 0) - continue; - if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 1)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 0)) { + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar, + &best_rdc)) continue; - } } // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector @@ -1978,70 +2294,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && !force_gf_mv && + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { - int pf_rate[3]; - int64_t pf_dist[3]; - int curr_rate[3]; - unsigned int pf_var[3]; - unsigned int pf_sse[3]; - TX_SIZE pf_tx_size[3]; - int64_t best_cost = INT64_MAX; - INTERP_FILTER best_filter = SWITCHABLE, filter; - PRED_BUFFER *current_pred = this_mode_pred; rd_computed = 1; - - for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { - int64_t cost; - mi->interp_filter = filter; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], - &pf_var[filter], &pf_sse[filter]); - curr_rate[filter] = pf_rate[filter]; - pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); - cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); - pf_tx_size[filter] = mi->tx_size; - if (cost < best_cost) { - best_filter = filter; - best_cost = cost; - skip_txfm = x->skip_txfm[0]; - - if (reuse_inter_pred) { - if (this_mode_pred != current_pred) { - free_pred_buffer(this_mode_pred); - this_mode_pred = current_pred; - } - current_pred = &tmp[get_pred_buffer(tmp, 3)]; - pd->dst.buf = current_pred->data; - pd->dst.stride = bw; - } - } - } - - if (reuse_inter_pred && this_mode_pred != current_pred) - free_pred_buffer(current_pred); - - mi->interp_filter = best_filter; - mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = curr_rate[best_filter]; - this_rdc.dist = pf_dist[best_filter]; - var_y = pf_var[best_filter]; - sse_y = pf_sse[best_filter]; - x->skip_txfm[0] = skip_txfm; - if (reuse_inter_pred) { - pd->dst.buf = this_mode_pred->data; - pd->dst.stride = this_mode_pred->stride; - } + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y, + force_smooth_filter, &this_early_term, + flag_preduv_computed, use_model_yrd_large); } else { - // For low motion 
content use x->sb_is_skin in addition to VeryHighSad - // for setting large_block. - const int large_block = - (x->content_state_sb == kVeryHighSad || - (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || - cpi->oxcf.speed < 7) - ? bsize > BLOCK_32X32 - : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref; if (cpi->use_svc && ref_frame == GOLDEN_FRAME && @@ -2051,19 +2312,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); // For large partition blocks, extra testing is done. - if (cpi->oxcf.rc_mode == VPX_CBR && large_block && - !cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id) && - cm->base_qindex) { + if (use_model_yrd_large) { + rd_computed = 1; model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, &this_early_term, flag_preduv_computed); } else { rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + &var_y, &sse_y, 0); } // Save normalized sse (between current and last frame) for (0, 0) motion. - if (cpi->use_svc && ref_frame == LAST_FRAME && + if (ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0) { sse_zeromv_normalized = sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); @@ -2074,8 +2334,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!this_early_term) { this_sse = (int64_t)sse_y; block_yrd(cpi, x, &this_rdc, &is_skippable, &this_sse, bsize, - VPXMIN(mi->tx_size, TX_16X16), rd_computed); - + VPXMIN(mi->tx_size, TX_16X16), rd_computed, 0); x->skip_txfm[0] = is_skippable; if (is_skippable) { this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); @@ -2095,9 +2354,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_get_switchable_rate(cpi, xd); } } else { - this_rdc.rate += cm->interp_filter == SWITCHABLE - ? vp9_get_switchable_rate(cpi, xd) - : 0; + if (cm->interp_filter == SWITCHABLE) { + if ((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) + this_rdc.rate += vp9_get_switchable_rate(cpi, xd); + } this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } @@ -2138,7 +2398,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skipping checking: test to see if this block can be reconstructed by // prediction only. - if (cpi->allow_encode_breakout) { + if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected && + !svc->high_num_blocks_with_motion) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, &this_rdc.dist, flag_preduv_computed); @@ -2149,6 +2410,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // On spatially flat blocks for screen content: bias against zero-last + // if the sse_y is non-zero. Only on scene change or high motion frames.
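// (Illustrative note: the left-shift applied below quadruples the RD cost,
//  this_rdc.rdcost << 2 == 4 * rdcost, a strong penalty on the zero-motion
//  LAST candidate for flat blocks on these frames.)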
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + (scene_change_detected || svc->high_num_blocks_with_motion) && + ref_frame == LAST_FRAME && frame_mv[this_mode][ref_frame].as_int == 0 && + svc->spatial_layer_id == 0 && x->source_variance == 0 && sse_y > 0) { + this_rdc.rdcost = this_rdc.rdcost << 2; + } + #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow) { @@ -2165,71 +2435,86 @@ if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; - best_mode = this_mode; - best_pred_filter = mi->interp_filter; - best_tx_size = mi->tx_size; - best_ref_frame = ref_frame; - best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; - best_second_ref_frame = second_ref_frame; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filter; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { - free_pred_buffer(best_pred); - best_pred = this_mode_pred; + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; } } else { if (reuse_inter_pred) free_pred_buffer(this_mode_pred); } - if (x->skip) break; + if (x->skip && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) + break; // If early termination flag is 1 and at least 2 modes are checked, // the mode search is terminated. - if (best_early_term && idx > 0) { + if (best_early_term && idx > 0 && !scene_change_detected && + (!force_test_gf_zeromv || mode_checked[ZEROMV][GOLDEN_FRAME])) { x->skip = 1; break; } } - mi->mode = best_mode; - mi->interp_filter = best_pred_filter; - mi->tx_size = best_tx_size; - mi->ref_frame[0] = best_ref_frame; - mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + mi->mode = best_pickmode.best_mode; + mi->interp_filter = best_pickmode.best_pred_filter; + mi->tx_size = best_pickmode.best_tx_size; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->mv[0].as_int = + frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; - x->skip_txfm[0] = best_mode_skip_txfm; - mi->ref_frame[1] = best_second_ref_frame; // For spatial enhancement layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference or is_key_frame is set. - if (cpi->svc.spatial_layer_id) { + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; // For spatial enhancement layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer.
+ if (svc->spatial_layer_id && !gf_temporal_ref) { perform_intra_pred = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || - (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && - svc_force_zero_mode[best_ref_frame - 1]); + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } - if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && - cpi->rc.is_src_frame_alt_ref) + if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) || + svc->previous_frame_is_intra_only) perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->source_variance == 0) || + (scene_change_detected && perform_intra_pred) || ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || x->content_state_sb == kVeryHighSad) && perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; + int64_t this_sse = INT64_MAX; int i; - TX_SIZE best_intra_tx_size = TX_SIZES; + PRED_BUFFER *const best_pred = best_pickmode.best_pred; TX_SIZE intra_tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && intra_tx_size > TX_16X16) - intra_tx_size = TX_16X16; if (reuse_inter_pred && best_pred != NULL) { if (best_pred->data == orig_dst.buf) { @@ -2249,7 +2534,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred->data, this_mode_pred->stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH - best_pred = this_mode_pred; + best_pickmode.best_pred = this_mode_pred; } } pd->dst = orig_dst; @@ -2258,8 +2543,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, const PREDICTION_MODE this_mode = intra_mode_list[i]; THR_MODES mode_index = mode_idx[INTRA_FRAME][mode_offset(this_mode)]; int mode_rd_thresh = rd_threshes[mode_index]; + // For spatially flat blocks, under short_circuit_flat_blocks flag: + // only check DC mode for stationary blocks, otherwise also check + // H and V mode. if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != DC_PRED) { + ((x->zero_temp_sad_source && this_mode != DC_PRED) || i > 2)) { continue; } @@ -2271,8 +2559,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, &rd_thresh_freq_fact[mode_index])) || (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, - &rd_thresh_freq_fact[mode_index]))) - continue; + &rd_thresh_freq_fact[mode_index]))) { + // Avoid this early exit for screen on base layer, for scene + // changes or high motion frames. 
+ if (cpi->oxcf.content != VP9E_CONTENT_SCREEN || + svc->spatial_layer_id > 0 || + (!scene_change_detected && !svc->high_num_blocks_with_motion)) + continue; + } mi->mode = this_mode; mi->ref_frame[0] = INTRA_FRAME; @@ -2281,8 +2575,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, args.skippable = 1; args.rdc = &this_rdc; mi->tx_size = intra_tx_size; - vp9_foreach_transformed_block_in_plane(xd, bsize, 0, estimate_block_intra, - &args); + + compute_intra_yprediction(this_mode, bsize, x, xd); + model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, + &var_y, &sse_y, 1); + block_yrd(cpi, x, &this_rdc, &args.skippable, &this_sse, bsize, + VPXMIN(mi->tx_size, TX_16X16), 1, 1); + // Check skip cost here since skippable is not set for uv, this // mirrors the behavior used by inter if (args.skippable) { @@ -2309,36 +2608,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; - best_mode = this_mode; - best_intra_tx_size = mi->tx_size; - best_ref_frame = INTRA_FRAME; - best_second_ref_frame = NONE; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; - best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; } } // Reset mb_mode_info to the best inter mode. - if (best_ref_frame != INTRA_FRAME) { - mi->tx_size = best_tx_size; + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; } else { - mi->tx_size = best_intra_tx_size; + mi->tx_size = best_pickmode.best_intra_tx_size; } } pd->dst = orig_dst; - mi->mode = best_mode; - mi->ref_frame[0] = best_ref_frame; - mi->ref_frame[1] = best_second_ref_frame; - x->skip_txfm[0] = best_mode_skip_txfm; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; if (!is_inter_block(mi)) { mi->interp_filter = SWITCHABLE_FILTERS; } - if (reuse_inter_pred && best_pred != NULL) { + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) @@ -2367,25 +2667,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Remove this condition when the issue is resolved.
if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, - frame_mv, reuse_inter_pred, best_tx_size, - best_mode, best_ref_frame, best_pred_filter, - best_mode_skip_txfm); - vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision); - recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb, - &best_rdc, bsize, mi_row, mi_col); - best_ref_frame = ctx_den.best_ref_frame; + frame_mv, reuse_inter_pred, &best_pickmode); + vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, + gf_temporal_ref); + if (denoise_recheck_zeromv) + recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, + yv12_mb, &best_rdc, bsize, mi_row, mi_col); + best_pickmode.best_ref_frame = ctx_den.best_ref_frame; } #endif - if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + if (best_pickmode.best_ref_frame == ALTREF_FRAME || + best_pickmode.best_second_ref_frame == ALTREF_FRAME) x->arf_frame_usage++; - else if (best_ref_frame != INTRA_FRAME) + else if (best_pickmode.best_ref_frame != INTRA_FRAME) x->lastgolden_frame_usage++; if (cpi->sf.adaptive_rd_thresh) { - THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; - if (best_ref_frame == INTRA_FRAME) { + if (best_pickmode.best_ref_frame == INTRA_FRAME) { // Only consider the modes that are included in the intra_mode_list. int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); int i; @@ -2405,7 +2707,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; - if (best_ref_frame != ref_frame) continue; + if (best_pickmode.best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { if (cpi->sf.adaptive_rd_thresh_row_mt) update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, @@ -2585,9 +2887,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &dummy_dist, &x->pred_sse[ref_frame], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dummy_dist, + &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv; } else { @@ -2620,7 +2923,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, #endif model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, - &var_y, &sse_y); + &var_y, &sse_y, 0); this_rdc.rate += b_rate; this_rdc.rdcost = diff --git a/libs/libvpx/vp9/encoder/vp9_pickmode.h b/libs/libvpx/vp9/encoder/vp9_pickmode.h index 9aa00c4fabf..15207e6cf4d 100644 --- a/libs/libvpx/vp9/encoder/vp9_pickmode.h +++ b/libs/libvpx/vp9/encoder/vp9_pickmode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_PICKMODE_H_ -#define VP9_ENCODER_VP9_PICKMODE_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_ +#define VPX_VP9_ENCODER_VP9_PICKMODE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -32,4 +32,4 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKMODE_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKMODE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.c b/libs/libvpx/vp9/encoder/vp9_quantize.c index 09f61ead263..26d1434c349 100644 --- a/libs/libvpx/vp9/encoder/vp9_quantize.c +++ b/libs/libvpx/vp9/encoder/vp9_quantize.c @@ -204,10 +204,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); - case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); } #else (void)bit_depth; @@ -221,13 +220,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) { int i, q, quant; for (q = 0; q < QINDEX_RANGE; q++) { - const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); - const int qrounding_factor = q == 0 ? 64 : 48; + int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + int qrounding_factor = q == 0 ? 64 : 48; + const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7; + + if (cpi->oxcf.sharpness > 0 && q > 0) { + qzbin_factor = 64 + sharpness_adjustment; + qrounding_factor = 64 - sharpness_adjustment; + } for (i = 0; i < 2; ++i) { int qrounding_factor_fp = i == 0 ? 48 : 42; if (q == 0) qrounding_factor_fp = 64; - + if (cpi->oxcf.sharpness > 0) + qrounding_factor_fp = 64 - sharpness_adjustment; // y quant = i == 0 ? 
vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) : vp9_ac_quant(q, 0, cm->bit_depth); @@ -282,12 +288,12 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // Y x->plane[0].quant = quants->y_quant[qindex]; x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - x->plane[0].round_fp = quants->y_round_fp[qindex]; + memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex], + 8 * sizeof(*(x->plane[0].round_fp))); x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; xd->plane[0].dequant = cpi->y_dequant[qindex]; - x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0]; x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1]; @@ -295,12 +301,12 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - x->plane[i].round_fp = quants->uv_round_fp[qindex]; + memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex], + 8 * sizeof(*(x->plane[i].round_fp))); x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; xd->plane[i].dequant = cpi->uv_dequant[qindex]; - x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0]; x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1]; } diff --git a/libs/libvpx/vp9/encoder/vp9_quantize.h b/libs/libvpx/vp9/encoder/vp9_quantize.h index 61320361b6c..ed9b8495849 100644 --- a/libs/libvpx/vp9/encoder/vp9_quantize.h +++ b/libs/libvpx/vp9/encoder/vp9_quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_QUANTIZE_H_ -#define VP9_ENCODER_VP9_QUANTIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_ +#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -59,4 +59,4 @@ int vp9_qindex_to_quantizer(int qindex); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_QUANTIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.c b/libs/libvpx/vp9/encoder/vp9_ratectrl.c index b7f3a0e897b..6745b0adfcb 100644 --- a/libs/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.c @@ -31,10 +31,13 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ratectrl.h" -// Max rate target for 1080P and below encodes under normal circumstances -// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +// Max rate per frame for 1080P and below encodes if no level requirement given. +// For larger formats, limit to MAX_MB_RATE bits per MB. +// 4Mbits is derived from the level requirement for level 4 (1080P 30) which +// requires that HW can sustain a rate of 16Mbits over a 4 frame group. +// If a lower level requirement is specified then this may override this value.
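// (Illustrative check of the figure above: a 16 Mbit budget sustained over
//  a 4-frame group is 16000000 / 4 = 4000000 bits per frame, matching the
//  MAXRATE_1080P value defined below.)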
#define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 +#define MAXRATE_1080P 4000000 #define DEFAULT_KF_BOOST 2000 #define DEFAULT_GF_BOOST 2000 @@ -45,18 +48,16 @@ #define MAX_BPB_FACTOR 50 #if CONFIG_VP9_HIGHBITDEPTH -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - switch (bit_depth) { \ - case VPX_BITS_8: name = name##_8; break; \ - case VPX_BITS_10: name = name##_10; break; \ - case VPX_BITS_12: name = name##_12; break; \ - default: \ - assert(0 && \ - "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ - " or VPX_BITS_12"); \ - name = NULL; \ - } \ +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ } while (0) #else #define ASSIGN_MINQ_TABLE(bit_depth, name) \ @@ -97,8 +98,8 @@ static int kf_low = 400; #else static int gf_high = 2000; static int gf_low = 400; -static int kf_high = 5000; -static int kf_low = 400; +static int kf_high = 4800; +static int kf_low = 300; #endif // Functions to compute the active minq lookup table entries based on a @@ -128,7 +129,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = vp9_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); - kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); #ifdef AGGRESSIVE_VBR arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); @@ -164,10 +165,9 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; - case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1.0; + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; } #else return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; @@ -211,17 +211,15 @@ int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - if (cpi->oxcf.pass != 2) { - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. - target = min_frame_target; - } + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. 
+ target = min_frame_target; } // Clip the frame target to the maximum allowed value. @@ -247,20 +245,68 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { return target; } +// TODO(marpan/jianj): bits_off_target and buffer_level are used in the same +// way for CBR mode, for the buffering updates below. Look into removing one +// of these (i.e., bits_off_target). +// Update the buffer level before encoding with the per-frame-bandwidth. +static void update_buffer_level_preencode(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + rc->bits_off_target += rc->avg_frame_bandwidth; + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; +} + +// Update the buffer level before encoding with the per-frame-bandwidth +// for SVC. The current and all upper temporal layers are updated, needed +// for the layered rate control which involves cumulative buffer levels for +// the temporal layers. Allow for using the timestamp(pts) delta for the +// framerate when the set_ref_frame_config is used. +static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { + SVC *const svc = &cpi->svc; + int i; + // Set this to 1 to use timestamp delta for "framerate" under + // ref_frame_config usage. + int use_timestamp = 1; + const int64_t ts_delta = + svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id]; + for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + if (use_timestamp && cpi->svc.use_set_ref_frame_config && + svc->number_temporal_layers == 1 && ts_delta > 0 && + svc->current_superframe > 0) { + // TODO(marpan): This may need to be modified for temporal layers. + const double framerate_pts = 10000000.0 / ts_delta; + lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts); + } else { + lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + } + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + if (i == svc->temporal_layer_id) { + cpi->rc.bits_off_target = lrc->bits_off_target; + cpi->rc.buffer_level = lrc->buffer_level; + } + } +} + // Update the buffer level for higher temporal layers, given the encoded current // temporal layer. -static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { +static void update_layer_buffer_level_postencode(SVC *svc, + int encoded_frame_size) { int i = 0; - int current_temporal_layer = svc->temporal_layer_id; + const int current_temporal_layer = svc->temporal_layer_id; for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; - int bits_off_for_this_layer = - (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size); - lrc->bits_off_target += bits_off_for_this_layer; - + lrc->bits_off_target -= encoded_frame_size; // Clip buffer level to maximum buffer size for the layer.
lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); @@ -268,21 +314,13 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { } } -// Update the buffer level: leaky bucket model. -static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { - const VP9_COMMON *const cm = &cpi->common; +// Update the buffer level after encoding with encoded frame size. +static void update_buffer_level_postencode(VP9_COMP *cpi, + int encoded_frame_size) { RATE_CONTROL *const rc = &cpi->rc; - - // Non-viewable frames are a special case and are treated as pure overhead. - if (!cm->show_frame) { - rc->bits_off_target -= encoded_frame_size; - } else { - rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; - } - + rc->bits_off_target -= encoded_frame_size; // Clip the buffer level to the maximum specified buffer size. rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); - // For screen-content mode, and if frame-dropper is off, don't let buffer // level go below threshold, given here as -rc->maximum_buffer_size. if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && @@ -292,7 +330,7 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { rc->buffer_level = rc->bits_off_target; if (is_one_pass_cbr_svc(cpi)) { - update_layer_buffer_level(&cpi->svc, encoded_frame_size); + update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); } } @@ -355,6 +393,9 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->high_source_sad = 0; rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; + rc->high_num_blocks_with_motion = 0; + rc->hybrid_intra_scene_change = 0; + rc->re_encode_maxq_scene_change = 0; rc->alt_ref_gf_group = 0; rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; @@ -377,6 +418,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { rc->rate_correction_factors[i] = 1.0; + rc->damped_adjustment[i] = 0; } rc->min_gf_interval = oxcf->min_gf_interval; @@ -388,27 +430,115 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( oxcf->init_framerate, rc->min_gf_interval); rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; + + rc->force_max_q = 0; + rc->last_post_encode_dropped_scene_change = 0; + rc->use_post_encode_drop = 0; + rc->ext_use_post_encode_drop = 0; + rc->arf_active_best_quality_adjustment_factor = 1.0; + + rc->preserve_arf_as_gld = 0; + rc->preserve_next_arf_as_gld = 0; + rc->show_arf_as_gld = 0; } -int vp9_rc_drop_frame(VP9_COMP *cpi) { +static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level > drop_mark); + } else { + int i; + // For SVC in the FULL_SUPERFRAME_DROP mode: the condition on + // buffer (if it's above threshold, so no drop) is checked on current and + // upper spatial layers. If any spatial layer is not above threshold then + // we return 0. + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0.
+ if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } + } + return 1; + } +} + +static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level <= drop_mark); + } else { + int i; + // For SVC in the constrained framedrop mode (svc->framedrop_mode = + // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on + // buffer (if it's below threshold, so drop frame) is checked on current + // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any + // spatial layer is <= threshold, then we return 1 (drop). + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } + } + } + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) + return 0; + else + return 1; + } +} + +static int drop_frame(VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - if (!oxcf->drop_frames_water_mark || - (is_one_pass_cbr_svc(cpi) && - cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)) { + SVC *svc = &cpi->svc; + int drop_frames_water_mark = oxcf->drop_frames_water_mark; + if (cpi->use_svc) { + // If we have dropped max_consec_drop frames, then we don't + // drop this spatial layer, and reset counter to 0. + if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) { + svc->drop_count[svc->spatial_layer_id] = 0; + return 0; + } else { + drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id]; + } + } + if (!drop_frames_water_mark || + (svc->spatial_layer_id > 0 && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { return 0; } else { - if (rc->buffer_level < 0) { + if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) || + (check_buffer_below_thresh(cpi, -1) && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { // Always drop if buffer is below 0. return 1; } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark.
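// (Illustrative numbers for the watermark logic below, with assumed values
//  drop_frames_water_mark = 30 and optimal_buffer_level = 600000 bits:
//  drop_mark = 30 * 600000 / 100 = 180000 bits, so once the buffer level
//  falls to or below 180000 bits, every other frame is dropped until the
//  level recovers above the mark.)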
int drop_mark = - (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100); - if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100); + if (check_buffer_above_thresh(cpi, drop_mark) && + (rc->decimation_factor > 0)) { --rc->decimation_factor; - } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { + } else if (check_buffer_below_thresh(cpi, drop_mark) && + rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -427,11 +557,129 @@ } } +int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) { + size_t frame_size = *size << 3; + int64_t new_buffer_level = + cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size; + + // For now we drop if new buffer level (given the encoded frame size) goes + // below 0. + if (new_buffer_level < 0) { + *size = 0; + vp9_rc_postencode_update_drop_frame(cpi); + // Update flag to use for next frame. + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) + cpi->rc.last_post_encode_dropped_scene_change = 1; + // Force max_q on next frame. + cpi->rc.force_max_q = 1; + cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + cpi->last_frame_dropped = 1; + cpi->ext_refresh_frame_flags_pending = 0; + if (cpi->use_svc) { + SVC *svc = &cpi->svc; + int sl = 0; + int tl = 0; + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + // Postencode drop is only checked on base spatial layer, + // for now if max-q is set on base we force it on all layers. + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->force_max_q = 1; + lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + } + } + } + return 1; + } + + cpi->rc.force_max_q = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + return 0; +} + +int vp9_rc_drop_frame(VP9_COMP *cpi) { + SVC *svc = &cpi->svc; + int svc_prev_layer_dropped = 0; + // In the constrained or full_superframe framedrop mode for svc + // (framedrop_mode != LAYER_DROP), if the previous spatial layer was + // dropped, drop the current spatial layer. + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) + svc_prev_layer_dropped = 1; + if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP) || + drop_frame(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + if (cpi->use_svc) { + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + if (svc->framedrop_mode == LAYER_DROP || + svc->drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters.
In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignment with full superframe dropping. + vp9_inc_frame_in_layer(cpi); + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + int all_layers_drop = 1; + for (i = 0; i < svc->spatial_layer_id; i++) { + if (svc->drop_spatial_layer[i] == 0) { + all_layers_drop = 0; + break; + } + } + if (all_layers_drop == 1) svc->skip_enhancement_layer = 0; + } + } + return 1; + } + return 0; +} + +static int adjust_q_cbr(const VP9_COMP *cpi, int q) { + // This makes sure q is between oscillating Qs to prevent resonance. + if (!cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + // If the previous frame had overshoot and the current q needs to increase + // above the clamped value, reduce the clamp for faster reaction to + // overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + vp9_cyclic_refresh_limit_q(cpi, &q); + return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); +} + static double get_rate_correction_factor(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; double rcf; - if (cpi->common.frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rcf = rc->rate_correction_factors[KF_STD]; } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = @@ -451,13 +699,14 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; // Normalize RCF to account for the size-dependent scaling factor. factor /= rcf_mult[cpi->rc.frame_size_selector]; factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); - if (cpi->common.frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rc->rate_correction_factors[KF_STD] = factor; } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = @@ -478,6 +727,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int correction_factor = 100; double rate_correction_factor = get_rate_correction_factor(cpi); double adjustment_limit; + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; int projected_size_based_on_q = 0; @@ -494,8 +745,9 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { projected_size_based_on_q = vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; projected_size_based_on_q = - vp9_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs, + vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs, rate_correction_factor, cm->bit_depth); } // Work out a size correction factor.
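A minimal standalone sketch of the oscillation clamp in adjust_q_cbr() above (illustrative only; the helper below and its example values are invented for the sketch):

static int toy_adjust_q_cbr(int q, int q1, int q2, int prev_overshoot) {
  const int lo = q1 < q2 ? q1 : q2;
  const int hi = q1 < q2 ? q2 : q1;
  const int qclamp = q < lo ? lo : (q > hi ? hi : q);
  /* e.g. q = 50, q1 = 30, q2 = 40, prev_overshoot = 1:
     qclamp = 40 and the result is (50 + 40) >> 1 = 45, reacting faster
     to overshoot than the plain clamp would. */
  if (prev_overshoot && q > qclamp) return (q + qclamp) >> 1;
  return qclamp;
}

This mirrors the rc_1_frame == -1 fast path: when the last two frames missed their rate targets in opposite directions, q is pinned between the two previous Qs to damp resonance, and the pin is relaxed to the midpoint when the previous frame overshot.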
@@ -503,10 +755,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) {
   correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
                             projected_size_based_on_q);
 
-  // More heavily damped adjustment used if we have been oscillating either side
-  // of target.
-  adjustment_limit =
-      0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+  // Do not use damped adjustment for the first frame of each frame type
+  if (!cpi->rc.damped_adjustment[rf_lvl]) {
+    adjustment_limit = 1.0;
+    cpi->rc.damped_adjustment[rf_lvl] = 1;
+  } else {
+    // More heavily damped adjustment used if we have been oscillating either
+    // side of target.
+    adjustment_limit =
+        0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor)));
+  }
 
   cpi->rc.q_2_frame = cpi->rc.q_1_frame;
   cpi->rc.q_1_frame = cm->base_qindex;
@@ -569,8 +827,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
       bits_per_mb_at_this_q =
           (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
     } else {
+      FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type;
       bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(
-          cm->frame_type, i, correction_factor, cm->bit_depth);
+          frame_type, i, correction_factor, cm->bit_depth);
     }
 
     if (bits_per_mb_at_this_q <= target_bits_per_mb) {
@@ -585,16 +844,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
     }
   } while (++i <= active_worst_quality);
 
-  // In CBR mode, this makes sure q is between oscillating Qs to prevent
-  // resonance.
-  if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad &&
-      (!cpi->oxcf.gf_cbr_boost_pct ||
-       !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
-      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
-      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
-    q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
-              VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
-  }
+  // Adjustment to q for CBR mode.
+  if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q);
+
   return q;
 }
 
@@ -623,13 +875,19 @@ static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
                             kf_low_motion_minq, kf_high_motion_minq);
 }
 
-static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+static int get_gf_active_quality(const VP9_COMP *const cpi, int q,
                                  vpx_bit_depth_t bit_depth) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
   int *arfgf_low_motion_minq;
   int *arfgf_high_motion_minq;
+  const int gfu_boost = cpi->multi_layer_arf
+                            ? gf_group->gfu_boost[gf_group->index]
+                            : rc->gfu_boost;
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
   ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
-  return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+  return get_active_quality(q, gfu_boost, gf_low, gf_high,
                             arfgf_low_motion_minq, arfgf_high_motion_minq);
 }
 
@@ -674,7 +932,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
   int active_worst_quality;
   int ambient_qp;
   unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
-  if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad)
+  if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q)
     return rc->worst_quality;
   // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
   // for the first few frames following key frame. These are both initialized
@@ -685,6 +943,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
                    ? VPXMIN(rc->avg_frame_qindex[INTER_FRAME],
                             rc->avg_frame_qindex[KEY_FRAME])
                    : rc->avg_frame_qindex[INTER_FRAME];
+  active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2);
   // For SVC if the current base spatial layer was key frame, use the QP from
   // that base layer for ambient_qp.
   if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
@@ -694,13 +953,15 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) {
     if (lc->is_key_frame) {
       const RATE_CONTROL *lrc = &lc->rc;
       ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]);
+      active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3);
     }
   }
-  active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2);
   if (rc->buffer_level > rc->optimal_buffer_level) {
     // Adjust down.
-    // Maximum limit for down adjustment, ~30%.
+    // Maximum limit for down adjustment ~30%; make it lower for screen content.
     int max_adjustment_down = active_worst_quality / 3;
+    if (cpi->oxcf.content == VP9E_CONTENT_SCREEN)
+      max_adjustment_down = active_worst_quality >> 3;
     if (max_adjustment_down) {
       buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
                        max_adjustment_down);
@@ -769,6 +1030,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
           vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
     }
   } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
+             cpi->oxcf.gf_cbr_boost_pct &&
              (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
@@ -779,7 +1041,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
     } else {
       q = active_worst_quality;
     }
-    active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
   } else {
     // Use the lower of active_worst_quality and recent/average Q.
     if (cm->current_video_frame > 1) {
@@ -804,21 +1066,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
   *top_index = active_worst_quality;
   *bottom_index = active_best_quality;
 
-#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
-  // Limit Q range for the adaptive loop.
-  if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
-      !(cm->current_video_frame == 0)) {
-    int qdelta = 0;
-    vpx_clear_system_state();
-    qdelta = vp9_compute_qdelta_by_rate(
-        &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth);
-    *top_index = active_worst_quality + qdelta;
-    *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index;
-  }
-#endif
-  // Special case code to try and match quality with forced key frames
-  if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+  if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
     q = rc->last_boosted_qindex;
   } else {
     q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
@@ -831,6 +1080,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi,
       q = *top_index;
     }
   }
+
   assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
   assert(*bottom_index <= rc->worst_quality &&
          *bottom_index >= rc->best_quality);
@@ -939,7 +1189,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
     if (oxcf->rc_mode == VPX_CQ) {
       if (q < cq_level) q = cq_level;
 
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+      active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
 
       // Constrained quality use slightly lower active best.
       active_best_quality = active_best_quality * 15 / 16;
@@ -954,7 +1204,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi,
         delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth);
         active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
       } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+        active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
       }
     }
   } else {
    if (oxcf->rc_mode == VPX_Q) {
@@ -1045,78 +1295,145 @@ int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) {
     1.75,  // GF_ARF_STD
     2.00,  // KF_STD
   };
-  static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = {
-    INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
-  };
   const VP9_COMMON *const cm = &cpi->common;
-  int qdelta =
-      vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q,
-                                 rate_factor_deltas[rf_level], cm->bit_depth);
+
+  int qdelta = vp9_compute_qdelta_by_rate(
+      &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth);
   return qdelta;
 }
 
 #define STATIC_MOTION_THRESH 95
-static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
-                                         int *top_index) {
+
+static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index,
+                                     int *top_index) {
   const VP9_COMMON *const cm = &cpi->common;
   const RATE_CONTROL *const rc = &cpi->rc;
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
   int active_best_quality;
   int active_worst_quality = cpi->twopass.active_worst_quality;
-  int q;
-  int *inter_minq;
-  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
 
-  if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) {
+  if (rc->this_key_frame_forced) {
     // Handle the special case for key frames forced when we have reached
     // the maximum key frame interval. Here force the Q to a range
     // based on the ambient Q to reduce the risk of popping.
-    if (rc->this_key_frame_forced) {
-      double last_boosted_q;
-      int delta_qindex;
-      int qindex;
-
-      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
-        qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
-        active_best_quality = qindex;
-        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
-        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 1.25, cm->bit_depth);
-        active_worst_quality =
-            VPXMIN(qindex + delta_qindex, active_worst_quality);
-      } else {
-        qindex = rc->last_boosted_qindex;
-        last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
-        delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
-                                          last_boosted_q * 0.75, cm->bit_depth);
-        active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
-      }
+    double last_boosted_q;
+    int delta_qindex;
+    int qindex;
+
+    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+      active_best_quality = qindex;
+      last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                        last_boosted_q * 1.25, cm->bit_depth);
+      active_worst_quality =
+          VPXMIN(qindex + delta_qindex, active_worst_quality);
    } else {
-      // Not forced keyframe.
-      double q_adj_factor = 1.0;
-      double q_val;
-      // Baseline value derived from cpi->active_worst_quality and kf boost.
-      active_best_quality =
-          get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+      qindex = rc->last_boosted_qindex;
+      last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth);
+      delta_qindex = vp9_compute_qdelta(rc, last_boosted_q,
+                                        last_boosted_q * 0.75, cm->bit_depth);
+      active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality);
+    }
+  } else {
+    // Not forced keyframe.
+    double q_adj_factor = 1.0;
+    double q_val;
+    // Baseline value derived from cpi->active_worst_quality and kf boost.
+    active_best_quality =
+        get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+    if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+      active_best_quality /= 4;
+    }
 
-      // Allow somewhat lower kf minq with small image formats.
-      if ((cm->width * cm->height) <= (352 * 288)) {
-        q_adj_factor -= 0.25;
-      }
+    // Don't allow the active min to be lossless (q0) unless the max q
+    // already indicates lossless.
+    active_best_quality =
+        VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality));
 
-      // Make a further adjustment based on the kf zero motion measure.
-      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+    // Allow somewhat lower kf minq with small image formats.
+    if ((cm->width * cm->height) <= (352 * 288)) {
+      q_adj_factor -= 0.25;
+    }
 
-      // Convert the adjustment factor to a qindex delta
-      // on active_best_quality.
-      q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
-      active_best_quality +=
-          vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+    // Make a further adjustment based on the kf zero motion measure.
+    q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+    // Convert the adjustment factor to a qindex delta
+    // on active_best_quality.
+    q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth);
+    active_best_quality +=
+        vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth);
+  }
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+}
+
+static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index,
+                         int gf_group_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int is_intra_frame = frame_is_intra_only(cm);
+
+  const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
+
+  int q = cq_level;
+  int active_best_quality = cq_level;
+  int active_worst_quality = cq_level;
+
+  // Key frame qp decision
+  if (is_intra_frame && rc->frames_to_key > 1)
+    pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality);
+
+  // ARF / GF qp decision
+  if (!is_intra_frame && !rc->is_src_frame_alt_ref &&
+      cpi->refresh_alt_ref_frame) {
+    active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+
+    // Modify best quality for second level arfs. For mode VPX_Q this
+    // becomes the baseline frame q.
+    if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+      const int layer_depth = gf_group->layer_depth[gf_group_index];
+      // linearly fit the frame q depending on the layer depth index from
+      // the base layer ARF.
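The hunk resumes below with the blend this comment introduces: a rounded weighted average that pulls the second-level ARF q toward the group-level q as layer depth grows. A hedged sketch with a worked value (function name hypothetical, not part of the patch):

```c
/* Sketch of the layer-depth blend used for GF_ARF_LOW frames in the diff:
 * with layer_depth = 2, q = 40 and a boosted active_best_quality of 20,
 * the result is (1 * 40 + 20 + 1) / 2 = 30, i.e. halfway between the two.
 * The "+ layer_depth / 2" term rounds the integer division to nearest. */
static int blend_arf_active_best_quality(int layer_depth, int q,
                                         int active_best_quality) {
  return ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) /
         layer_depth;
}
```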
+      active_best_quality = ((layer_depth - 1) * cq_level +
+                             active_best_quality + layer_depth / 2) /
+                            layer_depth;
     }
-  } else if (!rc->is_src_frame_alt_ref &&
-             (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+  }
+
+  q = active_best_quality;
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+  return q;
+}
+
+static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
+                                         int *top_index, int gf_group_index) {
+  const VP9_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf);
+  int active_best_quality;
+  int active_worst_quality = cpi->twopass.active_worst_quality;
+  int q;
+  int *inter_minq;
+  int arf_active_best_quality_adjustment, arf_active_best_quality_max;
+  int *arfgf_high_motion_minq;
+  const int boost_frame =
+      !rc->is_src_frame_alt_ref &&
+      (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+
+  ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq);
+
+  if (oxcf->rc_mode == VPX_Q)
+    return rc_constant_q(cpi, bottom_index, top_index, gf_group_index);
+
+  if (frame_is_intra_only(cm)) {
+    pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality);
+  } else if (boost_frame) {
     // Use the lower of active_worst_quality and recent
     // average Q as basis for GF/ARF best Q limit unless last frame was
     // a key frame.
@@ -1129,63 +1446,59 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
     // For constrained quality dont allow Q less than the cq level
     if (oxcf->rc_mode == VPX_CQ) {
       if (q < cq_level) q = cq_level;
-
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-
-      // Constrained quality use slightly lower active best.
-      active_best_quality = active_best_quality * 15 / 16;
-
-    } else if (oxcf->rc_mode == VPX_Q) {
-      if (!cpi->refresh_alt_ref_frame) {
-        active_best_quality = cq_level;
-      } else {
-        active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
-
-        // Modify best quality for second level arfs. For mode VPX_Q this
-        // becomes the baseline frame q.
-        if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
-          active_best_quality = (active_best_quality + cq_level + 1) / 2;
-      }
-    } else {
-      active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth);
+    }
+    active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth);
+
+    ASSIGN_MINQ_TABLE(cm->bit_depth, arfgf_high_motion_minq);
+    arf_active_best_quality_max = arfgf_high_motion_minq[q];
+    arf_active_best_quality_adjustment =
+        arf_active_best_quality_max - active_best_quality;
+    active_best_quality = arf_active_best_quality_max -
+                          (int)(arf_active_best_quality_adjustment *
+                                rc->arf_active_best_quality_adjustment_factor);
+
+    // Modify best quality for second level arfs. For mode VPX_Q this
+    // becomes the baseline frame q.
+    if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) {
+      const int layer_depth = gf_group->layer_depth[gf_group_index];
+      // linearly fit the frame q depending on the layer depth index from
+      // the base layer ARF.
+      active_best_quality =
+          ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) /
+          layer_depth;
     }
   } else {
-    if (oxcf->rc_mode == VPX_Q) {
-      active_best_quality = cq_level;
-    } else {
-      active_best_quality = inter_minq[active_worst_quality];
+    active_best_quality = inter_minq[active_worst_quality];
 
-      // For the constrained quality mode we don't want
-      // q to fall below the cq level.
-      if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
-        active_best_quality = cq_level;
-      }
+    // For the constrained quality mode we don't want
+    // q to fall below the cq level.
+    if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) {
+      active_best_quality = cq_level;
     }
   }
 
   // Extension to max or min Q if undershoot or overshoot is outside
   // the permitted range.
-  if (cpi->oxcf.rc_mode != VPX_Q) {
-    if (frame_is_intra_only(cm) ||
-        (!rc->is_src_frame_alt_ref &&
-         (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
-      active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
-      active_worst_quality += (cpi->twopass.extend_maxq / 2);
-    } else {
-      active_best_quality -=
-          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
-      active_worst_quality += cpi->twopass.extend_maxq;
-    }
+  if (frame_is_intra_only(cm) || boost_frame) {
+    active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+    active_worst_quality += (cpi->twopass.extend_maxq / 2);
+  } else {
+    active_best_quality -=
+        (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+    active_worst_quality += cpi->twopass.extend_maxq;
+
+    // For normal frames do not allow an active minq lower than the q used for
+    // the last boosted frame.
+    active_best_quality = VPXMAX(active_best_quality, rc->last_boosted_qindex);
   }
 
 #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
   vpx_clear_system_state();
   // Static forced key frames Q restrictions dealt with elsewhere.
-  if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) ||
-      !rc->this_key_frame_forced ||
-      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
-    int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+  if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced ||
+      cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) {
+    int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index],
                                        active_worst_quality);
     active_worst_quality =
         VPXMAX(active_worst_quality + qdelta, active_best_quality);
@@ -1205,17 +1518,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
   active_worst_quality =
       clamp(active_worst_quality, active_best_quality, rc->worst_quality);
 
-  if (oxcf->rc_mode == VPX_Q) {
-    q = active_best_quality;
-    // Special case code to try and match quality with forced key frames.
-  } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) &&
-             rc->this_key_frame_forced) {
+  if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
     // If static since last kf use better of last boosted and last kf q.
     if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
       q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
     } else {
       q = rc->last_boosted_qindex;
     }
+  } else if (frame_is_intra_only(cm) && !rc->this_key_frame_forced) {
+    q = active_best_quality;
   } else {
     q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
                           active_worst_quality);
@@ -1242,13 +1553,15 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
                              int *top_index) {
   int q;
+  const int gf_group_index = cpi->twopass.gf_group.index;
   if (cpi->oxcf.pass == 0) {
     if (cpi->oxcf.rc_mode == VPX_CBR)
       q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index);
     else
       q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index);
   } else {
-    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index);
+    q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index,
+                                      gf_group_index);
   }
   if (cpi->sf.use_nonrd_pick_mode) {
     if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex;
@@ -1261,6 +1574,89 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index,
   return q;
 }
 
+void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) {
+  VP9_COMMON *cm = &cpi->common;
+  TWO_PASS *const twopass = &cpi->twopass;
+
+  cpi->rc.is_src_frame_alt_ref = 0;
+  cm->show_existing_frame = 0;
+  cpi->rc.show_arf_as_gld = 0;
+  switch (twopass->gf_group.update_type[gf_group_index]) {
+    case KF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+    case LF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case GF_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      break;
+    case OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      if (cpi->rc.preserve_arf_as_gld) {
+        cpi->rc.show_arf_as_gld = 1;
+        cpi->refresh_golden_frame = 0;
+        cm->show_existing_frame = 1;
+        cm->refresh_frame_context = 0;
+      }
+      break;
+    case MID_OVERLAY_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      break;
+    case USE_BUF_FRAME:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_src_frame_alt_ref = 1;
+      cm->show_existing_frame = 1;
+      cm->refresh_frame_context = 0;
+      break;
+    default:
+      assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE);
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 1;
+      break;
+  }
+}
+
+void vp9_estimate_qp_gop(VP9_COMP *cpi) {
+  int gop_length = cpi->twopass.gf_group.gf_group_size;
+  int bottom_index, top_index;
+  int idx;
+  const int gf_index = cpi->twopass.gf_group.index;
+  const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref;
+  const int refresh_frame_context = cpi->common.refresh_frame_context;
+
+  for (idx = 1; idx <= gop_length; ++idx) {
+    TplDepFrame *tpl_frame = &cpi->tpl_stats[idx];
+    int target_rate = cpi->twopass.gf_group.bit_allocation[idx];
+    cpi->twopass.gf_group.index = idx;
+    vp9_rc_set_frame_target(cpi, target_rate);
+    vp9_configure_buffer_updates(cpi, idx);
+    tpl_frame->base_qindex =
+        rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx);
+    tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1);
+  }
+  // Reset the actual index and frame update
+  cpi->twopass.gf_group.index = gf_index;
+  cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref;
+  cpi->common.refresh_frame_context = refresh_frame_context;
+  vp9_configure_buffer_updates(cpi, gf_index);
+}
+
 void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target,
                                       int *frame_under_shoot_limit,
                                       int *frame_over_shoot_limit) {
@@ -1333,6 +1729,15 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
     if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
 
     rc->frames_since_golden++;
+
+    if (rc->show_arf_as_gld) {
+      rc->frames_since_golden = 0;
+      // If we are not using alt ref in the up and coming group clear the arf
+      // active flag. In multi arf group case, if the index is not 0 then
+      // we are overlaying a mid group arf so should not reset the flag.
+      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+        rc->source_alt_ref_active = 0;
+    }
   }
 }
 
@@ -1367,7 +1772,8 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) {
   int cnt_zeromv = 0;
   for (mi_row = 0; mi_row < rows; mi_row++) {
     for (mi_col = 0; mi_col < cols; mi_col++) {
-      if (abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16)
+      if (mi[0]->ref_frame[0] == LAST_FRAME &&
+          abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16)
         cnt_zeromv++;
       mi++;
     }
@@ -1381,6 +1787,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   const VP9_COMMON *const cm = &cpi->common;
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
   RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
   const int qindex = cm->base_qindex;
 
   // Update rate control heuristics
@@ -1390,7 +1797,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   vp9_rc_update_rate_correction_factors(cpi);
 
   // Keep a record of last Q and ambient average Q.
-  if (cm->frame_type == KEY_FRAME) {
+  if (frame_is_intra_only(cm)) {
     rc->last_q[KEY_FRAME] = qindex;
     rc->avg_frame_qindex[KEY_FRAME] =
        ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
@@ -1423,6 +1830,8 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
     }
   }
 
+  if (cpi->use_svc) vp9_svc_adjust_avg_frame_qindex(cpi);
+
   // Keep record of last boosted (KF/KF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
@@ -1434,13 +1843,13 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
         (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
     rc->last_boosted_qindex = qindex;
   }
-  if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+  if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex;
 
-  update_buffer_level(cpi, rc->projected_frame_size);
+  update_buffer_level_postencode(cpi, rc->projected_frame_size);
 
   // Rolling monitors of whether we are over or underspending used to help
   // regulate min and Max Q in two pass.
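The rolling monitors just mentioned are updated in the hunk that follows with `ROUND_POWER_OF_TWO(v * 3 + sample, 2)`, i.e. a rounded exponential moving average keeping 3/4 of the history. A small self-contained sketch under that assumption (helper name hypothetical):

```c
/* Sketch: ROUND_POWER_OF_TWO(value, 2) in libvpx is (value + 2) >> 2, so
 * each update keeps 3/4 of the running value and takes 1/4 of the new
 * sample. e.g. rolling = 1000, sample = 2000 -> (3000 + 2000 + 2) >> 2
 * = 1250. */
static int rolling_monitor_update(int rolling, int sample) {
  return (rolling * 3 + sample + 2) >> 2;
}
```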
-  if (cm->frame_type != KEY_FRAME) {
+  if (!frame_is_intra_only(cm)) {
     rc->rolling_target_bits = ROUND_POWER_OF_TWO(
         rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
     rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
@@ -1457,9 +1866,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
-  if (!cpi->use_svc || is_two_pass_svc(cpi)) {
+  if (!cpi->use_svc) {
     if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
-        (cm->frame_type != KEY_FRAME))
+        (!frame_is_intra_only(cm)))
       // Update the alternate reference frame stats as appropriate.
       update_alt_ref_frame_stats(cpi);
     else
@@ -1467,7 +1876,28 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
       update_golden_frame_stats(cpi);
   }
 
-  if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+  // If second (long term) temporal reference is used for SVC,
+  // update the golden frame counter, only for base temporal layer.
+  if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer &&
+      svc->temporal_layer_id == 0) {
+    int i = 0;
+    if (cpi->refresh_golden_frame)
+      rc->frames_since_golden = 0;
+    else
+      rc->frames_since_golden++;
+    // Decrement count down till next gf
+    if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--;
+    // Update the frames_since_golden for all upper temporal layers.
+    for (i = 1; i < svc->number_temporal_layers; ++i) {
+      const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
+                                         svc->number_temporal_layers);
+      LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+      RATE_CONTROL *const lrc = &lc->rc;
+      lrc->frames_since_golden = rc->frames_since_golden;
+    }
+  }
+
+  if (frame_is_intra_only(cm)) rc->frames_since_key = 0;
   if (cm->show_frame) {
     rc->frames_since_key++;
     rc->frames_to_key--;
@@ -1481,24 +1911,53 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
   }
 
   if (oxcf->pass == 0) {
-    if (cm->frame_type != KEY_FRAME) {
+    if (!frame_is_intra_only(cm) &&
+        (!cpi->use_svc ||
+         (cpi->use_svc &&
+          !svc->layer_context[svc->temporal_layer_id].is_key_frame &&
+          svc->spatial_layer_id == svc->number_spatial_layers - 1))) {
       compute_frame_low_motion(cpi);
       if (cpi->sf.use_altref_onepass) update_altref_usage(cpi);
     }
+    // For SVC: set avg_frame_low_motion (only computed on top spatial layer)
+    // to all lower spatial layers.
+    if (cpi->use_svc &&
+        svc->spatial_layer_id == svc->number_spatial_layers - 1) {
+      int i;
+      for (i = 0; i < svc->number_spatial_layers - 1; ++i) {
+        const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id,
+                                           svc->number_temporal_layers);
+        LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+        RATE_CONTROL *const lrc = &lc->rc;
+        lrc->avg_frame_low_motion = rc->avg_frame_low_motion;
+      }
+    }
     cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref;
   }
-  if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
+  if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0;
 
   rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+  if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1)
+    svc->lower_layer_qindex = cm->base_qindex;
 }
 
 void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
-  // Update buffer level with zero size, update frame counters, and return.
-  update_buffer_level(cpi, 0);
+  cpi->common.current_video_frame++;
   cpi->rc.frames_since_key++;
   cpi->rc.frames_to_key--;
   cpi->rc.rc_2_frame = 0;
   cpi->rc.rc_1_frame = 0;
+  cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth;
+  // For SVC on dropped frame when framedrop_mode != LAYER_DROP:
+  // in this mode the whole superframe may be dropped if only a single layer
+  // has buffer underflow (below threshold). Since this can then lead to
+  // increasing buffer levels/overflow for certain layers even though whole
+  // superframe is dropped, we cap buffer level if it's already stable.
+  if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP &&
+      cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) {
+    cpi->rc.buffer_level = cpi->rc.optimal_buffer_level;
+    cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level;
+  }
 }
 
 static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) {
@@ -1544,10 +2003,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
-  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
   if (!cpi->refresh_alt_ref_frame &&
       (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+       rc->frames_to_key == 0)) {
     cm->frame_type = KEY_FRAME;
     rc->this_key_frame_forced =
         cm->current_video_frame != 0 && rc->frames_to_key == 0;
@@ -1582,9 +2040,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
       // Adjust boost and af_ratio based on avg_frame_low_motion, which varies
       // between 0 and 100 (stationary, 100% zero/small motion).
       rc->gfu_boost =
-          VPXMAX(500,
-                 DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
-                     (rc->avg_frame_low_motion + 100));
+          VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+                          (rc->avg_frame_low_motion + 100));
       rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
     }
     adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
@@ -1684,30 +2141,80 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
   return vp9_rc_clamp_iframe_target_size(cpi, target);
 }
 
+static void set_intra_only_frame(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  // Don't allow intra_only frame for bypass/flexible SVC mode, or if number
+  // of spatial layers is 1 or if number of spatial or temporal layers > 3.
+  // Also if intra-only is inserted on very first frame, don't allow it if
+  // number of temporal layers > 1. This is because on intra-only frame
+  // only 3 reference buffers can be updated, but for temporal layers > 1
+  // we generally need to use buffer slots 4 and 5.
+  if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) ||
+      svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS ||
+      svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 ||
+      svc->number_spatial_layers == 1)
+    return;
+  cm->show_frame = 0;
+  cm->intra_only = 1;
+  cm->frame_type = INTER_FRAME;
+  cpi->ext_refresh_frame_flags_pending = 1;
+  cpi->ext_refresh_last_frame = 1;
+  cpi->ext_refresh_golden_frame = 1;
+  cpi->ext_refresh_alt_ref_frame = 1;
+  if (cm->current_video_frame == 0) {
+    cpi->lst_fb_idx = 0;
+    cpi->gld_fb_idx = 1;
+    cpi->alt_fb_idx = 2;
+  } else {
+    int i;
+    int count = 0;
+    cpi->lst_fb_idx = -1;
+    cpi->gld_fb_idx = -1;
+    cpi->alt_fb_idx = -1;
+    // For intra-only frame we need to refresh all slots that were
+    // being used for the base layer (fb_idx_base[i] == 1).
+    // Start with assigning last first, then golden and then alt.
+    for (i = 0; i < REF_FRAMES; ++i) {
+      if (svc->fb_idx_base[i] == 1) count++;
+      if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i;
+      if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i;
+      if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i;
+    }
+    // If golden or alt is not being used for base layer, then set them
+    // to the lst_fb_idx.
+    if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx;
+    if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx;
+  }
+}
+
 void vp9_rc_get_svc_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  SVC *const svc = &cpi->svc;
   int target = rc->avg_frame_bandwidth;
-  int layer =
-      LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id,
-                       cpi->svc.number_temporal_layers);
+  int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
                                svc->number_temporal_layers);
+  if (svc->first_spatial_layer_to_encode)
+    svc->layer_context[svc->temporal_layer_id].is_key_frame = 0;
   // Periodic key frames is based on the super-frame counter
   // (svc.current_superframe), also only base spatial layer is key frame.
-  if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+  // Key frame is set for any of the following: very first frame, frame flags
+  // indicate key, superframe counter hits key frequency, or (non-intra) sync
+  // flag is set for spatial layer 0.
+  if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) ||
+      (cpi->frame_flags & FRAMEFLAGS_KEY) ||
       (cpi->oxcf.auto_key &&
-       (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) &&
-       cpi->svc.spatial_layer_id == 0)) {
+       (svc->current_superframe % cpi->oxcf.key_freq == 0) &&
+       !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) ||
+      (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
-    if (is_two_pass_svc(cpi)) {
-      cpi->svc.layer_context[layer].is_key_frame = 1;
-      cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
-    } else if (is_one_pass_cbr_svc(cpi)) {
-      if (cm->current_video_frame > 0) vp9_svc_reset_key_frame(cpi);
-      layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id,
-                               cpi->svc.temporal_layer_id,
-                               cpi->svc.number_temporal_layers);
-      cpi->svc.layer_context[layer].is_key_frame = 1;
+    if (is_one_pass_cbr_svc(cpi)) {
+      if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
+      layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
                               svc->number_temporal_layers);
+      svc->layer_context[layer].is_key_frame = 1;
       cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
       // Assumption here is that LAST_FRAME is being updated for a keyframe.
       // Thus no change in update flags.
@@ -1715,48 +2222,127 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
    }
  } else {
    cm->frame_type = INTER_FRAME;
-    if (is_two_pass_svc(cpi)) {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
-      if (cpi->svc.spatial_layer_id == 0) {
-        lc->is_key_frame = 0;
-      } else {
-        lc->is_key_frame =
-            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
-        if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG);
-      }
-      cpi->ref_frame_flags &= (~VP9_ALT_FLAG);
-    } else if (is_one_pass_cbr_svc(cpi)) {
-      LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
-      if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) {
-        lc->is_key_frame = 0;
-      } else {
-        lc->is_key_frame =
-            cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame;
-      }
+    if (is_one_pass_cbr_svc(cpi)) {
+      LAYER_CONTEXT *lc = &svc->layer_context[layer];
+      // Add condition current_video_frame > 0 for the case where first frame
+      // is intra only followed by overlay/copy frame. In this case we don't
+      // want to reset is_key_frame to 0 on overlay/copy frame.
+      lc->is_key_frame =
+          (svc->spatial_layer_id == 0 && cm->current_video_frame > 0)
+              ? 0
+              : svc->layer_context[svc->temporal_layer_id].is_key_frame;
       target = calc_pframe_target_size_one_pass_cbr(cpi);
     }
  }
 
+  if (svc->simulcast_mode) {
+    if (svc->spatial_layer_id > 0 &&
+        svc->layer_context[layer].is_key_frame == 1) {
+      cm->frame_type = KEY_FRAME;
+      cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
+      target = calc_iframe_target_size_one_pass_cbr(cpi);
+    }
+    // Set the buffer idx and refresh flags for key frames in simulcast mode.
+    // Note the buffer slot for long-term reference is set below (line 2255),
+    // and alt_ref is used for that on key frame. So use last and golden for
+    // the other two normal slots.
+    if (cm->frame_type == KEY_FRAME) {
+      if (svc->number_spatial_layers == 2) {
+        if (svc->spatial_layer_id == 0) {
+          cpi->lst_fb_idx = 0;
+          cpi->gld_fb_idx = 2;
+          cpi->alt_fb_idx = 6;
+        } else if (svc->spatial_layer_id == 1) {
+          cpi->lst_fb_idx = 1;
+          cpi->gld_fb_idx = 3;
+          cpi->alt_fb_idx = 6;
+        }
+      } else if (svc->number_spatial_layers == 3) {
+        if (svc->spatial_layer_id == 0) {
+          cpi->lst_fb_idx = 0;
+          cpi->gld_fb_idx = 3;
+          cpi->alt_fb_idx = 6;
+        } else if (svc->spatial_layer_id == 1) {
+          cpi->lst_fb_idx = 1;
+          cpi->gld_fb_idx = 4;
+          cpi->alt_fb_idx = 6;
+        } else if (svc->spatial_layer_id == 2) {
+          cpi->lst_fb_idx = 2;
+          cpi->gld_fb_idx = 5;
+          cpi->alt_fb_idx = 7;
+        }
+      }
+      cpi->ext_refresh_last_frame = 1;
+      cpi->ext_refresh_golden_frame = 1;
+      cpi->ext_refresh_alt_ref_frame = 1;
+    }
+  }
+
+  // Check if superframe contains a sync layer request.
+  vp9_svc_check_spatial_layer_sync(cpi);
+
+  // If long term temporal feature is enabled, set the period of the update.
+  // The update/refresh of this reference frame is always on base temporal
+  // layer frame.
+  if (svc->use_gf_temporal_ref_current_layer) {
+    // Only use gf long-term prediction on non-key superframes.
+    if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+      // Use golden for this reference, which will be used for prediction.
+      int index = svc->spatial_layer_id;
+      if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+      assert(index >= 0);
+      cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+      // Enable prediction off LAST (last reference) and golden (which will
+      // generally be further behind/long-term reference).
+      cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+    }
+    // Check for update/refresh of reference: only refresh on base temporal
+    // layer.
+    if (svc->temporal_layer_id == 0) {
+      if (svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+        // On key frame we update the buffer index used for long term reference.
+        // Use the alt_ref since it is not used or updated on key frames.
+        int index = svc->spatial_layer_id;
+        if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+        assert(index >= 0);
+        cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+        cpi->ext_refresh_alt_ref_frame = 1;
+      } else if (rc->frames_till_gf_update_due == 0) {
+        // Set period of next update. Make it a multiple of 10, as the cyclic
+        // refresh is typically ~10%, and we'd like the update to happen after
+        // a few cycles of the refresh (so it is a better quality frame). Note
+        // the cyclic refresh for SVC only operates on base temporal layer
+        // frames. Choose 20 as period for now (2 cycles).
+        rc->baseline_gf_interval = 20;
+        rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+        cpi->ext_refresh_golden_frame = 1;
+        rc->gfu_boost = DEFAULT_GF_BOOST;
+      }
+    }
+  } else if (!svc->use_gf_temporal_ref) {
+    rc->frames_till_gf_update_due = INT_MAX;
+    rc->baseline_gf_interval = INT_MAX;
+  }
+  if (svc->set_intra_only_frame) {
+    set_intra_only_frame(cpi);
+    target = calc_iframe_target_size_one_pass_cbr(cpi);
+  }
   // Any update/change of global cyclic refresh parameters (amount/delta-qp)
   // should be done here, before the frame qp is selected.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     vp9_cyclic_refresh_update_parameters(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
-  rc->frames_till_gf_update_due = INT_MAX;
-  rc->baseline_gf_interval = INT_MAX;
+  if (cm->show_frame) update_buffer_level_svc_preencode(cpi);
 }
 
 void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
   int target;
-  // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
-  if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
-       rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+  if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+      (cpi->oxcf.auto_key && rc->frames_to_key == 0)) {
     cm->frame_type = KEY_FRAME;
-    rc->this_key_frame_forced =
-        cm->current_video_frame != 0 && rc->frames_to_key == 0;
     rc->frames_to_key = cpi->oxcf.key_freq;
     rc->kf_boost = DEFAULT_KF_BOOST;
     rc->source_alt_ref_active = 0;
@@ -1782,12 +2368,15 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) {
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
     vp9_cyclic_refresh_update_parameters(cpi);
 
-  if (cm->frame_type == KEY_FRAME)
+  if (frame_is_intra_only(cm))
     target = calc_iframe_target_size_one_pass_cbr(cpi);
   else
     target = calc_pframe_target_size_one_pass_cbr(cpi);
 
   vp9_rc_set_frame_target(cpi, target);
+
+  if (cm->show_frame) update_buffer_level_preencode(cpi);
+
   if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC)
     cpi->resize_pending = vp9_resize_one_pass_cbr(cpi);
   else
@@ -1859,13 +2448,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
     rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
         cpi->framerate, rc->min_gf_interval);
 
-    // Extended interval for genuinely static scenes
-    rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
-
-    if (is_altref_enabled(cpi)) {
-      if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
-        rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
-    }
+    // Extended max interval for genuinely static scenes like slide shows.
+    rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
 
     if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
       rc->max_gf_interval = rc->static_scene_max_gf_interval;
@@ -1909,12 +2493,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
       VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
 
   // A maximum bitrate for a frame is defined.
-  // The baseline for this aligns with HW implementations that
-  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
-  // per 16x16 MB (averaged over a frame). However this limit is extended if
-  // a very high rate is given on the command line or the the rate cannnot
-  // be acheived because of a user specificed max q (e.g. when the user
-  // specifies lossless encode.
+  // However this limit is extended if a very high rate is given on the command
+  // line or the rate cannot be achieved because of a user specified max q
+  // (e.g. when the user specifies lossless encode).
+  //
+  // If a level is specified that requires a lower maximum rate then the level
+  // value takes precedence.
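The expression that resumes in the next hunk line caps a single VBR frame at `two_pass_vbrmax_section` percent of the average frame budget. A quick sketch with a worked number (wrapper name hypothetical):

```c
#include <stdint.h>

/* Sketch: with a 50000-bit average frame and two_pass_vbrmax_section = 400
 * (i.e. 400%), a single frame is capped at 200000 bits, matching the
 * vbr_max_bits computation below. */
static int vbr_max_frame_bits(int avg_frame_bandwidth, int vbrmax_section) {
  return (int)(((int64_t)avg_frame_bandwidth * vbrmax_section) / 100);
}
```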
   vbr_max_bits =
       (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
             100);
@@ -2271,30 +2855,56 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
 void vp9_scene_detection_onepass(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
+  YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source;
+  YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source;
+  uint8_t *src_y;
+  int src_ystride;
+  int src_width;
+  int src_height;
+  uint8_t *last_src_y;
+  int last_src_ystride;
+  int last_src_width;
+  int last_src_height;
+  if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL ||
+      (cpi->use_svc && cpi->svc.current_superframe == 0))
+    return;
+  src_y = unscaled_src->y_buffer;
+  src_ystride = unscaled_src->y_stride;
+  src_width = unscaled_src->y_width;
+  src_height = unscaled_src->y_height;
+  last_src_y = unscaled_last_src->y_buffer;
+  last_src_ystride = unscaled_last_src->y_stride;
+  last_src_width = unscaled_last_src->y_width;
+  last_src_height = unscaled_last_src->y_height;
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) return;
 #endif
   rc->high_source_sad = 0;
-  if (cpi->Last_Source != NULL &&
-      cpi->Last_Source->y_width == cpi->Source->y_width &&
-      cpi->Last_Source->y_height == cpi->Source->y_height) {
+  rc->high_num_blocks_with_motion = 0;
+  // For SVC: scene detection is only checked on first spatial layer of
+  // the superframe using the original/unscaled resolutions.
+  if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+      src_width == last_src_width && src_height == last_src_height) {
     YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
-    uint8_t *src_y = cpi->Source->y_buffer;
-    int src_ystride = cpi->Source->y_stride;
-    uint8_t *last_src_y = cpi->Last_Source->y_buffer;
-    int last_src_ystride = cpi->Last_Source->y_stride;
+    int num_mi_cols = cm->mi_cols;
+    int num_mi_rows = cm->mi_rows;
     int start_frame = 0;
     int frames_to_buffer = 1;
     int frame = 0;
     int scene_cut_force_key_frame = 0;
+    int num_zero_temp_sad = 0;
     uint64_t avg_sad_current = 0;
-    uint32_t min_thresh = 4000;
+    uint32_t min_thresh = 10000;
     float thresh = 8.0f;
     uint32_t thresh_key = 140000;
     if (cpi->oxcf.speed <= 5) thresh_key = 240000;
-    if (cpi->oxcf.rc_mode == VPX_VBR) {
-      min_thresh = 65000;
-      thresh = 2.1f;
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000;
+    if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f;
+    if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) {
+      const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2);
+      const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2);
+      num_mi_cols = aligned_width >> MI_SIZE_LOG2;
+      num_mi_rows = aligned_height >> MI_SIZE_LOG2;
     }
     if (cpi->oxcf.lag_in_frames > 0) {
       frames_to_buffer = (cm->current_video_frame == 1)
@@ -2342,14 +2952,15 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
         uint64_t avg_sad = 0;
         uint64_t tmp_sad = 0;
         int num_samples = 0;
-        int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
-        int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+        int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+        int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
         if (cpi->oxcf.lag_in_frames > 0) {
           src_y = frames[frame]->y_buffer;
          src_ystride = frames[frame]->y_stride;
          last_src_y = frames[frame + 1]->y_buffer;
          last_src_ystride = frames[frame + 1]->y_stride;
        }
+        num_zero_temp_sad = 0;
        for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) {
          for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) {
            // Checker-board pattern, ignore boundary.
@@ -2361,6 +2972,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
                                              last_src_ystride);
              avg_sad += tmp_sad;
              num_samples++;
+              if (tmp_sad == 0) num_zero_temp_sad++;
            }
            src_y += 64;
            last_src_y += 64;
@@ -2377,7 +2989,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
          if (avg_sad >
                  VPXMAX(min_thresh,
                         (unsigned int)(rc->avg_source_sad[0] * thresh)) &&
-              rc->frames_since_key > 1)
+              rc->frames_since_key > 1 + cpi->svc.number_spatial_layers &&
+              num_zero_temp_sad < 3 * (num_samples >> 2))
            rc->high_source_sad = 1;
          else
            rc->high_source_sad = 0;
@@ -2388,6 +3001,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
        } else {
          rc->avg_source_sad[lagframe_idx] = avg_sad;
        }
+        if (num_zero_temp_sad < (3 * num_samples >> 2))
+          rc->high_num_blocks_with_motion = 1;
      }
    }
    // For CBR non-screen content mode, check if we should reset the rate
@@ -2407,6 +3022,19 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
      if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad)
        rc->this_frame_target = rc->avg_frame_bandwidth;
    }
+    // For SVC the new (updated) avg_source_sad[0] for the current superframe
+    // updates the setting for all layers.
+    if (cpi->use_svc) {
+      int sl, tl;
+      SVC *const svc = &cpi->svc;
+      for (sl = 0; sl < svc->number_spatial_layers; ++sl)
+        for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+          int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *const lc = &svc->layer_context[layer];
+          RATE_CONTROL *const lrc = &lc->rc;
+          lrc->avg_source_sad[0] = rc->avg_source_sad[0];
+        }
+    }
    // For VBR, under scene change/high content change, force golden refresh.
    if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME &&
        rc->high_source_sad && rc->frames_to_key > 3 &&
@@ -2437,12 +3065,26 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
 
 // Test if encoded frame will significantly overshoot the target bitrate, and
 // if so, set the QP, reset/adjust some rate control parameters, and return 1.
+// frame_size = -1 means frame has not been encoded.
 int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
   VP9_COMMON *const cm = &cpi->common;
   RATE_CONTROL *const rc = &cpi->rc;
-  int thresh_qp = 3 * (rc->worst_quality >> 2);
-  int thresh_rate = rc->avg_frame_bandwidth * 10;
-  if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) {
+  SPEED_FEATURES *const sf = &cpi->sf;
+  int thresh_qp = 7 * (rc->worst_quality >> 3);
+  int thresh_rate = rc->avg_frame_bandwidth << 3;
+  // Lower thresh_qp for video (more overshoot at lower Q) to be
+  // more conservative for video.
+  if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
+    thresh_qp = 3 * (rc->worst_quality >> 2);
+  // If this decision is not based on an encoded frame size but just on
+  // scene/slide change detection (i.e., re_encode_overshoot_cbr_rt ==
+  // FAST_DETECTION_MAXQ), for now skip the (frame_size > thresh_rate)
+  // condition in this case.
+  // TODO(marpan): Use a better size/rate condition for this case and
+  // adjust thresholds.
+  if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ ||
+       frame_size > thresh_rate) &&
+      cm->base_qindex < thresh_qp) {
     double rate_correction_factor =
         cpi->rc.rate_correction_factors[INTER_NORMAL];
     const int target_size = cpi->rc.avg_frame_bandwidth;
@@ -2452,6 +3094,29 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
     int enumerator;
     // Force a re-encode, and for now use max-QP.
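Before the re-encode path resumes (the assignment just below applies the max-QP announced above), a hedged sketch of the overshoot trigger this hunk introduces: re-encoding is only considered when the frame was coded at a fairly low Q yet produced more than eight times the average frame budget, with the FAST_DETECTION_MAXQ path in the patch additionally skipping the size check. Helper name hypothetical:

```c
/* Sketch of the size/Q trigger in vp9_encodedframe_overshoot() above. */
static int overshoot_detected(int base_qindex, int worst_quality,
                              int frame_size, int avg_frame_bandwidth,
                              int is_screen_content) {
  /* Screen content tolerates a higher Q threshold; video is stricter. */
  const int thresh_qp = is_screen_content ? 7 * (worst_quality >> 3)
                                          : 3 * (worst_quality >> 2);
  const int thresh_rate = avg_frame_bandwidth << 3; /* 8x average budget */
  return base_qindex < thresh_qp && frame_size > thresh_rate;
}
```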
     *q = cpi->rc.worst_quality;
+    cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0;
+    cpi->rc.re_encode_maxq_scene_change = 1;
+    // If the frame_size is much larger than the threshold (big content change)
+    // and the encoded frame used a lot of Intra modes, then force hybrid_intra
+    // encoding for the re-encode on this scene change. hybrid_intra will
+    // use rd-based intra mode selection for small blocks.
+    if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ &&
+        frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) {
+      MODE_INFO **mi = cm->mi_grid_visible;
+      int sum_intra_usage = 0;
+      int mi_row, mi_col;
+      int tot = 0;
+      for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+        for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+          if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++;
+          tot++;
+          mi++;
+        }
+        mi += 8;
+      }
+      sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols);
+      if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1;
+    }
     // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as
     // these parameters will affect QP selection for subsequent frames. If they
     // have settled down to a very different (low QP) state, then not adjusting
@@ -2479,21 +3144,27 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
       cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
     }
     // For temporal layers, reset the rate control parametes across all
-    // temporal layers.
+    // temporal layers. If the first_spatial_layer_to_encode > 0, then this
+    // superframe has skipped lower base layers. So in this case we should also
+    // reset and force max-q for spatial layers < first_spatial_layer_to_encode.
     if (cpi->use_svc) {
-      int i = 0;
+      int tl = 0;
+      int sl = 0;
       SVC *svc = &cpi->svc;
-      for (i = 0; i < svc->number_temporal_layers; ++i) {
-        const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i,
-                                           svc->number_temporal_layers);
-        LAYER_CONTEXT *lc = &svc->layer_context[layer];
-        RATE_CONTROL *lrc = &lc->rc;
-        lrc->avg_frame_qindex[INTER_FRAME] = *q;
-        lrc->buffer_level = rc->optimal_buffer_level;
-        lrc->bits_off_target = rc->optimal_buffer_level;
-        lrc->rc_1_frame = 0;
-        lrc->rc_2_frame = 0;
-        lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+      for (sl = 0; sl < svc->first_spatial_layer_to_encode; ++sl) {
+        for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
+          const int layer =
+              LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers);
+          LAYER_CONTEXT *lc = &svc->layer_context[layer];
+          RATE_CONTROL *lrc = &lc->rc;
+          lrc->avg_frame_qindex[INTER_FRAME] = *q;
+          lrc->buffer_level = lrc->optimal_buffer_level;
+          lrc->bits_off_target = lrc->optimal_buffer_level;
+          lrc->rc_1_frame = 0;
+          lrc->rc_2_frame = 0;
+          lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
+          lrc->force_max_q = 1;
+        }
       }
     }
     return 1;
diff --git a/libs/libvpx/vp9/encoder/vp9_ratectrl.h b/libs/libvpx/vp9/encoder/vp9_ratectrl.h
index c1b210677e2..09d69e4d4e1 100644
--- a/libs/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libs/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VP9_ENCODER_VP9_RATECTRL_H_
-#define VP9_ENCODER_VP9_RATECTRL_H_
+#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_
+#define VPX_VP9_ENCODER_VP9_RATECTRL_H_
 
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
@@ -34,6 +34,14 @@ extern "C" {
 
 #define FRAME_OVERHEAD_BITS 200
 
+// Threshold used to define a KF group as static (e.g. a slide show).
+// Essentially this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+
+// The maximum duration of a GF group that is static (for example a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
 typedef enum {
   INTER_NORMAL = 0,
   INTER_HIGH = 1,
@@ -167,15 +175,34 @@ typedef struct {
   uint64_t avg_source_sad[MAX_LAG_BUFFERS];
   uint64_t prev_avg_source_sad_lag;
   int high_source_sad_lagindex;
+  int high_num_blocks_with_motion;
   int alt_ref_gf_group;
   int last_frame_is_src_altref;
   int high_source_sad;
   int count_last_scene_change;
+  int hybrid_intra_scene_change;
+  int re_encode_maxq_scene_change;
   int avg_frame_low_motion;
   int af_ratio_onepass_vbr;
   int force_qpmin;
   int reset_high_source_sad;
   double perc_arf_usage;
+  int force_max_q;
+  // Last frame was dropped post encode on scene change.
+  int last_post_encode_dropped_scene_change;
+  // Enable post encode frame dropping for screen content. Only enabled when
+  // ext_use_post_encode_drop is enabled by user.
+  int use_post_encode_drop;
+  // External flag to enable post encode frame dropping, controlled by user.
+  int ext_use_post_encode_drop;
+
+  int damped_adjustment[RATE_FACTOR_LEVELS];
+  double arf_active_best_quality_adjustment_factor;
+  int arf_active_best_quality_adjustment_window;
+
+  int preserve_arf_as_gld;
+  int preserve_next_arf_as_gld;
+  int show_arf_as_gld;
 } RATE_CONTROL;
 
 struct VP9_COMP;
@@ -184,7 +211,7 @@ struct VP9EncoderConfig;
 void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass,
                  RATE_CONTROL *rc);
 
-int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
                            double correction_factor, vpx_bit_depth_t bit_depth);
 
 double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth);
@@ -195,9 +222,9 @@ void vp9_rc_init_minq_luts(void);
 int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
 // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
-// be passed in to ensure that the max_gf_interval returned is at least as bis
+// be passed in to ensure that the max_gf_interval returned is at least as big
 // as that.
-int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval);
 
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
@@ -237,13 +264,16 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi);
 // Changes only the rate correction factors in the rate control structure.
 void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi);
 
+// Post encode drop for CBR mode.
+int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size);
+
 // Decide if we should drop this frame: For 1-pass CBR.
 // Changes only the decimation count in the rate control structure
 int vp9_rc_drop_frame(struct VP9_COMP *cpi);
 
 // Computes frame size bounds.
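Of the header additions above, `STATIC_KF_GROUP_THRESH` is consumed earlier in this diff by the two-pass key-frame path (`active_best_quality /= 4` when the group is static). A one-line sketch of the predicate under that reading:

```c
/* Sketch: a KF group counts as static when at least 99% of first-pass MBs
 * were coded with 0,0 motion (STATIC_KF_GROUP_THRESH == 99). */
static int kf_group_is_static(int kf_zeromotion_pct) {
  return kf_zeromotion_pct >= 99;
}
```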
void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, - int this_frame_target, + int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); @@ -294,8 +324,12 @@ void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); +void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); + +void vp9_estimate_qp_gop(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RATECTRL_H_ +#endif // VPX_VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_rd.c b/libs/libvpx/vp9/encoder/vp9_rd.c index 6b2306ce9b0..b874a32dd73 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.c +++ b/libs/libvpx/vp9/encoder/vp9_rd.c @@ -57,6 +57,30 @@ void vp9_rd_cost_init(RD_COST *rd_cost) { rd_cost->rdcost = 0; } +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) { + assert(mult >= 0); + assert(div > 0); + if (rate >= 0 && dist >= 0) { + return RDCOST(mult, div, rate, dist); + } + if (rate >= 0 && dist < 0) { + return RDCOST_NEG_D(mult, div, rate, -dist); + } + if (rate < 0 && dist >= 0) { + return RDCOST_NEG_R(mult, div, -rate, dist); + } + return -RDCOST(mult, div, -rate, -dist); +} + +void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) { + if (rd_cost->rate < INT_MAX && rd_cost->dist < INT64_MAX) { + rd_cost->rdcost = + vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist); + } else { + vp9_rd_cost_reset(rd_cost); + } +} + // The baseline rd thresholds for breaking out of the rd loop for // certain modes are assumed to be based on 8x8 blocks. // This table is used to correct for block size. @@ -69,10 +93,12 @@ static void fill_mode_costs(VP9_COMP *cpi) { const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], vp9_intra_mode_tree); + } + } vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); for (i = 0; i < INTRA_MODES; ++i) { @@ -82,9 +108,28 @@ static void fill_mode_costs(VP9_COMP *cpi) { fc->uv_mode_prob[i], vp9_intra_mode_tree); } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { vp9_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } } static void fill_token_costs(vp9_coeff_cost *c, @@ -143,40 +188,74 @@ void vp9_init_me_luts(void) { static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; -static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, - 128, 144 }; -int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { - const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); +// Note that the element below for frame type "USE_BUF_FRAME", which indicates +// that the show frame flag is set, should not be used as no real frame +// is encoded so we should not reach here. 
However, a dummy value +// is inserted here to make sure the data structure has the right number +// of values assigned. +static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144 }; + +int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { + // largest dc_quant is 21387, therefore rdmult should always fit in int32_t + const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); + uint32_t rdmult = q * q; + + if (cpi->common.frame_type != KEY_FRAME) { + if (qindex < 128) + rdmult = rdmult * 4; + else if (qindex < 190) + rdmult = rdmult * 4 + rdmult / 2; + else + rdmult = rdmult * 3; + } else { + if (qindex < 64) + rdmult = rdmult * 4; + else if (qindex <= 128) + rdmult = rdmult * 3 + rdmult / 2; + else if (qindex < 190) + rdmult = rdmult * 4 + rdmult / 2; + else + rdmult = rdmult * 7 + rdmult / 2; + } #if CONFIG_VP9_HIGHBITDEPTH - int64_t rdmult = 0; switch (cpi->common.bit_depth) { - case VPX_BITS_8: rdmult = 88 * q * q / 24; break; - case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; - case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: break; } -#else - int64_t rdmult = 88 * q * q / 24; #endif // CONFIG_VP9_HIGHBITDEPTH - return rdmult; + return rdmult > 0 ? rdmult : 1; } -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { - int64_t rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); - +static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { + int64_t rdmult_64 = rdmult; if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; - const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100)); + const int gfu_boost = cpi->multi_layer_arf + ? gf_group->gfu_boost[gf_group->index] + : cpi->rc.gfu_boost; + const int boost_index = VPXMIN(15, (gfu_boost / 100)); - rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; - rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7; + rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7); } - if (rdmult < 1) rdmult = 1; - return (int)rdmult; + return (int)rdmult_64; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + return modulate_rdmult(cpi, rdmult); +} + +int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) { + int rdmult = + vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex); + rdmult = (int)((double)rdmult / beta); + rdmult = rdmult > 0 ? 
rdmult : 1; + return modulate_rdmult(cpi, rdmult); } static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { @@ -185,10 +264,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; - case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; } #else (void)bit_depth; @@ -209,12 +288,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { x->sadperbit16 = sad_per_bit16lut_10[qindex]; x->sadperbit4 = sad_per_bit4lut_10[qindex]; break; - case VPX_BITS_12: + default: + assert(cpi->common.bit_depth == VPX_BITS_12); x->sadperbit16 = sad_per_bit16lut_12[qindex]; x->sadperbit4 = sad_per_bit4lut_12[qindex]; break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #else (void)cpi; @@ -255,6 +333,15 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { } } +void vp9_build_inter_mode_cost(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + int i; + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + vp9_cost_tokens((int *)cpi->inter_mode_cost[i], cm->fc->inter_mode_probs[i], + vp9_inter_mode_tree); + } +} + void vp9_initialize_rd_consts(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->td.mb; @@ -303,10 +390,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) { x->nmvjointcost, cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc, cm->allow_high_precision_mv); - - for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - vp9_cost_tokens((int *)cpi->inter_mode_cost[i], - cm->fc->inter_mode_probs[i], vp9_inter_mode_tree); + vp9_build_inter_mode_cost(cpi); } } } @@ -471,13 +555,13 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; - case TX_32X32: + default: + assert(tx_size == TX_32X32); for (i = 0; i < num_4x4_w; i += 8) t_above[i] = !!*(const uint64_t *)&above[i]; for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; - default: assert(0 && "Invalid transform size."); break; } } @@ -493,8 +577,7 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; const int num_mv_refs = - MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size); + MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size); MV pred_mv[3]; pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; @@ -504,11 +587,12 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == x->mbmi_ext->ref_mvs[ref_frame][1].as_int; + // Get the sad for each candidate reference mv. 
- for (i = 0; i < num_mv_refs; ++i) { + for (i = 0; i < num_mv_refs && i < MAX_MV_REF_CANDIDATES + 1; ++i) { const MV *this_mv = &pred_mv[i]; int fp_row, fp_col; - + if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue; if (i == 1 && near_same_nearest) continue; fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; @@ -573,6 +657,7 @@ YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, const VP9_COMMON *const cm = &cpi->common; const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ? &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL; diff --git a/libs/libvpx/vp9/encoder/vp9_rd.h b/libs/libvpx/vp9/encoder/vp9_rd.h index 59022c106e2..df6ea9094c4 100644 --- a/libs/libvpx/vp9/encoder/vp9_rd.h +++ b/libs/libvpx/vp9/encoder/vp9_rd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RD_H_ -#define VP9_ENCODER_VP9_RD_H_ +#ifndef VPX_VP9_ENCODER_VP9_RD_H_ +#define VPX_VP9_ENCODER_VP9_RD_H_ #include @@ -27,7 +27,12 @@ extern "C" { #define RD_EPB_SHIFT 6 #define RDCOST(RM, DM, R, D) \ - (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM)) + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) + ((D) << (DM)) +#define RDCOST_NEG_R(RM, DM, R, D) \ + ((D) << (DM)) - ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) +#define RDCOST_NEG_D(RM, DM, R, D) \ + ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), VP9_PROB_COST_SHIFT) - ((D) << (DM)) + #define QIDX_SKIP_THRESH 115 #define MV_COST_WEIGHT 108 @@ -42,6 +47,9 @@ extern "C" { #define RD_THRESH_MAX_FACT 64 #define RD_THRESH_INC 1 +#define VP9_DIST_SCALE_LOG2 4 +#define VP9_DIST_SCALE (1 << VP9_DIST_SCALE_LOG2) + // This enumerator type needs to be kept aligned with the mode order in // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. typedef enum { @@ -98,8 +106,8 @@ typedef enum { typedef struct RD_OPT { // Thresh_mult is used to set a threshold for the rd score. A higher value // means that we will accept the best mode so far more often. This number - // is used in combination with the current block size, and thresh_freq_fact - // to pick a threshold. + // is used in combination with the current block size, and thresh_freq_fact to + // pick a threshold. int thresh_mult[MAX_MODES]; int thresh_mult_sub8x8[MAX_REFS]; @@ -108,9 +116,14 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#if CONFIG_CONSISTENT_RECODE + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#endif int RDMULT; int RDDIV; + double r0; } RD_OPT; typedef struct RD_COST { @@ -123,22 +136,27 @@ typedef struct RD_COST { void vp9_rd_cost_reset(RD_COST *rd_cost); // Initialize the rate distortion cost values to zero. void vp9_rd_cost_init(RD_COST *rd_cost); +// It supports negative rate and dist, which is different from RDCOST(). +int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist); +// Update the cost value based on its rate and distortion. 
+void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost); struct TileInfo; struct TileDataEnc; struct VP9_COMP; struct macroblock; -int64_t vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, - int qindex); +int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); +int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta); + void vp9_initialize_rd_consts(struct VP9_COMP *cpi); void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], @@ -169,8 +187,8 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); -void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, - int best_mode_index); +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, const int *const thresh_fact) { @@ -208,8 +226,10 @@ unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, BLOCK_SIZE bs, int bd); #endif +void vp9_build_inter_mode_cost(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RD_H_ +#endif // VPX_VP9_ENCODER_VP9_RD_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.c b/libs/libvpx/vp9/encoder/vp9_rdopt.c index 2ba6378c5e4..d07d91774bf 100644 --- a/libs/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libs/libvpx/vp9/encoder/vp9_rdopt.c @@ -31,6 +31,9 @@ #include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" +#if !CONFIG_REALTIME_ONLY +#include "vp9/encoder/vp9_aq_variance.h" +#endif #include "vp9/encoder/vp9_cost.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" @@ -40,7 +43,6 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" -#include "vp9/encoder/vp9_aq_variance.h" #define LAST_FRAME_MODE_MASK \ ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | (1 << INTRA_FRAME)) @@ -59,7 +61,9 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; struct rdcost_block_args { const VP9_COMP *cpi; @@ -75,9 +79,12 @@ struct rdcost_block_args { int use_fast_coef_costing; const scan_order *so; uint8_t skippable; + struct buf_2d *this_recon; }; #define LAST_NEW_MV_INDEX 6 + +#if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { NEARESTMV, { LAST_FRAME, NONE } }, { NEARESTMV, { ALTREF_FRAME, NONE } }, @@ -125,6 +132,7 @@ static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, }; +#endif // !CONFIG_REALTIME_ONLY static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, int min_plane, int max_plane) { @@ -151,6 +159,7 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } } +#if !CONFIG_REALTIME_ONLY static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t 
*out_dist_sum, int *skip_txfm_sb, @@ -271,10 +280,11 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } *skip_txfm_sb = skip_flag; - *skip_sse_sb = total_sse << 4; + *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; - *out_dist_sum = dist_sum << 4; + *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; } +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_HIGHBITDEPTH int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, @@ -457,6 +467,66 @@ static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; } +// Copy all visible 4x4s in the transform block. +static void copy_block_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, int blk_row, + int blk_col, const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize) { + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, + pd->subsampling_x, blk_col); + int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, + pd->subsampling_y, blk_row); + const int is_highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH; + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + const int w = tx_4x4_w << 2; + const int h = tx_4x4_h << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src), src_stride, + CONVERT_TO_SHORTPTR(dst), dst_stride, NULL, 0, 0, + 0, 0, w, h, xd->bd); + } else { +#endif + vpx_convolve_copy(src, src_stride, dst, dst_stride, NULL, 0, 0, 0, 0, w, + h); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } else { + int r, c; + int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + const uint8_t *src_ptr = src + r * src_stride * 4 + c * 4; + uint8_t *dst_ptr = dst + r * dst_stride * 4 + c * 4; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_highbd) { + vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(src_ptr), src_stride, + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, + NULL, 0, 0, 0, 0, 4, 4, xd->bd); + } else { +#endif + vpx_convolve_copy(src_ptr, src_stride, dst_ptr, dst_stride, NULL, 0, + 0, 0, 0, 4, 4); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } + } + } + (void)is_highbd; +} + // Compute the pixel domain sum square error on all visible 4x4s in the // transform block. 
static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, @@ -537,12 +607,13 @@ static int64_t sum_squares_visible(const MACROBLOCKD *xd, static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse) { + int64_t *out_sse, struct buf_2d *out_recon) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; - if (x->block_tx_domain) { + if (!out_recon && x->block_tx_domain && eob) { const int ss_txfrm_size = tx_size << 1; int64_t this_sse; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -581,15 +652,23 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const int dst_idx = 4 * (blk_row * dst_stride + blk_col); const uint8_t *src = &p->src.buf[src_idx]; const uint8_t *dst = &pd->dst.buf[dst_idx]; + uint8_t *out_recon_ptr = 0; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t *eob = &p->eobs[block]; unsigned int tmp; tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, blk_col, plane_bsize, tx_bsize); *out_sse = (int64_t)tmp * 16; + if (out_recon) { + const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); + out_recon_ptr = &out_recon->buf[out_recon_idx]; + copy_block_visible(xd, pd, dst, dst_stride, out_recon_ptr, + out_recon->stride, blk_row, blk_col, plane_bsize, + tx_bsize); + } - if (*eob) { + if (eob) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint16_t, recon16[1024]); uint8_t *recon = (uint8_t *)recon16; @@ -602,22 +681,22 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); break; - case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); break; - default: assert(0 && "Invalid transform size"); } } recon = CONVERT_TO_BYTEPTR(recon16); @@ -625,16 +704,16 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { - case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; - case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; - case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break; - case TX_4X4: + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is 
significant (not just an optimization) for // the lossless case. - x->inv_txfm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, eob); break; - default: assert(0 && "Invalid transform size"); break; } #if CONFIG_VP9_HIGHBITDEPTH } @@ -642,6 +721,10 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, tmp = pixel_sse(cpi, xd, pd, src, src_stride, recon, 32, blk_row, blk_col, plane_bsize, tx_bsize); + if (out_recon) { + copy_block_visible(xd, pd, recon, 32, out_recon_ptr, out_recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } } *out_dist = (int64_t)tmp * 16; @@ -666,26 +749,38 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, int64_t sse; const int coeff_ctx = combine_entropy_contexts(args->t_left[blk_row], args->t_above[blk_col]); + struct buf_2d *recon = args->this_recon; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int dst_stride = pd->dst.stride; + const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; if (args->exit_early) return; if (!is_inter_block(mi)) { +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args intra_arg = { + x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0 + }; +#else struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip }; +#endif vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, &intra_arg); + if (recon) { + uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; + copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); + tx_size, &dist, &sse, /*recon =*/0); } else { - const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; const int src_stride = p->src.stride; - const int dst_stride = pd->dst.stride; const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, @@ -699,17 +794,20 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, blk_row, blk_col, plane_bsize, tx_bsize); dist = (int64_t)tmp * 16; } - } else if (max_txsize_lookup[plane_bsize] == tx_size) { - if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_NONE) { + } else { + int skip_txfm_flag = SKIP_TXFM_NONE; + if (max_txsize_lookup[plane_bsize] == tx_size) + skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + + if (skip_txfm_flag == SKIP_TXFM_NONE || + (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); if (x->block_qcoeff_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); - } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_AC_ONLY) { + tx_size, &dist, &sse, recon); + } else if (skip_txfm_flag == 
SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); @@ -735,14 +833,12 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, x->plane[plane].eobs[block] = 0; sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; + if (recon) { + uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; + copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, + blk_row, blk_col, plane_bsize, tx_bsize); + } } - } else { - // full forward transform and quantization - vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); - dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); } rd = RDCOST(x->rdmult, x->rddiv, 0, dist); @@ -761,7 +857,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, rd = VPXMIN(rd1, rd2); if (plane == 0) { x->zcoeff_blk[tx_size][block] = - !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless); + !x->plane[plane].eobs[block] || + (x->sharpness == 0 && rd1 > rd2 && !xd->lossless); x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; } @@ -781,7 +878,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, - TX_SIZE tx_size, int use_fast_coef_casting) { + TX_SIZE tx_size, int use_fast_coef_costing, + struct buf_2d *recon) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; @@ -789,8 +887,9 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, args.cpi = cpi; args.x = x; args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; + args.use_fast_coef_costing = use_fast_coef_costing; args.skippable = 1; + args.this_recon = recon; if (plane == 0) xd->mi[0]->tx_size = tx_size; @@ -815,7 +914,8 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *sse, - int64_t ref_best_rd, BLOCK_SIZE bs) { + int64_t ref_best_rd, BLOCK_SIZE bs, + struct buf_2d *recon) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode]; @@ -825,13 +925,13 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mi->tx_size = VPXMIN(max_tx_size, largest_tx_size); txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs, - mi->tx_size, cpi->sf.use_fast_coef_costing); + mi->tx_size, cpi->sf.use_fast_coef_costing, recon); } static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, int64_t ref_best_rd, - BLOCK_SIZE bs) { + BLOCK_SIZE bs, struct buf_2d *recon) { const TX_SIZE max_tx_size = max_txsize_lookup[bs]; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; @@ -843,20 +943,34 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX } }; - int n, m; + int n; int s0, 
s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = ref_best_rd; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; + const int tx_size_ctx = get_tx_size_context(xd); +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon_buf16[TX_SIZES][64 * 64]); + uint8_t *recon_buf[TX_SIZES]; + for (n = 0; n < TX_SIZES; ++n) { + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon_buf[n] = CONVERT_TO_BYTEPTR(recon_buf16[n]); + } else { + recon_buf[n] = (uint8_t *)recon_buf16[n]; + } + } +#else + DECLARE_ALIGNED(16, uint8_t, recon_buf[TX_SIZES][64 * 64]); +#endif // CONFIG_VP9_HIGHBITDEPTH - const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); if (cm->tx_mode == TX_MODE_SELECT) { start_tx = max_tx_size; - end_tx = 0; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); } else { TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -865,15 +979,17 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int)max_tx_size); m++) { - if (m == n) - r_tx_size += vp9_cost_zero(tx_probs[m]); - else - r_tx_size += vp9_cost_one(tx_probs[m]); + const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; + if (recon) { + struct buf_2d this_recon; + this_recon.buf = recon_buf[n]; + this_recon.stride = recon->stride; + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, &this_recon); + } else { + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, + n, cpi->sf.use_fast_coef_costing, 0); } - txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, - bs, n, cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -915,11 +1031,25 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, *rate = r[mi->tx_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mi->tx_size]; *psse = sse[mi->tx_size]; + if (recon) { +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + memcpy(CONVERT_TO_SHORTPTR(recon->buf), + CONVERT_TO_SHORTPTR(recon_buf[mi->tx_size]), + 64 * 64 * sizeof(uint16_t)); + } else { +#endif + memcpy(recon->buf, recon_buf[mi->tx_size], 64 * 64); +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif + } } static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, - BLOCK_SIZE bs, int64_t ref_best_rd) { + BLOCK_SIZE bs, int64_t ref_best_rd, + struct buf_2d *recon) { MACROBLOCKD *xd = &x->e_mbd; int64_t sse; int64_t *ret_sse = psse ? 
psse : &sse; @@ -928,10 +1058,10 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (cpi->sf.tx_size_search_method == USE_LARGESTALL || xd->lossless) { choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, - bs); + bs, recon); } else { choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd, - bs); + bs, recon); } } @@ -1273,7 +1403,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd); + bsize, best_rd, /*recon = */ 0); if (this_rate_tokenonly == INT_MAX) continue; @@ -1325,7 +1455,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, - plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing); + plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, + /*recon = */ 0); if (pnrate == INT_MAX) { is_cost_valid = 0; break; @@ -1393,6 +1524,7 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, return best_rd; } +#if !CONFIG_REALTIME_ONLY static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE bsize) { @@ -1466,11 +1598,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, if (is_compound) this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; break; - case ZEROMV: + default: + assert(mode == ZEROMV); this_mv[0].as_int = 0; if (is_compound) this_mv[1].as_int = 0; break; - default: break; } mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; @@ -1604,6 +1736,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion); } +#endif // !CONFIG_REALTIME_ONLY typedef struct { int eobs; @@ -1631,6 +1764,7 @@ typedef struct { int mvthresh; } BEST_SEG_INFO; +#if !CONFIG_REALTIME_ONLY static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) { return (mv->row >> 3) < mv_limits->row_min || (mv->row >> 3) > mv_limits->row_max || @@ -1829,8 +1963,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, bestsme = cpi->find_fractional_mv_step( x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph); + cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search); } // Restore the pointer to the first (possibly scaled) prediction buffer. 
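For orientation, the recon buffer threaded through txfm_rd_in_plane() and super_block_yrd() above is what later lets rd_variance_adjustment() (further down in this diff) compare source variance against reconstruction variance. A minimal sketch of the calling pattern, under the same 64x64 maximum block size this diff uses; the wrapper function name is hypothetical:

static void super_block_yrd_with_recon_sketch(VP9_COMP *cpi, MACROBLOCK *x,
                                              BLOCK_SIZE bsize,
                                              int64_t best_rd, int *rate_y,
                                              int64_t *dist_y, int *skip_y) {
  // 64x64 covers the largest VP9 superblock. High-bit-depth builds would
  // use a uint16_t buffer and CONVERT_TO_BYTEPTR(), as the diff itself does.
  DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]);
  struct buf_2d recon_buf;
  recon_buf.buf = recon8;
  recon_buf.stride = 64;
  // Only film content pays for the extra reconstruction copy; other content
  // types pass NULL and keep the old fast path, matching this diff.
  super_block_yrd(cpi, x, rate_y, dist_y, skip_y, /*psse=*/NULL, bsize,
                  best_rd,
                  cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : NULL);
}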
@@ -1884,6 +2018,8 @@ static int64_t rd_pick_best_sub8x8_mode( const BLOCK_SIZE bsize = mi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int pw = num_4x4_blocks_wide << 2; + const int ph = num_4x4_blocks_high << 2; ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; SPEED_FEATURES *const sf = &cpi->sf; @@ -1992,8 +2128,11 @@ static int64_t rd_pick_best_sub8x8_mode( mvp_full.col = bsi->mvp.as_mv.col >> 3; if (sf->adaptive_motion_search) { - mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; - mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX && + x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) { + mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + } step_param = VPXMAX(step_param, 8); } @@ -2015,16 +2154,16 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->find_fractional_mv_step( x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop, - sf->mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &distortion, - &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0); + &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; } - if (sf->adaptive_motion_search) - x->pred_mv[mi->ref_frame[0]] = *new_mv; + x->pred_mv[mi->ref_frame[0]] = *new_mv; // restore src pointers mi_buf_restore(x, orig_src, orig_pre); @@ -2319,6 +2458,22 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, block_size); } +#if CONFIG_NON_GREEDY_MV +static int ref_frame_to_gf_rf_idx(int ref_frame) { + if (ref_frame == GOLDEN_FRAME) { + return 0; + } + if (ref_frame == LAST_FRAME) { + return 1; + } + if (ref_frame == ALTREF_FRAME) { + return 2; + } + assert(0); + return -1; +} +#endif + static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { @@ -2326,19 +2481,35 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const VP9_COMMON *cm = &cpi->common; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; - int bestsme = INT_MAX; int step_param; - int sadpb = x->sadperbit16; MV mvp_full; int ref = mi->ref_frame[0]; MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; const MvLimits tmp_mv_limits = x->mv_limits; int cost_list[5]; - + const int best_predmv_idx = x->mv_best_ref_index[ref]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; MV pred_mv[3]; + +#if CONFIG_NON_GREEDY_MV + double bestsme; + int_mv nb_full_mvs[NB_MVS_NUM]; + const int nb_full_mv_num = NB_MVS_NUM; + int gf_group_idx = cpi->twopass.gf_group.index; + int gf_rf_idx = ref_frame_to_gf_rf_idx(ref); + BLOCK_SIZE square_bsize = get_square_block_size(bsize); + const int lambda = (pw * ph) / 4; + assert(pw * ph == lambda << 2); + vp9_prepare_nb_full_mvs(&cpi->tpl_stats[gf_group_idx], mi_row, mi_col, + gf_rf_idx, square_bsize, nb_full_mvs); +#else // CONFIG_NON_GREEDY_MV + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; +#endif // CONFIG_NON_GREEDY_MV + 
pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; pred_mv[2] = x->pred_mv[ref]; @@ -2367,7 +2538,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { - int boffset = + const int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] - VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); step_param = VPXMAX(step_param, boffset); @@ -2385,8 +2556,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int i; for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - x->pred_mv[ref].row = 0; - x->pred_mv[ref].col = 0; + x->pred_mv[ref].row = INT16_MAX; + x->pred_mv[ref].col = INT16_MAX; tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { @@ -2404,14 +2575,69 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // after full-pixel motion search. vp9_set_mv_search_range(&x->mv_limits, &ref_mv); - mvp_full = pred_mv[x->mv_best_ref_index[ref]]; - + mvp_full = pred_mv[best_predmv_idx]; mvp_full.col >>= 3; mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_diamond_new(cpi, x, &mvp_full, step_param, lambda, 1, + &cpi->fn_ptr[bsize], nb_full_mvs, + nb_full_mv_num, &tmp_mv->as_mv); +#else // CONFIG_NON_GREEDY_MV bestsme = vp9_full_pixel_search( cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + + if (cpi->sf.enhanced_full_pixel_motion_search) { + int i; + for (i = 0; i < 3; ++i) { +#if CONFIG_NON_GREEDY_MV + double this_me; +#else // CONFIG_NON_GREEDY_MV + int this_me; +#endif // CONFIG_NON_GREEDY_MV + MV this_mv; + int diff_row; + int diff_col; + int step; + + if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue; + if (i == best_predmv_idx) continue; + + diff_row = ((int)pred_mv[i].row - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >> + 3; + diff_col = ((int)pred_mv[i].col - + pred_mv[i > 0 ? 
(i - 1) : best_predmv_idx].col) >> + 3; + if (diff_row == 0 && diff_col == 0) continue; + if (diff_row < 0) diff_row = -diff_row; + if (diff_col < 0) diff_col = -diff_col; + step = get_msb((diff_row + diff_col + 1) >> 1); + if (step <= 0) continue; + + mvp_full = pred_mv[i]; + mvp_full.col >>= 3; + mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_diamond_new( + cpi, x, &mvp_full, VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + lambda, 1, &cpi->fn_ptr[bsize], nb_full_mvs, nb_full_mv_num, + &this_mv); +#else // CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &this_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + if (this_me < bestsme) { + tmp_mv->as_mv = this_mv; + bestsme = this_me; + } + } + } x->mv_limits = tmp_mv_limits; @@ -2420,13 +2646,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = tmp_mv->as_mv; + x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { int i; @@ -2453,21 +2680,56 @@ static INLINE void restore_dst_buf(MACROBLOCKD *xd, // visual quality. 
static int discount_newmv_test(const VP9_COMP *cpi, int this_mode, int_mv this_mv, - int_mv (*mode_mv)[MAX_REF_FRAMES], - int ref_frame) { + int_mv (*mode_mv)[MAX_REF_FRAMES], int ref_frame, + int mi_row, int mi_col, BLOCK_SIZE bsize) { +#if CONFIG_NON_GREEDY_MV + (void)mode_mv; + (void)this_mv; + if (this_mode == NEWMV && bsize >= BLOCK_8X8 && cpi->tpl_ready) { + const int gf_group_idx = cpi->twopass.gf_group.index; + const int gf_rf_idx = ref_frame_to_gf_rf_idx(ref_frame); + const TplDepFrame tpl_frame = cpi->tpl_stats[gf_group_idx]; + const int tpl_block_mi_h = num_8x8_blocks_high_lookup[cpi->tpl_bsize]; + const int tpl_block_mi_w = num_8x8_blocks_wide_lookup[cpi->tpl_bsize]; + const int tpl_mi_row = mi_row - (mi_row % tpl_block_mi_h); + const int tpl_mi_col = mi_col - (mi_col % tpl_block_mi_w); + const int mv_mode = + tpl_frame + .mv_mode_arr[gf_rf_idx][tpl_mi_row * tpl_frame.stride + tpl_mi_col]; + if (mv_mode == NEW_MV_MODE) { + int_mv tpl_new_mv = *get_pyramid_mv(&tpl_frame, gf_rf_idx, cpi->tpl_bsize, + tpl_mi_row, tpl_mi_col); + int row_diff = abs(tpl_new_mv.as_mv.row - this_mv.as_mv.row); + int col_diff = abs(tpl_new_mv.as_mv.col - this_mv.as_mv.col); + if (VPXMAX(row_diff, col_diff) <= 8) { + return 1; + } else { + return 0; + } + } else { + return 0; + } + } else { + return 0; + } +#else + (void)mi_row; + (void)mi_col; + (void)bsize; return (!cpi->rc.is_src_frame_alt_ref && (this_mode == NEWMV) && (this_mv.as_int != 0) && ((mode_mv[NEARESTMV][ref_frame].as_int == 0) || (mode_mv[NEARESTMV][ref_frame].as_int == INVALID_MV)) && ((mode_mv[NEARMV][ref_frame].as_int == 0) || (mode_mv[NEARMV][ref_frame].as_int == INVALID_MV))); +#endif } static int64_t handle_inter_mode( VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int *rate2, int64_t *distortion, int *skippable, int *rate_y, int *rate_uv, - int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, - int mi_col, int_mv single_newmv[MAX_REF_FRAMES], + struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], + int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { @@ -2573,7 +2835,8 @@ static int64_t handle_inter_mode( // under certain circumstances where we want to help initiate a weak // motion field, where the distortion gain for a single block may not // be enough to overcome the cost of a new mv. - if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) { + if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0], mi_row, + mi_col, bsize)) { *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1); } else { *rate2 += rate_mv; @@ -2606,8 +2869,8 @@ static int64_t handle_inter_mode( // // Under some circumstances we discount the cost of new mv mode to encourage // initiation of a motion field. 
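// For example, if NEW_MV_DISCOUNT_FACTOR were 8, a NEWMV whose motion vector
// costs 240 bits would be charged only VPXMAX(240 / 8, 1) = 30 bits here, so
// an otherwise marginal new mv can win the RD comparison and seed a motion
// field for neighboring blocks. (Illustrative numbers only; the factor's
// actual value is defined elsewhere in this file.)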
- if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, - refs[0])) { + if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]], mode_mv, refs[0], + mi_row, mi_col, bsize)) { *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]), cost_mv_ref(cpi, NEARESTMV, mbmi_ext->mode_context[refs[0]])); @@ -2771,7 +3034,7 @@ static int64_t handle_inter_mode( memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); memcpy(x->bsse, bsse, sizeof(bsse)); - if (!skip_txfm_sb) { + if (!skip_txfm_sb || xd->lossless) { int skippable_y, skippable_uv; int64_t sseuv = INT64_MAX; int64_t rdcosty = INT64_MAX; @@ -2779,7 +3042,7 @@ static int64_t handle_inter_mode( // Y cost and distortion vp9_subtract_plane(x, bsize, 0); super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse, bsize, - ref_best_rd); + ref_best_rd, recon); if (*rate_y == INT_MAX) { *rate2 = INT_MAX; @@ -2821,6 +3084,7 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); return 0; // The rate-distortion cost will be re-calculated by caller. } +#endif // !CONFIG_REALTIME_ONLY void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, @@ -2874,85 +3138,97 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); } +#if !CONFIG_REALTIME_ONLY // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. -#define VERY_LOW_VAR_THRESH 2 -#define LOW_VAR_THRESH 5 -#define VAR_MULT 100 -static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 }; +#define LOW_VAR_THRESH 250 +#define VAR_MULT 250 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 250 }; static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, + struct buf_2d *recon, MV_REFERENCE_FRAME ref_frame, - unsigned int source_variance) { + MV_REFERENCE_FRAME second_ref_frame, + PREDICTION_MODE this_mode) { MACROBLOCKD *const xd = &x->e_mbd; unsigned int rec_variance; unsigned int src_variance; unsigned int src_rec_min; - unsigned int absvar_diff = 0; + unsigned int var_diff = 0; unsigned int var_factor = 0; unsigned int adj_max; + unsigned int low_var_thresh = LOW_VAR_THRESH; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - if (source_variance > 0) { - rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, - bsize, xd->bd); - src_variance = source_variance; - } else { - rec_variance = - vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); - src_variance = - vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); - } + rec_variance = vp9_high_get_sby_variance(cpi, recon, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); } else { - if (source_variance > 0) { - rec_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = source_variance; - } else { - rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); - } + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = 
vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); } #else - if (source_variance > 0) { - rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = source_variance; + rec_variance = vp9_get_sby_variance(cpi, recon, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Scale based on area in 8x8 blocks + rec_variance /= (bw * bh); + src_variance /= (bw * bh); + + if (content_type == VP9E_CONTENT_FILM) { + if (cpi->oxcf.pass == 2) { + // Adjust low variance threshold based on estimated group noise energy. + double noise_factor = + (double)cpi->twopass.gf_group.group_noise_energy / SECTION_NOISE_DEF; + low_var_thresh = (unsigned int)(low_var_thresh * noise_factor); + + if (ref_frame == INTRA_FRAME) { + low_var_thresh *= 2; + if (this_mode == DC_PRED) low_var_thresh *= 5; + } else if (second_ref_frame > INTRA_FRAME) { + low_var_thresh *= 2; + } + } } else { - rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); - src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + low_var_thresh = LOW_VAR_THRESH / 2; } -#endif // CONFIG_VP9_HIGHBITDEPTH // Lower of source (raw per pixel value) and recon variance. Note that // if the source per pixel is 0 then the recon value here will not be per // pixel (see above) so will likely be much larger. - src_rec_min = VPXMIN(source_variance, rec_variance); + src_rec_min = VPXMIN(src_variance, rec_variance); - if (src_rec_min > LOW_VAR_THRESH) return; + if (src_rec_min > low_var_thresh) return; - absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) : (rec_variance - src_variance); + // We care more when the reconstruction has lower variance so give this case + // a stronger weighting. + var_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) * 2 : (rec_variance - src_variance) / 2; adj_max = max_var_adjust[content_type]; var_factor = - (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance); + (unsigned int)((int64_t)VAR_MULT * var_diff) / VPXMAX(1, src_variance); var_factor = VPXMIN(adj_max, var_factor); + if ((content_type == VP9E_CONTENT_FILM) && + ((ref_frame == INTRA_FRAME) || (second_ref_frame > INTRA_FRAME))) { + var_factor *= 2; + } + *this_rd += (*this_rd * var_factor) / 100; - if (content_type == VP9E_CONTENT_FILM) { - if (src_rec_min <= VERY_LOW_VAR_THRESH) { - if (ref_frame == INTRA_FRAME) *this_rd *= 2; - if (bsize > 6) *this_rd *= 2; - } - } + (void)xd; } +#endif // !CONFIG_REALTIME_ONLY // Do we have an internal image edge (e.g. formatting bars).
int vp9_internal_image_edge(VP9_COMP *cpi) { @@ -3023,6 +3299,7 @@ int vp9_active_edge_sb(VP9_COMP *cpi, int mi_row, int mi_col) { vp9_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE); } +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3066,20 +3343,36 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int intra_cost_penalty = vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; - uint8_t ref_frame_skip_mask[2] = { 0 }; + uint8_t ref_frame_skip_mask[2] = { 0, 1 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; int mode_skip_start = sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *tile_mode_map = tile_data->mode_map[bsize]; - int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid - // lock mechanism involved with reads from - // tile_mode_map + int8_t *tile_mode_map = tile_data->mode_map[bsize]; + int8_t mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int mode_search_skip_flags = sf->mode_search_skip_flags; + const int is_rect_partition = + num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize]; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; + struct buf_2d *recon; + struct buf_2d recon_buf; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, recon16[64 * 64]); + recon_buf.buf = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH + ? CONVERT_TO_BYTEPTR(recon16) + : (uint8_t *)recon16; +#else + DECLARE_ALIGNED(16, uint8_t, recon8[64 * 64]); + recon_buf.buf = recon8; +#endif // CONFIG_VP9_HIGHBITDEPTH + recon_buf.stride = 64; + recon = cpi->oxcf.content == VP9E_CONTENT_FILM ? &recon_buf : 0; + vp9_zero(best_mbmode); x->skip_encode = sf->skip_encode_frame && x->q_index < QIDX_SKIP_THRESH; @@ -3105,7 +3398,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); @@ -3228,18 +3522,21 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_zero(x->sum_y_eobs); + if (is_rect_partition) { + if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; + if (second_ref_frame > 0 && + (ctx->skip_ref_frame_mask & (1 << second_ref_frame))) + continue; + } + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. 
if (midx == mode_skip_start && best_mode_index >= 0) { switch (best_mbmode.ref_frame[0]) { case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; + case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break; case GOLDEN_FRAME: ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; case NONE: @@ -3313,6 +3610,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -3339,7 +3640,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // Disable intra modes other than DC_PRED for blocks with low variance // Threshold for intra skipping based on source variance // TODO(debargha): Specialize the threshold for super block sizes - const unsigned int skip_intra_var_thresh = 64; + const unsigned int skip_intra_var_thresh = + (cpi->oxcf.content == VP9E_CONTENT_FILM) ? 0 : 64; if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && x->source_variance < skip_intra_var_thresh) continue; @@ -3385,7 +3687,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[1]; memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, - best_rd); + best_rd, recon); if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] @@ -3408,7 +3710,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } else { this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, - &disable_skip, frame_mv, mi_row, mi_col, single_newmv, + recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, single_inter_filter, single_skippable, &total_sse, best_rd, &mask_filter, filter_cache); if (this_rd == INT64_MAX) continue; @@ -3437,7 +3739,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, // Cost the skip mb case rate2 += skip_cost1; - } else if (ref_frame != INTRA_FRAME && !xd->lossless) { + } else if (ref_frame != INTRA_FRAME && !xd->lossless && + !cpi->oxcf.sharpness) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv + skip_cost0, distortion2) < RDCOST(x->rdmult, x->rddiv, skip_cost1, total_sse)) { @@ -3461,10 +3764,39 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); } - // Apply an adjustment to the rd value based on the similarity of the - // source variance and reconstructed variance. - rd_variance_adjustment(cpi, x, bsize, &this_rd, ref_frame, - x->source_variance); + if (recon) { + // In film mode bias against DC pred and other intra if there is a + // significant difference between the variance of the sub blocks in + // the source. Also apply some bias against compound modes which also + // tend to blur fine texture such as film grain over time. + // + // The sub block test here acts in the case where one or more sub + // blocks have relatively high variance but others relatively low + // variance.
Here the high variance sub blocks may push the + // total variance for the current block size over the thresholds + // used in rd_variance_adjustment() below. + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + if (max_energy > min_energy) { + if (ref_frame == INTRA_FRAME) { + if (this_mode == DC_PRED) + this_rd += (this_rd * (max_energy - min_energy)); + else + this_rd += (this_rd * (max_energy - min_energy)) / 4; + } else if (second_ref_frame > INTRA_FRAME) { + this_rd += this_rd / 4; + } + } + } + } + // Apply an adjustment to the rd value based on the similarity of the + // source variance and reconstructed variance. + rd_variance_adjustment(cpi, x, bsize, &this_rd, recon, ref_frame, + second_ref_frame, this_mode); + } if (ref_frame == INTRA_FRAME) { // Keep record of best intra rd @@ -3616,9 +3948,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { - // If adaptive interp filter is enabled, then the current leaf node of 8x8 - // data is needed for sub8x8. Hence preserve the context. +// If adaptive interp filter is enabled, then the current leaf node of 8x8 +// data is needed for sub8x8. Hence preserve the context. +#if CONFIG_CONSISTENT_RECODE + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#else if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3894,7 +4230,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_BETTER_HW_COMPATIBILITY // forbid 8X4 and 4X8 partitions if any reference frame is scaled. if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { - int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); + int ref_scaled = ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); if (second_ref_frame > INTRA_FRAME) ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf); if (ref_scaled) continue; @@ -3940,6 +4277,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. @@ -4418,3 +4760,4 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, store_coding_context(x, ctx, best_ref_index, best_pred_diff, best_filter_diff, 0); } +#endif // !CONFIG_REALTIME_ONLY diff --git a/libs/libvpx/vp9/encoder/vp9_rdopt.h b/libs/libvpx/vp9/encoder/vp9_rdopt.h index 795c91aef7a..e1147ff9438 100644 --- a/libs/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libs/libvpx/vp9/encoder/vp9_rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
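Stepping back to the film-mode bias applied in the hunk above: DC prediction is penalized hardest, other intra modes less, and compound inter modes by a flat quarter, whenever the sub-block energies disagree. A minimal sketch of that adjustment; the function name and flag parameters are illustrative, the arithmetic mirrors the diff.

#include <stdint.h>

static int64_t film_bias_rd(int64_t rd, int is_intra, int is_dc_pred,
                            int is_compound, int min_energy, int max_energy) {
  const int spread = max_energy - min_energy;
  if (spread <= 0) return rd;            /* uniform block: leave rd alone */
  if (is_intra)
    return is_dc_pred ? rd + rd * spread /* strongest bias against DC */
                      : rd + (rd * spread) / 4;
  if (is_compound) return rd + rd / 4;   /* mild bias against compound */
  return rd;
}

The effect is to steer film content away from modes that smooth grain, leaving the ordinary rd_variance_adjustment() to handle the remaining cases.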
*/ -#ifndef VP9_ENCODER_VP9_RDOPT_H_ -#define VP9_ENCODER_VP9_RDOPT_H_ +#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_ +#define VPX_VP9_ENCODER_VP9_RDOPT_H_ #include "vp9/common/vp9_blockd.h" @@ -29,6 +29,7 @@ void vp9_rd_pick_intra_mode_sb(struct VP9_COMP *cpi, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd); +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sb(struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, @@ -39,21 +40,24 @@ void vp9_rd_pick_inter_mode_sb_seg_skip( struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +#endif int vp9_internal_image_edge(struct VP9_COMP *cpi); int vp9_active_h_edge(struct VP9_COMP *cpi, int mi_row, int mi_step); int vp9_active_v_edge(struct VP9_COMP *cpi, int mi_col, int mi_step); int vp9_active_edge_sb(struct VP9_COMP *cpi, int mi_row, int mi_col); +#if !CONFIG_REALTIME_ONLY void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, struct TileDataEnc *tile_data, struct macroblock *x, int mi_row, int mi_col, struct RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far); +#endif #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RDOPT_H_ +#endif // VPX_VP9_ENCODER_VP9_RDOPT_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_resize.c b/libs/libvpx/vp9/encoder/vp9_resize.c index f6c4aad4d34..7486dee25b8 100644 --- a/libs/libvpx/vp9/encoder/vp9_resize.c +++ b/libs/libvpx/vp9/encoder/vp9_resize.c @@ -424,11 +424,11 @@ void vp9_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; - uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height); + uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf)); uint8_t *tmpbuf = - (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width)); - uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height); - uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2); + (uint8_t *)calloc(width < height ? height : width, sizeof(*tmpbuf)); + uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf)); + uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2)); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; assert(width > 0); @@ -506,10 +506,12 @@ static void highbd_interpolate(const uint16_t *const input, int inlength, sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; filter = interp_filters[sub_pel]; sum = 0; - for (k = 0; k < INTERP_TAPS; ++k) + for (k = 0; k < INTERP_TAPS; ++k) { + assert(int_pel - INTERP_TAPS / 2 + 1 + k < inlength); sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ? 0 : int_pel - INTERP_TAPS / 2 + 1 + k)]; + } *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd); } // Middle part. 
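On the vp9_resize_plane() hunk above: switching malloc to calloc zero-fills the intermediate buffers, so any lane the filter touches before it is written (the case the new assert in highbd_interpolate() probes) holds a defined value, and sizeof(*ptr) ties the element size to the pointee type. A minimal sketch of that allocation pattern, with illustrative names rather than the libvpx API.

#include <stdint.h>
#include <stdlib.h>

static int alloc_resize_scratch(int width, int width2, int height, int height2,
                                uint8_t **intbuf, uint8_t **tmpbuf,
                                uint8_t **arrbuf, uint8_t **arrbuf2) {
  /* calloc(n, sizeof(*p)) zero-fills, so a read-before-write yields 0
   * rather than heap garbage. */
  *intbuf = (uint8_t *)calloc((size_t)width2 * height, sizeof(**intbuf));
  *tmpbuf = (uint8_t *)calloc(width < height ? height : width, sizeof(**tmpbuf));
  *arrbuf = (uint8_t *)calloc(height, sizeof(**arrbuf));
  *arrbuf2 = (uint8_t *)calloc(height2, sizeof(**arrbuf2));
  if (!*intbuf || !*tmpbuf || !*arrbuf || !*arrbuf2) {
    free(*intbuf); free(*tmpbuf); free(*arrbuf); free(*arrbuf2);
    return -1; /* caller treats this like the goto Error path */
  }
  return 0;
}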
@@ -720,6 +722,10 @@ void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); diff --git a/libs/libvpx/vp9/encoder/vp9_resize.h b/libs/libvpx/vp9/encoder/vp9_resize.h index d3282ee1919..5d4ce97eba9 100644 --- a/libs/libvpx/vp9/encoder/vp9_resize.h +++ b/libs/libvpx/vp9/encoder/vp9_resize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RESIZE_H_ -#define VP9_ENCODER_VP9_RESIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_ +#define VPX_VP9_ENCODER_VP9_RESIZE_H_ #include #include "vpx/vpx_integer.h" @@ -65,4 +65,4 @@ void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RESIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_RESIZE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_segmentation.c b/libs/libvpx/vp9/encoder/vp9_segmentation.c index 4a5a68e07a1..a163297e6e1 100644 --- a/libs/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libs/libvpx/vp9/encoder/vp9_segmentation.c @@ -9,6 +9,7 @@ */ #include +#include #include "vpx_mem/vpx_mem.h" @@ -46,6 +47,59 @@ void vp9_clear_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = 0; } +void vp9_psnr_aq_mode_setup(struct segmentation *seg) { + int i; + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < MAX_SEGMENTS; ++i) { + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2))); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg) { + const VP9_COMMON *cm = &cpi->common; + const int seg_counts = cpi->kmeans_ctr_num; + const int base_qindex = cm->base_qindex; + const double base_qstep = vp9_convert_qindex_to_q(base_qindex, cm->bit_depth); + const double mid_ctr = cpi->kmeans_ctr_ls[seg_counts / 2]; + const double var_diff_scale = 4.0; + int i; + + assert(seg_counts <= MAX_SEGMENTS); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < seg_counts / 2; ++i) { + double wiener_var_diff = mid_ctr - cpi->kmeans_ctr_ls[i]; + double target_qstep = base_qstep / (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 0); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + + for (; i < seg_counts; ++i) { + double wiener_var_diff = cpi->kmeans_ctr_ls[i] - mid_ctr; + double target_qstep = base_qstep * (1.0 + wiener_var_diff / var_diff_scale); + int target_qindex = vp9_convert_q_to_qindex(target_qstep, cm->bit_depth); + assert(wiener_var_diff >= 0.0); + + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, target_qindex - base_qindex); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + // Based on set of segment counts calculate a probability tree static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) { 
// Work out probabilities of each segment diff --git a/libs/libvpx/vp9/encoder/vp9_segmentation.h b/libs/libvpx/vp9/encoder/vp9_segmentation.h index 562805543b8..9404c38bc89 100644 --- a/libs/libvpx/vp9/encoder/vp9_segmentation.h +++ b/libs/libvpx/vp9/encoder/vp9_segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SEGMENTATION_H_ -#define VP9_ENCODER_VP9_SEGMENTATION_H_ +#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ +#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_encoder.h" @@ -26,6 +26,11 @@ void vp9_disable_segfeature(struct segmentation *seg, int segment_id, void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); +void vp9_psnr_aq_mode_setup(struct segmentation *seg); + +void vp9_perceptual_aq_mode_setup(struct VP9_COMP *cpi, + struct segmentation *seg); + // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. // @@ -47,4 +52,4 @@ void vp9_reset_segment_features(struct segmentation *seg); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SEGMENTATION_H_ +#endif // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_skin_detection.h b/libs/libvpx/vp9/encoder/vp9_skin_detection.h index 8880bff466a..46a722af9bf 100644 --- a/libs/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libs/libvpx/vp9/encoder/vp9_skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_ -#define VP9_ENCODER_VP9_SKIN_MAP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ +#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ #include "vp9/common/vp9_blockd.h" #include "vpx_dsp/skin_detection.h" @@ -37,4 +37,4 @@ void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SKIN_MAP_H_ +#endif // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.c b/libs/libvpx/vp9/encoder/vp9_speed_features.c index a05db60c652..529dca0406b 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.c @@ -20,6 +20,7 @@ static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }; +#if !CONFIG_REALTIME_ONLY // Define 3 mesh density levels to control the number of searches. 
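On vp9_perceptual_aq_mode_setup() above: each k-means center's distance from the middle center scales the base quantizer step down (finer quantization) below the midpoint and up (coarser) above it, and the result is stored as a per-segment qindex delta. A minimal sketch of that mapping; the linear q_of()/qindex_of() pair is a hypothetical stand-in for vp9_convert_qindex_to_q()/vp9_convert_q_to_qindex(), which are table-driven and non-linear in libvpx.

static double q_of(int qindex) { return 0.25 * qindex + 1.0; }
static int qindex_of(double q) { return (int)((q - 1.0) / 0.25 + 0.5); }

static int perceptual_delta_q(int base_qindex, double ctr, double mid_ctr) {
  const double scale = 4.0; /* var_diff_scale in the hunk above */
  const double base_qstep = q_of(base_qindex);
  const double diff = ctr - mid_ctr;
  /* Segments below the midpoint get a smaller qstep, segments above it a
   * larger one; the midpoint segment itself keeps a delta of 0. */
  const double target = diff < 0 ? base_qstep / (1.0 - diff / scale)
                                 : base_qstep * (1.0 + diff / scale);
  return qindex_of(target) - base_qindex;
}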
#define MESH_DENSITY_LEVELS 3 static MESH_PATTERN @@ -32,7 +33,7 @@ static MESH_PATTERN // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const VP9_COMP *cpi) { - return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi); + return frame_is_kf_gf_arf(cpi); } // Sets a partition size down to which the auto partition code will always @@ -61,46 +62,92 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { VP9_COMMON *const cm = &cpi->common; + const int min_frame_size = VPXMIN(cm->width, cm->height); + const int is_480p_or_larger = min_frame_size >= 480; + const int is_720p_or_larger = min_frame_size >= 720; + const int is_1080p_or_larger = min_frame_size >= 1080; + const int is_2160p_or_larger = min_frame_size >= 2160; // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); sf->partition_search_breakout_thr.rate = 80; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; - // Currently, the machine-learning based partition search early termination - // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. - if (VPXMIN(cm->width, cm->height) >= 480) { - sf->ml_partition_search_early_termination = 1; + if (is_480p_or_larger) { + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. + sf->rd_ml_partition.search_early_termination = 1; + } else { + sf->use_square_only_thresh_high = BLOCK_32X32; } - if (speed >= 1) { - sf->ml_partition_search_early_termination = 0; + if (!is_1080p_or_larger) { + sf->rd_ml_partition.search_breakout = 1; + if (is_720p_or_larger) { + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = 0.0f; + } else { + sf->rd_ml_partition.search_breakout_thresh[0] = 2.5f; + sf->rd_ml_partition.search_breakout_thresh[1] = 1.5f; + sf->rd_ml_partition.search_breakout_thresh[2] = 1.5f; + } + } - if (VPXMIN(cm->width, cm->height) >= 720) { + if (speed >= 1) { + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 1; + if (is_480p_or_larger) + sf->use_square_only_thresh_high = BLOCK_64X64; + else + sf->use_square_only_thresh_high = BLOCK_32X32; + sf->use_square_only_thresh_low = BLOCK_16X16; + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? 
DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_thr.dist = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->rd_ml_partition.search_breakout_thresh[0] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -5.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -9.0f; } else { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->partition_search_breakout_thr.dist = (1 << 21); + sf->rd_ml_partition.search_breakout_thresh[0] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -1.0f; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) { + sf->rd_ml_partition.search_breakout_thresh[0] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[1] -= 1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] -= 1.0f; } +#endif // CONFIG_VP9_HIGHBITDEPTH } if (speed >= 2) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->use_square_only_thresh_high = BLOCK_4X4; + sf->use_square_only_thresh_low = BLOCK_SIZES; + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; sf->partition_search_breakout_thr.dist = (1 << 24); sf->partition_search_breakout_thr.rate = 120; + sf->rd_ml_partition.search_breakout = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->partition_search_breakout_thr.dist = (1 << 22); sf->partition_search_breakout_thr.rate = 100; + sf->rd_ml_partition.search_breakout_thresh[0] = 0.0f; + sf->rd_ml_partition.search_breakout_thresh[1] = -1.0f; + sf->rd_ml_partition.search_breakout_thresh[2] = -4.0f; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); // Use a set of speed features for 4k videos. - if (VPXMIN(cm->width, cm->height) >= 2160) { + if (is_2160p_or_larger) { sf->use_square_partition_only = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; @@ -112,7 +159,8 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 3) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->rd_ml_partition.search_breakout = 0; + if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 
1 : 0; sf->partition_search_breakout_thr.dist = (1 << 25); @@ -137,7 +185,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, if (speed >= 4) { sf->partition_search_breakout_thr.rate = 300; - if (VPXMIN(cm->width, cm->height) >= 720) { + if (is_720p_or_larger) { sf->partition_search_breakout_thr.dist = (1 << 26); } else { sf->partition_search_breakout_thr.dist = (1 << 24); @@ -166,28 +214,41 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !frame_is_boosted(cpi); - sf->use_square_only_threshold = BLOCK_16X16; + sf->use_square_partition_only = !boosted; + sf->prune_ref_frame_for_rect_partitions = 1; + sf->rd_ml_partition.var_pruning = 1; + + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = 350; + sf->rd_ml_partition.prune_rect_thresh[2] = 325; + sf->rd_ml_partition.prune_rect_thresh[3] = 250; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { sf->exhaustive_searches_thresh = (1 << 22); - for (i = 0; i < MAX_MESH_STEP; ++i) { - int mesh_density_level = 0; - sf->mesh_patterns[i].range = - good_quality_mesh_patterns[mesh_density_level][i].range; - sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[mesh_density_level][i].interval; - } } else { sf->exhaustive_searches_thresh = INT_MAX; } + for (i = 0; i < MAX_MESH_STEP; ++i) { + const int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + if (speed >= 1) { + sf->temporal_filter_search_method = NSTEP; + sf->rd_ml_partition.var_pruning = !boosted; + sf->rd_ml_partition.prune_rect_thresh[1] = 225; + sf->rd_ml_partition.prune_rect_thresh[2] = 225; + sf->rd_ml_partition.prune_rect_thresh[3] = 225; + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); + sf->use_square_partition_only = !boosted; } else { sf->use_square_partition_only = !frame_is_intra_only(cm); } @@ -199,23 +260,22 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? 
speed : 5]; - - sf->use_square_only_threshold = BLOCK_4X4; sf->less_rectangular_check = 1; - sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->mv.subpel_iters_per_step = 1; - sf->mode_skip_start = 10; + sf->mv.subpel_search_level = 1; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; - sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; - sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + if (cpi->oxcf.content != VP9E_CONTENT_FILM) { + sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + } sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 30; @@ -223,9 +283,11 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 23) : INT_MAX; + sf->use_accurate_subpel_search = USE_4_TAPS; } if (speed >= 2) { + sf->rd_ml_partition.var_pruning = 0; if (oxcf->vbr_corpus_complexity) sf->recode_loop = ALLOW_RECODE_FIRST; else @@ -247,6 +309,12 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + sf->enhanced_full_pixel_motion_search = 0; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->mv.subpel_search_level = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -257,6 +325,8 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, good_quality_mesh_patterns[mesh_density_level][i].interval; } } + + sf->use_accurate_subpel_search = USE_2_TAPS; } if (speed >= 3) { @@ -316,6 +386,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->simple_model_rd_from_var = 1; } } +#endif // !CONFIG_REALTIME_ONLY static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, @@ -358,6 +429,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, static void set_rt_speed_feature_framesize_independent( VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; const int is_keyframe = cm->frame_type == KEY_FRAME; const int frames_since_key = is_keyframe ? 0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; @@ -374,6 +446,16 @@ static void set_rt_speed_feature_framesize_independent( sf->use_compound_nonrd_pickmode = 0; sf->nonrd_keyframe = 0; sf->svc_use_lowres_part = 0; + sf->overshoot_detection_cbr_rt = NO_DETECTION; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; + sf->enable_tpl_model = 0; + sf->enhanced_full_pixel_motion_search = 0; + sf->use_accurate_subpel_search = USE_2_TAPS; + sf->nonrd_use_ml_partition = 0; + sf->variance_part_thresh_mult = 1; + sf->cb_pred_filter_search = 0; + sf->force_smooth_interpol = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -407,7 +489,7 @@ static void set_rt_speed_feature_framesize_independent( // Reference masking only enabled for 1 spatial layer, and if none of the // references have been scaled. 
The latter condition needs to be checked // for external or internal dynamic resize. - sf->reference_masking = (cpi->svc.number_spatial_layers == 1); + sf->reference_masking = (svc->number_spatial_layers == 1); if (sf->reference_masking == 1 && (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { @@ -440,7 +522,7 @@ static void set_rt_speed_feature_framesize_independent( sf->disable_filter_search_var_thresh = 100; sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; - sf->mv.subpel_iters_per_step = 1; + sf->mv.subpel_search_level = 0; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; sf->allow_skip_recode = 0; @@ -460,7 +542,7 @@ static void set_rt_speed_feature_framesize_independent( sf->adjust_partitioning_from_last_frame = cm->last_frame_type != cm->frame_type || (0 == (frames_since_key + 1) % sf->last_partitioning_redo_frequency); - sf->mv.subpel_force_stop = 1; + sf->mv.subpel_force_stop = QUARTER_PEL; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; sf->intra_uv_mode_mask[i] = INTRA_DC; @@ -513,7 +595,10 @@ static void set_rt_speed_feature_framesize_independent( int i; if (content == VP9E_CONTENT_SCREEN) { for (i = 0; i < BLOCK_SIZES; ++i) - sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; + if (i >= BLOCK_32X32) + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_H_V; + else + sf->intra_y_mode_bsize_mask[i] = INTRA_DC_TM_H_V; } else { for (i = 0; i < BLOCK_SIZES; ++i) if (i > BLOCK_16X16) @@ -531,6 +616,23 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 1; if (!cpi->use_svc) sf->bias_golden = 1; } + // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent + // increase in encoding time. + if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1; + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR) { + if (cm->width * cm->height <= 352 * 288 && !cpi->use_svc && + cpi->oxcf.content != VP9E_CONTENT_SCREEN) + sf->overshoot_detection_cbr_rt = RE_ENCODE_MAXQ; + else + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; + } + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1; } if (speed >= 6) { @@ -539,8 +641,6 @@ static void set_rt_speed_feature_framesize_independent( sf->use_compound_nonrd_pickmode = 1; } sf->partition_search_type = VAR_BASED_PARTITION; - // Turn on this to use non-RD key frame coding mode. - sf->use_nonrd_pick_mode = 1; sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; @@ -553,7 +653,7 @@ static void set_rt_speed_feature_framesize_independent( (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + svc->spatial_layer_id == svc->number_spatial_layers - 1)) { cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); } @@ -562,11 +662,14 @@ static void set_rt_speed_feature_framesize_independent( // Enable short circuit for low temporal variance. 
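The change from sf->mv.subpel_force_stop = 1 to QUARTER_PEL just above swaps magic numbers for the SUBPEL_FORCE_STOP enum declared later in vp9_speed_features.h. A minimal sketch of the precision each named level permits, expressed in 1/8-pel units; the helper is illustrative.

typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP;

/* Finest allowed MV step, in 1/8-pel units, for a given stop level. */
static int finest_mv_step(SUBPEL_FORCE_STOP stop) {
  switch (stop) {
    case EIGHTH_PEL: return 1;  /* full subpel refinement */
    case QUARTER_PEL: return 2; /* stop at 1/4-pel */
    case HALF_PEL: return 4;    /* stop at 1/2-pel */
    default: return 8;          /* FULL_PEL: integer MVs only */
  }
}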
sf->short_circuit_low_temp_var = 1; } - if (cpi->svc.temporal_layer_id > 0) { + if (svc->temporal_layer_id > 0) { sf->adaptive_rd_thresh = 4; sf->limit_newmv_early_exit = 0; sf->base_mv_aggressive = 1; } + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR) + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; } if (speed >= 7) { @@ -576,16 +679,15 @@ static void set_rt_speed_feature_framesize_independent( sf->mv.fullpel_search_step_param = 10; // For SVC: use better mv search on base temporal layer, and only // on base spatial layer if highest resolution is above 640x360. - if (cpi->svc.number_temporal_layers > 2 && - cpi->svc.temporal_layer_id == 0 && - (cpi->svc.spatial_layer_id == 0 || + if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 && + (svc->spatial_layer_id == 0 || cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { sf->mv.search_method = NSTEP; sf->mv.fullpel_search_step_param = 6; } - if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) { + if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) { sf->use_simple_block_yrd = 1; - if (cpi->svc.non_reference_frame) + if (svc->non_reference_frame) sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; } if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) @@ -596,22 +698,30 @@ static void set_rt_speed_feature_framesize_independent( if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && !cpi->external_resize && (!cpi->use_svc || - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + (svc->spatial_layer_id == svc->number_spatial_layers - 1 && + !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) { sf->copy_partition_flag = 1; cpi->max_copied_frame = 2; // The top temporal enhancement layer (for number of temporal layers > 1) // are non-reference frames, so use large/max value for max_copied_frame. - if (cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + if (svc->number_temporal_layers > 1 && + svc->temporal_layer_id == svc->number_temporal_layers - 1) cpi->max_copied_frame = 255; } // For SVC: enable use of lower resolution partition for higher resolution, // only for 3 spatial layers and when config/top resolution is above VGA. // Enable only for non-base temporal layer frames. - if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && - cpi->svc.temporal_layer_id > 0 && + if (cpi->use_svc && svc->use_partition_reuse && + svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 && cpi->oxcf.width * cpi->oxcf.height > 640 * 480) sf->svc_use_lowres_part = 1; + // For SVC when golden is used as second temporal reference: to avoid + // encode time increase only use this feature on base temporal layer. + // (i.e remove golden flag from frame_flags for temporal_layer_id > 0). + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id > 0) + cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1; } if (speed >= 8) { @@ -621,9 +731,15 @@ static void set_rt_speed_feature_framesize_independent( if (!cpi->use_svc) cpi->max_copied_frame = 4; if (cpi->row_mt && cpi->oxcf.max_threads > 1) sf->adaptive_rd_thresh_row_mt = 1; - - if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; - if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; + // Enable ML based partition for low res. 
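On the overshoot_detection_cbr_rt selection in the hunk above: small, single-layer, non-screen CBR streams can afford the two-pass re-encode probe, while everything else takes the cheaper one-pass max-Q reaction. A minimal sketch of that policy; the enum values mirror OVERSHOOT_DETECTION_CBR_RT from vp9_speed_features.h, the parameters are illustrative.

typedef enum { NO_DETECTION, FAST_DETECTION_MAXQ, RE_ENCODE_MAXQ } OVERSHOOT_MODE;

static OVERSHOOT_MODE pick_overshoot_mode(int width, int height, int use_svc,
                                          int is_screen_content) {
  if (width * height <= 352 * 288 && !use_svc && !is_screen_content)
    return RE_ENCODE_MAXQ;    /* slower, but detects overshoot accurately */
  return FAST_DETECTION_MAXQ; /* single-pass reaction on larger streams */
}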
+ if (!frame_is_intra_only(cm) && cm->width * cm->height <= 352 * 288) { + sf->nonrd_use_ml_partition = 1; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->nonrd_use_ml_partition = 0; +#endif + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = HALF_PEL; // Only keep INTRA_DC mode for speed 8. if (!is_keyframe) { int i = 0; @@ -651,7 +767,27 @@ static void set_rt_speed_feature_framesize_independent( } sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; + if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1; } + + if (speed >= 9) { + sf->cb_pred_filter_search = 1; + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; + sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + if (cpi->rc.avg_frame_low_motion < 70) sf->default_interp_filter = BILINEAR; + if (cm->width * cm->height >= 640 * 360) sf->variance_part_thresh_mult = 2; + } + + if (sf->nonrd_use_ml_partition) + sf->partition_search_type = ML_BASED_PARTITION; + if (sf->use_altref_onepass) { if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { sf->partition_search_type = FIXED_PARTITION; @@ -666,9 +802,26 @@ static void set_rt_speed_feature_framesize_independent( (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage)); } + if (svc->previous_frame_is_intra_only) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + // Special case for screen content: increase motion search on base spatial + // layer when high motion is detected or previous SL0 frame was dropped. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 && + (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) { + sf->mv.search_method = NSTEP; + // TODO(marpan/jianj): Tune this setting for screensharing. For now use + // small step_param for all spatial layers. + sf->mv.fullpel_search_step_param = 2; + } + // TODO(marpan): There is regression for aq-mode=3 speed <= 4, force it + // off for now. + if (speed <= 4 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) + cpi->oxcf.aq_mode = 0; } -void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { +void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RD_OPT *const rd = &cpi->rd; @@ -678,13 +831,15 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { // Some speed-up features even for best quality as minimal impact on quality. 
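The speed-9 block above wires up adaptive subpel termination: a full-pel MV below mv_thresh earns quarter-pel refinement, anything at or above it stops at half-pel. A minimal sketch with an illustrative absolute-sum magnitude test (the source may measure the MV differently); the fields follow ADAPT_SUBPEL_FORCE_STOP declared in vp9_speed_features.h.

/* Same SUBPEL_FORCE_STOP levels as in the earlier sketch. */
typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP;

static SUBPEL_FORCE_STOP adapt_force_stop(int mv_row, int mv_col) {
  const int mv_thresh = 1; /* value the speed-9 block sets above */
  const int mag = (mv_row < 0 ? -mv_row : mv_row) +
                  (mv_col < 0 ? -mv_col : mv_col);
  /* force_stop_below / force_stop_above from ADAPT_SUBPEL_FORCE_STOP. */
  return mag < mv_thresh ? QUARTER_PEL : HALF_PEL;
}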
sf->partition_search_breakout_thr.dist = (1 << 19); sf->partition_search_breakout_thr.rate = 80; - sf->ml_partition_search_early_termination = 0; + sf->rd_ml_partition.search_early_termination = 0; + sf->rd_ml_partition.search_breakout = 0; - if (oxcf->mode == REALTIME) { - set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } else if (oxcf->mode == GOOD) { - set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); - } + if (oxcf->mode == REALTIME) + set_rt_speed_feature_framesize_dependent(cpi, sf, speed); +#if !CONFIG_REALTIME_ONLY + else if (oxcf->mode == GOOD) + set_good_speed_feature_framesize_dependent(cpi, sf, speed); +#endif if (sf->disable_split_mask == DISABLE_ALL_SPLIT) { sf->adaptive_pred_interp_filter = 0; @@ -710,17 +865,13 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; - - // This is only used in motion vector unit test. - if (cpi->oxcf.motion_vector_unit_test == 1) - cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; - else if (cpi->oxcf.motion_vector_unit_test == 2) - cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } -void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { +void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { SPEED_FEATURES *const sf = &cpi->sf; +#if !CONFIG_REALTIME_ONLY VP9_COMMON *const cm = &cpi->common; +#endif MACROBLOCK *const x = &cpi->td.mb; const VP9EncoderConfig *const oxcf = &cpi->oxcf; int i; @@ -730,8 +881,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->mv.search_method = NSTEP; sf->recode_loop = ALLOW_RECODE_FIRST; sf->mv.subpel_search_method = SUBPEL_TREE; - sf->mv.subpel_iters_per_step = 2; - sf->mv.subpel_force_stop = 0; + sf->mv.subpel_search_level = 2; + sf->mv.subpel_force_stop = EIGHTH_PEL; sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); sf->mv.reduce_first_step_size = 0; sf->coeff_prob_appx_step = 1; @@ -741,6 +892,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; + sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; @@ -752,7 +904,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; - sf->use_square_only_threshold = BLOCK_SIZES; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_64X64; @@ -771,6 +924,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; sf->allow_acl = 1; + sf->enable_tpl_model = oxcf->enable_tpl_model; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->temporal_filter_search_method = MESH; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -804,10 +960,17 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->limit_newmv_early_exit = 0; sf->bias_golden = 0; sf->base_mv_aggressive = 0; + sf->rd_ml_partition.prune_rect_thresh[0] = -1; + sf->rd_ml_partition.prune_rect_thresh[1] = -1; + 
sf->rd_ml_partition.prune_rect_thresh[2] = -1; + sf->rd_ml_partition.prune_rect_thresh[3] = -1; + sf->rd_ml_partition.var_pruning = 0; + sf->use_accurate_subpel_search = USE_8_TAPS; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; + sf->tx_size_search_depth = 2; sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) @@ -820,10 +983,11 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { } if (oxcf->mode == REALTIME) - set_rt_speed_feature_framesize_independent(cpi, sf, oxcf->speed, - oxcf->content); + set_rt_speed_feature_framesize_independent(cpi, sf, speed, oxcf->content); +#if !CONFIG_REALTIME_ONLY else if (oxcf->mode == GOOD) - set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed); + set_good_speed_feature_framesize_independent(cpi, cm, sf, speed); +#endif cpi->diamond_search_sad = vp9_diamond_search_sad; @@ -837,7 +1001,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - if (sf->mv.subpel_force_stop == 3) { + if (sf->mv.subpel_force_stop == FULL_PEL) { // Whole pel only cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree; } else if (sf->mv.subpel_search_method == SUBPEL_TREE) { @@ -850,6 +1014,12 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore; } + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; + x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; x->min_partition_size = sf->default_min_partition_size; @@ -867,10 +1037,4 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; - - // This is only used in motion vector unit test. - if (cpi->oxcf.motion_vector_unit_test == 1) - cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; - else if (cpi->oxcf.motion_vector_unit_test == 2) - cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } diff --git a/libs/libvpx/vp9/encoder/vp9_speed_features.h b/libs/libvpx/vp9/encoder/vp9_speed_features.h index 50d52bc23a4..eb06281990a 100644 --- a/libs/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libs/libvpx/vp9/encoder/vp9_speed_features.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_ -#define VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ #include "vp9/common/vp9_enums.h" @@ -57,7 +57,8 @@ typedef enum { BIGDIA = 3, SQUARE = 4, FAST_HEX = 5, - FAST_DIAMOND = 6 + FAST_DIAMOND = 6, + MESH = 7 } SEARCH_METHODS; typedef enum { @@ -135,20 +136,23 @@ typedef enum { } INTERP_FILTER_MASK; typedef enum { - // Search partitions using RD/NONRD criterion + // Search partitions using RD/NONRD criterion. SEARCH_PARTITION, - // Always use a fixed size partition + // Always use a fixed size partition. FIXED_PARTITION, REFERENCE_PARTITION, // Use an arbitrary partitioning scheme based on source variance within - // a 64X64 SB + // a 64X64 SB. 
VAR_BASED_PARTITION, - // Use non-fixed partitions based on source variance - SOURCE_VAR_BASED_PARTITION + // Use non-fixed partitions based on source variance. + SOURCE_VAR_BASED_PARTITION, + + // Make partition decisions with machine learning models. + ML_BASED_PARTITION } PARTITION_SEARCH_TYPE; typedef enum { @@ -161,6 +165,19 @@ typedef enum { ONE_LOOP_REDUCED = 1 } FAST_COEFF_UPDATE; +typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP; + +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for full pixel motion vector; + int mv_thresh; + + // subpel_force_stop if full pixel MV is below the threshold. + SUBPEL_FORCE_STOP force_stop_below; + + // subpel_force_stop if full pixel MV is equal to or above the threshold. + SUBPEL_FORCE_STOP force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -179,15 +196,17 @@ typedef struct MV_SPEED_FEATURES { // the same process. Along the way it skips many diagonals. SUBPEL_SEARCH_METHODS subpel_search_method; - // Maximum number of steps in logarithmic subpel search before giving up. - int subpel_iters_per_step; + // Subpel MV search level. Can take values 0 - 2. Higher values mean more + // extensive subpel search. + int subpel_search_level; + + // When to stop subpel motion search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // If it's enabled, different subpel_force_stop will be used for different MV. + int enable_adaptive_subpel_force_stop; - // Control when to stop subpel search: - // 0: Full subpel search. - // 1: Stop at quarter pixel. - // 2: Stop at half pixel. - // 3: Stop at full pixel. - int subpel_force_stop; + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; @@ -205,6 +224,28 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; +typedef enum { + // No reaction to rate control on a detected slide/scene change. + NO_DETECTION = 0, + + // Set to larger Q (max_q set by user) based only on the + // detected slide/scene change and current/past Q. + FAST_DETECTION_MAXQ = 1, + + // Based on (first pass) encoded frame, if large frame size is detected + // then set to higher Q for the second re-encode. This involves 2 pass + // encoding on slide change, so slower than 1, but more accurate for + // detecting overshoot. + RE_ENCODE_MAXQ = 2 +} OVERSHOOT_DETECTION_CBR_RT; + +typedef enum { + USE_2_TAPS = 0, + USE_4_TAPS, + USE_8_TAPS, + USE_8_TAPS_SHARP, +} SUBPEL_SEARCH_TYPE; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -258,6 +299,9 @@ typedef struct SPEED_FEATURES { // alternate reference frames. int allow_acl; + // Temporal dependency model based encoding mode optimization + int enable_tpl_model; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -272,6 +316,9 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less // precise but significantly faster than the non lp version. int use_lp32x32fdct; @@ -293,9 +340,14 @@ typedef struct SPEED_FEATURES { // rd than partition type split. 
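On the SUBPEL_SEARCH_TYPE values defined below this enum block: the good-quality path trades interpolation-filter taps for speed, matching the assignments made earlier in this file (USE_8_TAPS at speed 0, USE_4_TAPS at speed 1, USE_2_TAPS from speed 2 up). A minimal sketch of that progression; the helper is illustrative.

typedef enum { USE_2_TAPS, USE_4_TAPS, USE_8_TAPS, USE_8_TAPS_SHARP } SUBPEL_SEARCH_TYPE;

static SUBPEL_SEARCH_TYPE subpel_taps_for_speed(int speed) {
  if (speed >= 2) return USE_2_TAPS; /* fastest, least accurate positions */
  if (speed == 1) return USE_4_TAPS;
  return USE_8_TAPS;                 /* speed 0: full accuracy */
}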
int less_rectangular_check; - // Disable testing non square partitions. (eg 16x32) + // Disable testing non square partitions (eg 16x32) for block sizes larger than + // use_square_only_thresh_high or smaller than use_square_only_thresh_low. int use_square_partition_only; - BLOCK_SIZE use_square_only_threshold; + BLOCK_SIZE use_square_only_thresh_high; + BLOCK_SIZE use_square_only_thresh_low; + + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. @@ -327,6 +379,9 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; + // Do extra full pixel motion search to obtain a better motion vector. + int enhanced_full_pixel_motion_search; + // Threshold for allowing exhaustive motion search. int exhaustive_searches_thresh; @@ -448,8 +503,27 @@ typedef struct SPEED_FEATURES { // Partition search early breakout thresholds. PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; - // Machine-learning based partition search early termination - int ml_partition_search_early_termination; + struct { + // Use ML-based partition search early breakout. + int search_breakout; + // Higher values mean more aggressiveness for partition search breakout that + // results in better encoding speed but worse compression performance. + float search_breakout_thresh[3]; + + // Machine-learning based partition search early termination + int search_early_termination; + + // Machine-learning based partition search pruning using prediction residue + // variance. + int var_pruning; + + // Threshold values used for ML based rectangular partition search pruning. + // If < 0, the feature is turned off. + // Higher values mean more aggressiveness to skip rectangular partition + // search that results in better encoding speed but worse coding + // performance. + int prune_rect_thresh[4]; + } rd_ml_partition; // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -508,15 +582,43 @@ typedef struct SPEED_FEATURES { // For SVC: enables use of partition from lower spatial resolution. int svc_use_lowres_part; + + // Flag to indicate process for handling overshoot on slide/scene change, + // for real-time CBR mode. + OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling golden reference. + int disable_golden_ref; + + // Allow sub-pixel search to use interpolation filters with different taps in + // order to achieve accurate motion search results. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; + + // Search method used by temporal filtering in full_pixel_motion_search. + SEARCH_METHODS temporal_filter_search_method; + + // Use machine learning based partition search. + int nonrd_use_ml_partition; + + // Multiplier for base threshold for variance partitioning. + int variance_part_thresh_mult; + + // Force subpel motion filter to always use SMOOTH_FILTER.
+ int force_smooth_interpol; } SPEED_FEATURES; struct VP9_COMP; -void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi); -void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi); +void vp9_set_speed_features_framesize_independent(struct VP9_COMP *cpi, + int speed); +void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi, + int speed); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#endif // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_subexp.c b/libs/libvpx/vp9/encoder/vp9_subexp.c index e8212ce05e1..19bbd5373fd 100644 --- a/libs/libvpx/vp9/encoder/vp9_subexp.c +++ b/libs/libvpx/vp9/encoder/vp9_subexp.c @@ -71,6 +71,7 @@ static int remap_prob(int v, int m) { else i = recenter_nonneg(MAX_PROB - 1 - v, MAX_PROB - 1 - m) - 1; + assert(i >= 0 && (size_t)i < sizeof(map_table)); i = map_table[i]; return i; } diff --git a/libs/libvpx/vp9/encoder/vp9_subexp.h b/libs/libvpx/vp9/encoder/vp9_subexp.h index 26c89e2ea78..f0d544b5270 100644 --- a/libs/libvpx/vp9/encoder/vp9_subexp.h +++ b/libs/libvpx/vp9/encoder/vp9_subexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SUBEXP_H_ -#define VP9_ENCODER_VP9_SUBEXP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_ +#define VPX_VP9_ENCODER_VP9_SUBEXP_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SUBEXP_H_ +#endif // VPX_VP9_ENCODER_VP9_SUBEXP_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2636bd9a580..8ba113bf3e7 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -19,6 +19,14 @@ #define SMALL_FRAME_WIDTH 32 #define SMALL_FRAME_HEIGHT 16 +static void swap_ptr(void *a, void *b) { + void **a_p = (void **)a; + void **b_p = (void **)b; + void *c = *a_p; + *a_p = *b_p; + *b_p = c; +} + void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -29,24 +37,50 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; - svc->first_spatial_layer_to_encode = 0; - svc->rc_drop_superframe = 0; svc->force_zero_mode_spatial_ref = 0; svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + svc->use_gf_temporal_ref = 1; + svc->use_gf_temporal_ref_current_layer = 0; svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; svc->non_reference_frame = 0; - - for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; + svc->skip_enhancement_layer = 0; + svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; + svc->framedrop_mode = CONSTRAINED_LAYER_DROP; + svc->set_intra_only_frame = 0; + svc->previous_frame_is_intra_only = 0; + svc->superframe_has_layer_sync = 0; + svc->use_set_ref_frame_config = 0; + svc->num_encoded_top_layer = 0; + svc->simulcast_mode = 0; + + for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = -1; + svc->fb_idx_temporal_layer_id[i] = -1; + svc->fb_idx_base[i] = 0; + } for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + svc->last_layer_dropped[sl] = 0; + svc->drop_spatial_layer[sl] = 0; svc->ext_frame_flags[sl] = 0; - svc->ext_lst_fb_idx[sl] = 0; - svc->ext_gld_fb_idx[sl] = 1; - svc->ext_alt_fb_idx[sl] = 2; - svc->downsample_filter_type[sl] = EIGHTTAP; - 
svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. + svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; + svc->spatial_layer_sync[sl] = 0; } + svc->max_consec_drop = INT_MAX; + + svc->buffer_gf_temporal_ref[1].idx = 7; + svc->buffer_gf_temporal_ref[0].idx = 6; + svc->buffer_gf_temporal_ref[1].is_used = 0; + svc->buffer_gf_temporal_ref[0].is_used = 0; if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, @@ -84,6 +118,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->ni_frames = 0; lrc->decimation_count = 0; lrc->decimation_factor = 0; + lrc->worst_quality = oxcf->worst_allowed_q; + lrc->best_quality = oxcf->best_allowed_q; for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { lrc->rate_correction_factors[i] = 1.0; @@ -122,6 +158,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { size_t consec_zero_mv_size; VP9_COMMON *const cm = &cpi->common; lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; CHECK_MEM_ERROR(cm, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); @@ -154,6 +193,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { @@ -290,6 +331,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = get_layer_context(cpi); const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; + const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop; cpi->rc = lc->rc; cpi->twopass = lc->twopass; @@ -303,26 +345,23 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || - (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) { + cpi->svc.number_spatial_layers > 1) { cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; } - + cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. 
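On the buffer_gf_temporal_ref initialization in the hunk above: frame-buffer slots 6 and 7 are reserved for the long-term golden temporal reference, and later marked used whenever a layer's regular references already occupy them (the is_used checks appear further down in vp9_one_pass_cbr_svc_start_layer()). A minimal sketch of that bookkeeping; the struct and function names are illustrative.

struct gf_ref_slot { int idx; int is_used; };

static void mark_gf_slots(struct gf_ref_slot slots[2], int lst_fb_idx,
                          int gld_fb_idx, int alt_fb_idx) {
  int i;
  for (i = 0; i < 2; ++i) {
    /* A reserved slot is unavailable if any active reference maps to it. */
    slots[i].is_used = (lst_fb_idx == slots[i].idx ||
                        gld_fb_idx == slots[i].idx ||
                        alt_fb_idx == slots[i].idx);
  }
}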
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - signed char *temp = cr->map; - uint8_t *temp2 = cr->last_coded_q_map; - uint8_t *temp3 = cpi->consec_zero_mv; - cr->map = lc->map; - lc->map = temp; - cr->last_coded_q_map = lc->last_coded_q_map; - lc->last_coded_q_map = temp2; - cpi->consec_zero_mv = lc->consec_zero_mv; - lc->consec_zero_mv = temp3; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv); cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; } } @@ -350,6 +389,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->consec_zero_mv = cpi->consec_zero_mv; cpi->consec_zero_mv = temp3; lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; } } @@ -381,15 +423,6 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { ++cpi->svc.current_superframe; } -int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { - return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame; -} - void get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out) { @@ -408,6 +441,51 @@ void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +static void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + +// Never refresh any reference frame buffers on top temporal layers in +// simulcast mode, which has interlayer prediction disabled. 
+static void non_reference_frame_simulcast(VP9_COMP *const cpi) { + if (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1 && + cpi->svc.temporal_layer_id > 0) { + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + } +} + // The function sets proper ref_frame_flags, buffer indices, and buffer update // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering // scheme. @@ -511,6 +589,10 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -546,6 +628,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { if (!spatial_id) { cpi->ref_frame_flags = VP9_LAST_FLAG; } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; } } @@ -568,6 +652,10 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -600,54 +688,174 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( } else { cpi->gld_fb_idx = 0; } + + if (cpi->svc.simulcast_mode) non_reference_frame_simulcast(cpi); + + reset_fb_idx_unused(cpi); +} + +static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( + VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; + cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG; + if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG; + if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG; +} + +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to the + // update_buffer_slot, this is needed for the GET_SVC_REF_FRAME_CONFIG api. 
+ if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + int ref; + for (ref = 0; ref < REF_FRAMES; ++ref) { + svc->update_buffer_slot[sl] &= ~(1 << ref); + if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) || + (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) || + (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame)) + svc->update_buffer_slot[sl] |= (1 << ref); + } + } + + // TODO(jianj): Remove these 3, deprecated. + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; + + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; + SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; - if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; - cpi->svc.force_zero_mode_spatial_ref = 1; - cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; + svc->skip_enhancement_layer = 0; + + if (svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF && + svc->number_spatial_layers > 1 && svc->number_spatial_layers <= 3 && + svc->number_temporal_layers <= 3 && + !(svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config)) + svc->simulcast_mode = 1; + else + svc->simulcast_mode = 0; - if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + if (svc->number_spatial_layers > 1) { + svc->use_base_mv = 1; + svc->use_partition_reuse = 1; + } + svc->force_zero_mode_spatial_ref = 1; + svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride; + svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows; + svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols; + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); - } else if (cpi->svc.temporal_layering_mode == + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_0101) { + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) { set_flags_and_fb_idx_for_temporal_mode2(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - // In the BYPASS/flexible mode, the encoder is relying on the application - // to specify, for each spatial layer, the flags and buffer indices for the - // layering. - // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is - // needed to support the case where the frame flags may be passed in via - // vpx_codec_encode(), which can be used for the temporal-only svc case. - // TODO(marpan): Consider adding an enc_config parameter to better handle - // this case. 
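#if 0
/* Aside (guarded out of the build): a standalone sketch, with hypothetical
 * names, of how the update_buffer_slot bitmask filled in
 * vp9_copy_flags_ref_update_idx() above encodes refreshes: bit i is set iff
 * reference buffer slot i is refreshed by the current layer frame. */
static int pack_update_slots(int lst_idx, int gld_idx, int alt_idx,
                             int refresh_last, int refresh_golden,
                             int refresh_alt) {
  int mask = 0, ref;
  for (ref = 0; ref < 8 /* REF_FRAMES */; ++ref) {
    if ((ref == lst_idx && refresh_last) ||
        (ref == gld_idx && refresh_golden) || (ref == alt_idx && refresh_alt))
      mask |= (1 << ref);
  }
  return mask;
}
/* pack_update_slots(0, 3, 7, 1, 0, 1) == 0x81: slots 0 and 7 refreshed. */
#endif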
- if (cpi->ext_refresh_frame_flags_pending == 0) { - int sl; - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - sl = cpi->svc.spatial_layer_id; - vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]); - cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl]; - cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl]; - cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl]; - } - } - - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) - cpi->svc.rc_drop_superframe = 0; - - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi); + } + + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx) + svc->buffer_gf_temporal_ref[0].is_used = 1; + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx) + svc->buffer_gf_temporal_ref[1].is_used = 1; + + // For the fixed (non-flexible/bypass) SVC mode: + // If long term temporal reference is enabled at the sequence level + // (use_gf_temporal_ref == 1), and inter_layer is disabled (on inter-frames), + // we can use golden as a second temporal reference + // (since the spatial/inter-layer reference is disabled). + // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx) is + // unused (slot 7 and 6 should be available for 3-3 layer system). + // For now usage of this second temporal reference will only be used for + // highest and next to highest spatial layer (i.e., top and middle layer for + // 3 spatial layers). + svc->use_gf_temporal_ref_current_layer = 0; + if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used && + !svc->buffer_gf_temporal_ref[1].is_used && + svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON && + svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 && + svc->spatial_layer_id >= svc->number_spatial_layers - 2) { + // Enable the second (long-term) temporal reference at the frame-level. + svc->use_gf_temporal_ref_current_layer = 1; + } + + // Check if current superframe has any layer sync, only check once on + // base layer. + if (svc->spatial_layer_id == 0) { + int sl = 0; + // Default is no sync. + svc->superframe_has_layer_sync = 0; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1; + } + } + + // Reset the drop flags for all spatial layers, on the base layer. + if (svc->spatial_layer_id == 0) { + vp9_zero(svc->drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. 
+    if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx));
+      memset(&svc->gld_fb_idx, -1, sizeof(svc->gld_fb_idx));
+      memset(&svc->alt_fb_idx, -1, sizeof(svc->alt_fb_idx));
+      // These are set by API before the superframe is encoded and they are
+      // passed to encoder layer by layer. Don't reset them on layer 0 in bypass
+      // mode.
+      vp9_zero(svc->update_buffer_slot);
+      vp9_zero(svc->reference_last);
+      vp9_zero(svc->reference_golden);
+      vp9_zero(svc->reference_altref);
+      // TODO(jianj): Remove these 3, deprecated.
+      vp9_zero(svc->update_last);
+      vp9_zero(svc->update_golden);
+      vp9_zero(svc->update_altref);
+    }
+  }
+
+  lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+                           svc->temporal_layer_id];
 
   // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS,
   // only for non-BYPASS mode for now.
-  if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS ||
+      svc->use_set_ref_frame_config) {
     RATE_CONTROL *const lrc = &lc->rc;
     lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
     lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q);
@@ -657,157 +865,76 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
                        lc->scaling_factor_num, lc->scaling_factor_den, &width,
                        &height);
-  // For resolutions <= VGA: set phase of the filter = 8 (for symmetric
-  // averaging filter), use bilinear for now.
-  if (width * height <= 640 * 480) {
-    cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR;
-    cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8;
-  }
-
-  // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use
-  // of base motion vectors if spatial scale factors for any layers are not 2,
+  // Use EIGHTTAP_SMOOTH for low resolutions.
+  if (width * height <= 320 * 240)
+    svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH;
+  // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel
+  // to source pixel).
+  lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers +
+                           svc->temporal_layer_id];
+  if (lc->scaling_factor_num > (3 * lc->scaling_factor_den) >> 2)
+    svc->downsample_filter_phase[svc->spatial_layer_id] = 0;
+
+  // The usage of use_base_mv or partition_reuse assumes down-scale of 2x2.
+  // For now, turn off use of base motion vectors and partition reuse if the
+  // spatial scale factors for any layers are not 2,
   // keep the case of 3 spatial layers with scale factor of 4x4 for base layer.
   // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2.
-  if (cpi->svc.number_spatial_layers > 1) {
+  if (svc->number_spatial_layers > 1) {
     int sl;
-    for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) {
-      lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers +
-                                   cpi->svc.temporal_layer_id];
+    for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) {
+      lc = &svc->layer_context[sl * svc->number_temporal_layers +
+                               svc->temporal_layer_id];
       if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) &&
           !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 &&
-            cpi->svc.number_spatial_layers == 3)) {
-        cpi->svc.use_base_mv = 0;
+            svc->number_spatial_layers == 3)) {
+        svc->use_base_mv = 0;
+        svc->use_partition_reuse = 0;
        break;
      }
    }
+    // For non-zero spatial layers: if the previous spatial layer was dropped
+    // disable the base_mv and partition_reuse features.
+ if (svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + } } - cpi->svc.non_reference_frame = 0; + svc->non_reference_frame = 0; if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && - !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { - cpi->svc.non_reference_frame = 1; - } - - if (vp9_set_size_literal(cpi, width, height) != 0) - return VPX_CODEC_INVALID_PARAM; - - return 0; -} - -#if CONFIG_SPATIAL_SVC -#define SMALL_FRAME_FB_IDX 7 - -int vp9_svc_start_frame(VP9_COMP *const cpi) { - int width = 0, height = 0; - LAYER_CONTEXT *lc; - struct lookahead_entry *buf; - int count = 1 << (cpi->svc.number_temporal_layers - 1); - - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - - cpi->svc.temporal_layer_id = 0; - while ((lc->current_video_frame_in_layer % count) != 0) { - ++cpi->svc.temporal_layer_id; - count >>= 1; + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) + svc->non_reference_frame = 1; + // For non-flexible mode, where update_buffer_slot is used, need to check if + // all buffer slots are not refreshed. + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) + svc->non_reference_frame = 0; } - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - - if (cpi->svc.spatial_layer_id == 0) - cpi->gld_fb_idx = - (lc->gold_ref_idx >= 0) ? lc->gold_ref_idx : cpi->lst_fb_idx; - else - cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; - - if (lc->current_video_frame_in_layer == 0) { - if (cpi->svc.spatial_layer_id >= 2) { - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - } else { - cpi->alt_fb_idx = cpi->lst_fb_idx; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); - } - } else { - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) { - cpi->alt_fb_idx = lc->alt_ref_idx; - if (!lc->has_alt_frame) cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else { - // Find a proper alt_fb_idx for layers that don't have alt ref frame - if (cpi->svc.spatial_layer_id == 0) { - cpi->alt_fb_idx = cpi->lst_fb_idx; - } else { - LAYER_CONTEXT *lc_lower = - &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1]; - - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] && - lc_lower->alt_ref_source != NULL) - cpi->alt_fb_idx = lc_lower->alt_ref_idx; - else if (cpi->svc.spatial_layer_id >= 2) - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - else - cpi->alt_fb_idx = cpi->lst_fb_idx; - } - } + if (svc->spatial_layer_id == 0) { + svc->high_source_sad_superframe = 0; + svc->high_num_blocks_with_motion = 0; } - get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, - lc->scaling_factor_num, lc->scaling_factor_den, &width, - &height); - - // Workaround for multiple frame contexts. In some frames we can't use prev_mi - // since its previous frame could be changed during decoding time. The idea is - // we put a empty invisible frame in front of them, then we will not use - // prev_mi when encoding these frames. 
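#if 0
/* Aside (guarded out of the build): the removed loop above derives the
 * temporal layer id from the frame index within the layer; for three temporal
 * layers it reproduces the 0-2-1-2 pattern. Standalone sketch: */
static int temporal_layer_of(int frame, int num_temporal_layers) {
  int count = 1 << (num_temporal_layers - 1);
  int tl = 0;
  while ((frame % count) != 0) {
    ++tl;
    count >>= 1;
  }
  return tl;
}
/* frames 0..7 with 3 temporal layers -> 0 2 1 2 0 2 1 2 */
#endif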
- - buf = vp9_lookahead_peek(cpi->lookahead, 0); - if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && - cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE && - lc->rc.frames_to_key != 0 && - !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) { - if ((cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || - (cpi->svc.number_spatial_layers > 1 && - cpi->svc.spatial_layer_id == 0)) { - struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0); - - if (buf != NULL) { - cpi->svc.empty_frame.ts_start = buf->ts_start; - cpi->svc.empty_frame.ts_end = buf->ts_end; - cpi->svc.encode_empty_frame_state = ENCODING; - cpi->common.show_frame = 0; - cpi->ref_frame_flags = 0; - cpi->common.frame_type = INTER_FRAME; - cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx = - SMALL_FRAME_FB_IDX; - - if (cpi->svc.encode_intra_empty_frame != 0) cpi->common.intra_only = 1; - - width = SMALL_FRAME_WIDTH; - height = SMALL_FRAME_HEIGHT; - } - } + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->last_layer_dropped[svc->spatial_layer_id] && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). + cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id]; } - cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q); - cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q); - - vp9_change_config(cpi, &cpi->oxcf); - if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; - vp9_set_high_precision_mv(cpi, 1); - - cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; - return 0; } -#undef SMALL_FRAME_FB_IDX -#endif // CONFIG_SPATIAL_SVC - struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, struct lookahead_ctx *ctx, int drain) { @@ -840,7 +967,7 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) { } // Reset on key frame: reset counters, references and buffer updates. -void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { +void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) { int sl, tl; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; @@ -848,7 +975,7 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; lc->current_video_frame_in_layer = 0; - lc->frames_from_key_frame = 0; + if (is_key) lc->frames_from_key_frame = 0; } } if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { @@ -887,3 +1014,276 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { } } } + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + const int sl = svc->spatial_layer_id; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. + // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled + // on key frames or if any spatial layer is a sync layer. 
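#if 0
/* Aside (guarded out of the build): a boolean sketch, with hypothetical
 * names, of the gating condition checked just below. Spatial (inter-layer)
 * references are dropped when prediction is globally off, when it is off for
 * non-key/non-sync frames and this frame is neither, or when the previous
 * spatial layer was dropped. */
static int drop_spatial_refs(int mode_off, int mode_off_nonkey, int is_key,
                             int has_layer_sync, int prev_layer_dropped) {
  return mode_off || (mode_off_nonkey && !is_key && !has_layer_sync) ||
         prev_layer_dropped;
}
#endif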
+  if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY &&
+       !svc->layer_context[svc->temporal_layer_id].is_key_frame &&
+       !svc->superframe_has_layer_sync) ||
+      svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
+      svc->drop_spatial_layer[sl - 1]) {
+    MV_REFERENCE_FRAME ref_frame;
+    static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                      VP9_ALT_FLAG };
+    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+      const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+      if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+        const struct scale_factors *const scale_fac =
+            &cm->frame_refs[ref_frame - 1].sf;
+        if (vp9_is_scaled(scale_fac)) {
+          cpi->ref_frame_flags &= (~flag_list[ref_frame]);
+          // Point golden/altref frame buffer index to last.
+          if (!svc->simulcast_mode) {
+            if (ref_frame == GOLDEN_FRAME)
+              cpi->gld_fb_idx = cpi->lst_fb_idx;
+            else if (ref_frame == ALTREF_FRAME)
+              cpi->alt_fb_idx = cpi->lst_fb_idx;
+          }
+        }
+      }
+    }
+  }
+  // For fixed/non-flexible SVC: check for disabling inter-layer prediction.
+  // If the reference for inter-layer prediction (the reference that is scaled)
+  // is not the previous spatial layer from the same superframe, then we disable
+  // inter-layer prediction. Only need to check when inter_layer prediction is
+  // not set to OFF mode.
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) {
+    // We only use LAST and GOLDEN for prediction in real-time mode, so we
+    // check both here.
+    MV_REFERENCE_FRAME ref_frame;
+    for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) {
+      struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf;
+      if (vp9_is_scaled(scale_fac)) {
+        // If this reference was updated on the previous spatial layer of the
+        // current superframe, then we keep this reference (don't disable).
+        // Otherwise we disable the inter-layer prediction.
+        // This condition is verified by checking if the current frame buffer
+        // index is equal to any of the slots for the previous spatial layer,
+        // and if so, check if that slot was updated/refreshed. If that is the
+        // case, then this reference is valid for inter-layer prediction under
+        // the mode INTER_LAYER_PRED_ON_CONSTRAINED.
+        int fb_idx =
+            ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx;
+        int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG;
+        int disable = 1;
+        if (fb_idx < 0) continue;
+        if ((fb_idx == svc->lst_fb_idx[sl - 1] &&
+             (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) ||
+            (fb_idx == svc->gld_fb_idx[sl - 1] &&
+             (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) ||
+            (fb_idx == svc->alt_fb_idx[sl - 1] &&
+             (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))))
+          disable = 0;
+        if (disable) cpi->ref_frame_flags &= (~ref_flag);
+      }
+    }
+  }
+}
+
+void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // For fixed/non-flexible mode, the following constraints are expected,
+  // when inter-layer prediction is on (default).
+  if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON &&
+      svc->framedrop_mode != LAYER_DROP) {
+    if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) {
+      // On non-key frames: LAST is always temporal reference, GOLDEN is
+      // spatial reference.
+      if (svc->temporal_layer_id == 0)
+        // Base temporal only predicts from base temporal.
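#if 0
/* Aside (guarded out of the build): a standalone sketch, with hypothetical
 * names, of the INTER_LAYER_PRED_ON_CONSTRAINED validity test implemented
 * above: a scaled reference in slot fb_idx stays enabled only if the previous
 * spatial layer both owns that slot and refreshed it in this superframe. */
static int scaled_ref_is_valid(int fb_idx, int prev_lst, int prev_gld,
                               int prev_alt, int prev_update_mask) {
  if (fb_idx < 0) return 0;
  return (fb_idx == prev_lst || fb_idx == prev_gld || fb_idx == prev_alt) &&
         (prev_update_mask & (1 << fb_idx)) != 0;
}
#endif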
+ assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from lower temporal layer. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Non-base spatial only predicts from lower spatial layer with same + // temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0 && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Only 1 reference for frame whose base is key; reference may be LAST + // or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } else if (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For the usage of golden as second long term reference: the + // temporal_layer_id of that reference must be base temporal layer 0, and + // spatial_layer_id of that reference must be same as current + // spatial_layer_id. If not, disable feature. + // TODO(marpan): Investigate when this can happen, and maybe put this check + // and reset in a different place. + if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] != + svc->spatial_layer_id || + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0) + svc->use_gf_temporal_ref_current_layer = 0; + } +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(VP9_COMP *const cpi) { + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + return denoise_svc(cpi) && !lc->is_key_frame; +} +#endif + +void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Only for superframes whose base is not key, as those are + // already sync frames. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + if (svc->spatial_layer_id == 0) { + // On base spatial layer: if the current superframe has a layer sync then + // reset the pattern counters and reset to base temporal layer. + if (svc->superframe_has_layer_sync) + vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME); + } + // If the layer sync is set for this current spatial layer then + // disable the temporal reference. + if (svc->spatial_layer_id > 0 && + svc->spatial_layer_sync[svc->spatial_layer_id]) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + // If golden is used as second reference: need to remove it from + // prediction, reset refresh period to 0, and update the reference. + svc->use_gf_temporal_ref_current_layer = 0; + cpi->rc.baseline_gf_interval = 0; + cpi->rc.frames_till_gf_update_due = 0; + // On layer sync frame we must update the buffer index used for long + // term reference. 
+        // term reference. Use the alt_ref since it is not used or updated on
+        // sync frames.
+        if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1;
+        assert(index >= 0);
+        cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx;
+        cpi->ext_refresh_alt_ref_frame = 1;
+      }
+    }
+  }
+}
+
+void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) {
+  SVC *const svc = &cpi->svc;
+  // Update the usage of frame buffer index for base spatial layers.
+  if (svc->spatial_layer_id == 0) {
+    if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame)
+      svc->fb_idx_base[cpi->lst_fb_idx] = 1;
+    if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame)
+      svc->fb_idx_base[cpi->gld_fb_idx] = 1;
+    if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame)
+      svc->fb_idx_base[cpi->alt_fb_idx] = 1;
+  }
+}
+
+static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) {
+  // For non-flexible/bypass SVC mode: check for refreshing other buffer
+  // slots.
+  SVC *const svc = &cpi->svc;
+  VP9_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  int i;
+  for (i = 0; i < REF_FRAMES; i++) {
+    if (cm->frame_type == KEY_FRAME ||
+        svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) {
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+      svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id;
+    }
+  }
+}
+
+void vp9_svc_update_ref_frame(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  BufferPool *const pool = cm->buffer_pool;
+
+  if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+      svc->use_set_ref_frame_config) {
+    vp9_svc_update_ref_frame_bypass_mode(cpi);
+  } else if (cm->frame_type == KEY_FRAME && !svc->simulcast_mode) {
+    // Keep track of frame index for each reference frame.
+    int i;
+    // On key frame update all reference frame slots.
+    for (i = 0; i < REF_FRAMES; i++) {
+      svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id;
+      // LAST/GOLDEN/ALTREF is already updated above.
+      if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx)
+        ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+    }
+  } else {
+    if (cpi->refresh_last_frame) {
+      svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id;
+    }
+    if (cpi->refresh_golden_frame) {
+      svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id;
+    }
+    if (cpi->refresh_alt_ref_frame) {
+      svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id;
+      svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id;
+    }
+  }
+  // Copy flags from encoder to SVC struct.
+  vp9_copy_flags_ref_update_idx(cpi);
+  vp9_svc_update_ref_frame_buffer_idx(cpi);
+}
+
+void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) {
+  int64_t this_duration =
+      cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id];
+  vp9_new_framerate(cpi, 10000000.0 / this_duration);
+}
+
+void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
+  RATE_CONTROL *const rc = &cpi->rc;
+  // On key frames in CBR mode: reset the avg_frame_qindex for base layer
+  // (to level closer to worst_quality) if the overshoot is significant.
+ // Reset it for all temporal layers on base spatial layer. + if (cm->frame_type == KEY_FRAME && cpi->oxcf.rc_mode == VPX_CBR && + !svc->simulcast_mode && + rc->projected_frame_size > 3 * rc->avg_frame_bandwidth) { + int tl; + rc->avg_frame_qindex[INTER_FRAME] = + VPXMAX(rc->avg_frame_qindex[INTER_FRAME], + (cm->base_qindex + rc->worst_quality) >> 1); + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = LAYER_IDS_TO_IDX(0, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = rc->avg_frame_qindex[INTER_FRAME]; + } + } +} diff --git a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h index b7cdfd9623d..77d43826654 100644 --- a/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libs/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ -#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ #include "vpx/vpx_encoder.h" @@ -19,6 +19,24 @@ extern "C" { #endif +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. + INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +typedef struct BUFFER_LONGTERM_REF { + int idx; + int is_used; +} BUFFER_LONGTERM_REF; + typedef struct { RATE_CONTROL rc; int target_bandwidth; @@ -42,10 +60,14 @@ typedef struct { size_t layer_size; struct vpx_psnr_pkt psnr_pkt; // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct. int sb_index; signed char *map; uint8_t *last_coded_q_map; uint8_t *consec_zero_mv; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; uint8_t speed; } LAYER_CONTEXT; @@ -56,8 +78,6 @@ typedef struct SVC { int number_temporal_layers; int spatial_layer_to_encode; - int first_spatial_layer_to_encode; - int rc_drop_superframe; // Workaround for multiple frame contexts enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state; @@ -81,14 +101,20 @@ typedef struct SVC { // Frame flags and buffer indexes for each spatial layer, set by the // application (external settings). int ext_frame_flags[VPX_MAX_LAYERS]; - int ext_lst_fb_idx[VPX_MAX_LAYERS]; - int ext_gld_fb_idx[VPX_MAX_LAYERS]; - int ext_alt_fb_idx[VPX_MAX_LAYERS]; - int ref_frame_index[REF_FRAMES]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; int force_zero_mode_spatial_ref; + // Sequence level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref; + // Frame level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref_current_layer; + // Allow second reference for at most 2 top highest resolution layers. 
+ BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2]; int current_superframe; int non_reference_frame; int use_base_mv; + int use_partition_reuse; // Used to control the downscaling filter for source scaling, for 1 pass CBR. // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), // = 8 will center the target pixel and get a symmetric averaging filter. @@ -99,8 +125,73 @@ typedef struct SVC { BLOCK_SIZE *prev_partition_svc; int mi_stride[VPX_MAX_LAYERS]; + int mi_rows[VPX_MAX_LAYERS]; + int mi_cols[VPX_MAX_LAYERS]; int first_layer_denoise; + + int skip_enhancement_layer; + + int lower_layer_qindex; + + int last_layer_dropped[VPX_MAX_LAYERS]; + int drop_spatial_layer[VPX_MAX_LAYERS]; + int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int max_consec_drop; + SVC_LAYER_DROP_MODE framedrop_mode; + + INTER_LAYER_PRED disable_inter_layer_pred; + + // Flag to indicate scene change and high num of motion blocks at current + // superframe, scene detection is currently checked for each superframe prior + // to encoding, on the full resolution source. + int high_source_sad_superframe; + int high_num_blocks_with_motion; + + // Flags used to get SVC pattern info. + int update_buffer_slot[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + // TODO(jianj): Remove these last 3, deprecated. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. + uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; + + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + uint8_t set_intra_only_frame; + uint8_t previous_frame_is_intra_only; + uint8_t superframe_has_layer_sync; + + uint8_t fb_idx_base[REF_FRAMES]; + + int use_set_ref_frame_config; + + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; + + int first_spatial_layer_to_encode; + + // Parameters for allowing framerate per spatial layer, and buffer + // update based on timestamps. + int64_t duration[VPX_SS_MAX_LAYERS]; + int64_t timebase_fac; + int64_t time_stamp_superframe; + int64_t time_stamp_prev[VPX_SS_MAX_LAYERS]; + + int num_encoded_top_layer; + + // Every spatial layer on a superframe whose base is key is key too. 
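#if 0
/* Aside (guarded out of the build): an illustration of the
 * downsample_filter_phase convention documented above, for a 2x 1-D
 * decimation. Phase 0 keeps the co-located source sample; phase 8 (half-pel)
 * centers the output between two source samples, i.e. a symmetric average.
 * Hypothetical helper, not the libvpx scaler (which uses longer kernels). */
static void decimate_2x_1d(const unsigned char *src, int n_out,
                           unsigned char *dst, int phase) {
  int i;
  for (i = 0; i < n_out; ++i) {
    if (phase == 0)
      dst[i] = src[2 * i]; /* plain sub-sampling */
    else
      dst[i] = (unsigned char)((src[2 * i] + src[2 * i + 1] + 1) >> 1);
  }
}
#endif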
+ int simulcast_mode; } SVC; struct VP9_COMP; @@ -148,16 +239,37 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); +#endif + +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); -void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key); void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + +void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_key_simulcast(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_ +#endif // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c index 2758c42aebf..701bb892871 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -34,57 +34,155 @@ #include "vpx_scale/vpx_scale.h" static int fixed_divide[512]; +static unsigned int index_mult[14] = { 0, 0, 0, 0, 49152, + 39322, 32768, 28087, 24576, 21846, + 19661, 17874, 0, 15124 }; +#if CONFIG_VP9_HIGHBITDEPTH +static int64_t highbd_index_mult[14] = { 0U, 0U, 0U, + 0U, 3221225472U, 2576980378U, + 2147483648U, 1840700270U, 1610612736U, + 1431655766U, 1288490189U, 1171354718U, + 0U, 991146300U }; +#endif // CONFIG_VP9_HIGHBITDEPTH static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, - uint8_t *pred, struct scale_factors *scale, int x, int y) { + uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs, + int use_32x32) { const int which_mv = 0; - const MV mv = { mv_row, mv_col }; const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP]; + int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1); enum mv_precision mv_precision_uv; int uv_stride; - if (uv_block_width == 8) { + if (uv_block_width == (BW >> 1)) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } +#if !CONFIG_VP9_HIGHBITDEPTH + (void)xd; +#endif + if (use_32x32) { + const MV mv = { mv_row, mv_col }; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, - CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv, - scale, 16, 16, which_mv, kernel, - MV_PRECISION_Q3, x, y, xd->bd); - - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, - CONVERT_TO_SHORTPTR(&pred[256]), - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); - - 
vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, - CONVERT_TO_SHORTPTR(&pred[512]), - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, + CONVERT_TO_SHORTPTR(&pred[0]), BW, &mv, + scale, BW, BH, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, + y, xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv, + scale, uv_block_width, uv_block_height, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW, + BH, which_mv, kernel, MV_PRECISION_Q3, x, y); + + vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[BLK_PELS], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); return; } + + // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16 + // predictors. + // Y predictor + for (i = 0; i < BH; i += ys) { + for (j = 0; j < BW; j += xs) { + const MV mv = blk_mvs[k]; + const int y_offset = i * stride + j; + const int p_offset = i * BW + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride, + CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys, + which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd); + } else { + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); + } +#else + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); #endif // CONFIG_VP9_HIGHBITDEPTH - (void)xd; - vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - which_mv, kernel, MV_PRECISION_Q3, x, y); + k++; + } + } + + // U and V predictors + ys = (uv_block_height >> 1); + xs = (uv_block_width >> 1); + k = 0; - vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); + for (i = 0; i < uv_block_height; i += ys) { + for (j = 0; j < uv_block_width; j += xs) { + const MV mv = blk_mvs[k]; + const int uv_offset = i * uv_stride + j; + const int p_offset = i * uv_block_width + j; - vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y, + xd->bd); + + vp9_highbd_build_inter_predictor( + 
CONVERT_TO_SHORTPTR(v_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]), + uv_block_width, &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + } else { + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); + } +#else + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, &mv, + scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); +#endif // CONFIG_VP9_HIGHBITDEPTH + k++; + } + } } void vp9_temporal_filter_init(void) { @@ -94,143 +192,372 @@ void vp9_temporal_filter_init(void) { for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; } -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, - const uint8_t *frame2, - unsigned int block_width, - unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, - uint16_t *count) { - unsigned int i, j, k; +static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(index_mult[index] != 0); + + mod = + ((unsigned int)clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE int highbd_mod_index(int sum_dist, int index, int rounding, + int strength, int filter_weight) { + int mod; + + assert(index >= 0 && index <= 13); + assert(highbd_index_mult[index] != 0); + + mod = (int)((clamp(sum_dist, 0, INT32_MAX) * highbd_index_mult[index]) >> 32); + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static INLINE int get_filter_weight(unsigned int i, unsigned int j, + unsigned int block_height, + unsigned int block_width, + const int *const blk_fw, int use_32x32) { + // blk_fw[0] ~ blk_fw[3] are the same. + if (use_32x32) { + return blk_fw[0]; + } + + if (i < block_height / 2) { + if (j < block_width / 2) { + return blk_fw[0]; + } + + return blk_fw[1]; + } + + if (j < block_width / 2) { + return blk_fw[2]; + } + + return blk_fw[3]; +} + +void vp9_apply_temporal_filter_c( + const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, + int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, + int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, + int uv_buf_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { + unsigned int i, j, k, m; int modifier; - int byte = 0; - const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + const int rounding = (1 << strength) >> 1; + const unsigned int uv_block_width = block_width >> ss_x; + const unsigned int uv_block_height = block_height >> ss_y; + DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]); + + int idx = 0, idy; assert(strength >= 0); assert(strength <= 6); - assert(filter_weight >= 0); - assert(filter_weight <= 2); + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + + // Calculate diff^2 for each pixel of the 16x16 block. + // TODO(yunqing): the following code needs to be optimized. + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int16_t diff = + y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j]; + y_diff_sse[idx++] = diff * diff; + } + } + idx = 0; + for (i = 0; i < uv_block_height; i++) { + for (j = 0; j < uv_block_width; j++) { + const int16_t diffu = + u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j]; + const int16_t diffv = + v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j]; + u_diff_sse[idx] = diffu * diffu; + v_diff_sse[idx] = diffv * diffv; + idx++; + } + } - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; + for (i = 0, k = 0, m = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = y_pred[i * y_buf_stride + j]; + const int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); // non-local mean approach - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; + int y_index = 0; + + const int uv_r = i >> ss_y; + const int uv_c = j >> ss_x; + modifier = 0; for (idy = -1; idy <= 1; ++idy) { for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + const int row = (int)i + idy; + const int col = (int)j + idx; if (row >= 0 && row < (int)block_height && col >= 0 && col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; - ++index; + modifier += y_diff_sse[row * (int)block_width + col]; + ++y_index; } } } - assert(index > 0); + assert(y_index > 0); - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; + modifier += u_diff_sse[uv_r * uv_block_width + uv_c]; + modifier += v_diff_sse[uv_r * uv_block_width + uv_c]; - modifier *= 3; - modifier /= index; + y_index += 2; - ++frame2; + modifier = + mod_index(modifier, y_index, rounding, strength, filter_weight); - modifier += rounding; - modifier >>= strength; + y_count[k] += modifier; + y_accumulator[k] += modifier * pixel_value; - if (modifier > 16) modifier = 16; + ++k; - modifier = 16 - modifier; - modifier *= filter_weight; + // Process chroma component + if (!(i & ss_y) && !(j & ss_x)) { + const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c]; + const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c]; - count[k] += modifier; - accumulator[k] += modifier * pixel_value; + // non-local mean approach + int cr_index = 0; + int u_mod = 0, v_mod = 0; + int y_diff = 0; - byte++; - } + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = uv_r + idy; + const int col = uv_c + idx; + + if (row >= 0 && row < (int)uv_block_height && col >= 0 && + col < (int)uv_block_width) { + 
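#if 0
/* Aside (guarded out of the build): mod_index() above replaces the old
 * "modifier *= 3; modifier /= index;" division with a fixed-point multiply:
 * index_mult[i] == round(3 * 2^16 / i), so (sum * index_mult[i]) >> 16 is
 * approximately sum * 3 / i. Quick self-check under that assumption: */
#include <assert.h>
static void check_mod_index_table(void) {
  static const unsigned int mult[14] = { 0,     0,     0,     0,     49152,
                                         39322, 32768, 28087, 24576, 21846,
                                         19661, 17874, 0,     15124 };
  /* An interior luma pixel sums 9 neighbors plus 2 chroma terms: index 11. */
  assert(((3000u * mult[6]) >> 16) == 3000 * 3 / 6);   /* 1500 */
  assert(((3000u * mult[11]) >> 16) == 3000 * 3 / 11); /* 818 */
}
#endif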
u_mod += u_diff_sse[row * uv_block_width + col]; + v_mod += v_diff_sse[row * uv_block_width + col]; + ++cr_index; + } + } + } + + assert(cr_index > 0); + + for (idy = 0; idy < 1 + ss_y; ++idy) { + for (idx = 0; idx < 1 + ss_x; ++idx) { + const int row = (uv_r << ss_y) + idy; + const int col = (uv_c << ss_x) + idx; + y_diff += y_diff_sse[row * (int)block_width + col]; + ++cr_index; + } + } + + u_mod += y_diff; + v_mod += y_diff; + + u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight); + v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight); - byte += stride - block_width; + u_count[m] += u_mod; + u_accumulator[m] += u_mod * u_pixel_value; + v_count[m] += v_mod; + v_accumulator[m] += v_mod * v_pixel_value; + + ++m; + } // Complete YUV pixel + } } } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_temporal_filter_apply_c( - const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, - unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, uint16_t *count) { - const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); - const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); - unsigned int i, j, k; - int modifier; - int byte = 0; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; - - for (i = 0, k = 0; i < block_height; i++) { - for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; - int diff_sse[9] = { 0 }; - int idx, idy, index = 0; +void vp9_highbd_apply_temporal_filter_c( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, + uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count) { + const int uv_block_width = block_width >> ss_x; + const int uv_block_height = block_height >> ss_y; + const int y_diff_stride = BW; + const int uv_diff_stride = BW; + + DECLARE_ALIGNED(16, uint32_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint32_t, v_diff_sse[BLK_PELS]); + + const int rounding = (1 << strength) >> 1; + + // Loop variables + int row, col; + int uv_row, uv_col; + int row_step, col_step; + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint32_t)); + + // Get the square diffs + for (row = 0; row < (int)block_height; row++) { + for (col = 0; col < (int)block_width; col++) { + const int diff = + y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col]; + y_diff_sse[row * y_diff_stride + col] = diff * diff; + } + } - for (idy = -1; idy <= 1; ++idy) { - for (idx = -1; idx <= 1; ++idx) { - int row = (int)i + idy; - int col = (int)j + idx; + for (row = 0; row < uv_block_height; row++) { + for (col = 0; col < uv_block_width; col++) { + const int u_diff = + u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col]; + const int v_diff = + v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col]; + u_diff_sse[row * uv_diff_stride + col] = u_diff * u_diff; + v_diff_sse[row * uv_diff_stride + col] = v_diff * v_diff; + } + } - if (row >= 0 && row < (int)block_height && col >= 0 && - col < (int)block_width) { - int diff = frame1[byte + idy * 
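#if 0
/* Aside (guarded out of the build): the accumulator/count pairs filled here
 * are normalized to filtered pixel values in code beyond this hunk. The
 * reciprocal table built in vp9_temporal_filter_init() above
 * (fixed_divide[i] = 0x80000 / i) lets that divide become a multiply plus a
 * 19-bit shift; a standalone sketch of the rounded division: */
static unsigned int filtered_pixel(unsigned int accum, unsigned int count) {
  const unsigned int recip = 0x80000 / count;    /* count in 1..511 */
  return ((accum + (count >> 1)) * recip) >> 19; /* ~ round(accum / count) */
}
#endif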
(int)stride + idx] -
-                   frame2[idy * (int)block_width + idx];
-          diff_sse[index] = diff * diff;
-          ++index;
+  // Apply the filter to luma
+  for (row = 0; row < (int)block_height; row++) {
+    for (col = 0; col < (int)block_width; col++) {
+      const int uv_row = row >> ss_y;
+      const int uv_col = col >> ss_x;
+      const int filter_weight = get_filter_weight(
+          row, col, block_height, block_width, blk_fw, use_32x32);
+
+      // First we get the modifier for the current y pixel
+      const int y_pixel = y_pre[row * y_pre_stride + col];
+      int y_num_used = 0;
+      int y_mod = 0;
+
+      // Sum the neighboring 3x3 y pixels
+      for (row_step = -1; row_step <= 1; row_step++) {
+        for (col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = row + row_step;
+          const int sub_col = col + col_step;
+
+          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
+              sub_col < (int)block_width) {
+            y_mod += y_diff_sse[sub_row * y_diff_stride + sub_col];
+            y_num_used++;
          }
        }
      }
-      assert(index > 0);
-      modifier = 0;
-      for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
+      // Sum the corresponding uv pixels to the current y modifier
+      // Note we are rounding down instead of rounding to the nearest pixel.
+      y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col];
+      y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col];
-      modifier *= 3;
-      modifier /= index;
+      y_num_used += 2;
-      ++frame2;
-      modifier += rounding;
-      modifier >>= strength;
+      // Set the modifier
+      y_mod = highbd_mod_index(y_mod, y_num_used, rounding, strength,
+                               filter_weight);
-      if (modifier > 16) modifier = 16;
+      // Accumulate the result
+      y_count[row * block_width + col] += y_mod;
+      y_accum[row * block_width + col] += y_mod * y_pixel;
+    }
+  }
-      modifier = 16 - modifier;
-      modifier *= filter_weight;
+  // Apply the filter to chroma
+  for (uv_row = 0; uv_row < uv_block_height; uv_row++) {
+    for (uv_col = 0; uv_col < uv_block_width; uv_col++) {
+      const int y_row = uv_row << ss_y;
+      const int y_col = uv_col << ss_x;
+      const int filter_weight = get_filter_weight(
+          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
+
+      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
+      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
+
+      int uv_num_used = 0;
+      int u_mod = 0, v_mod = 0;
+
+      // Sum the neighboring 3x3 chroma pixels to the chroma modifier
+      for (row_step = -1; row_step <= 1; row_step++) {
+        for (col_step = -1; col_step <= 1; col_step++) {
+          const int sub_row = uv_row + row_step;
+          const int sub_col = uv_col + col_step;
+
+          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
+              sub_col < uv_block_width) {
+            u_mod += u_diff_sse[sub_row * uv_diff_stride + sub_col];
+            v_mod += v_diff_sse[sub_row * uv_diff_stride + sub_col];
+            uv_num_used++;
+          }
+        }
+      }
-      count[k] += modifier;
-      accumulator[k] += modifier * pixel_value;
+      // Sum all the luma pixels associated with the current chroma pixel
+      for (row_step = 0; row_step < 1 + ss_y; row_step++) {
+        for (col_step = 0; col_step < 1 + ss_x; col_step++) {
+          const int sub_row = y_row + row_step;
+          const int sub_col = y_col + col_step;
+          const int y_diff = y_diff_sse[sub_row * y_diff_stride + sub_col];
-      byte++;
-    }
+          u_mod += y_diff;
+          v_mod += y_diff;
+          uv_num_used++;
+        }
+      }
-    byte += stride - block_width;
+      // Set the modifier
+      u_mod = highbd_mod_index(u_mod, uv_num_used, rounding, strength,
+                               filter_weight);
+      v_mod = highbd_mod_index(v_mod, uv_num_used, rounding, strength,
+                               filter_weight);
+
+      // Accumulate the result
+      u_count[uv_row * uv_block_width + uv_col] += u_mod;
+
u_accum[uv_row * uv_block_width + uv_col] += u_mod * u_pixel; + v_count[uv_row * uv_block_width + uv_col] += v_mod; + v_accum[uv_row * uv_block_width + uv_col] += v_mod * v_pixel; + } } } #endif // CONFIG_VP9_HIGHBITDEPTH -static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - ThreadData *td, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride, MV *ref_mv) { +static uint32_t temporal_filter_find_matching_mb_c( + VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs, + int *blk_bestsme) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = HEX; + const SEARCH_METHODS search_method = MESH; + const SEARCH_METHODS search_method_16 = cpi->sf.temporal_filter_search_method; int step_param; int sadpb = x->sadperbit16; uint32_t bestsme = UINT_MAX; @@ -245,6 +572,7 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; + int i, j, k = 0; best_ref_mv1_full.col = best_ref_mv1.col >> 3; best_ref_mv1_full.row = best_ref_mv1.row >> 3; @@ -260,19 +588,52 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, + vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, search_method, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, ref_mv, 0, 0); /* restore UMV window */ x->mv_limits = tmp_mv_limits; - // Ignore mv costing by sending NULL pointer instead of cost array + // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost + // calculation. The start full mv and the search result are stored in + // ref_mv. bestsme = cpi->find_fractional_mv_step( x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, - mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, 0, 0); + x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW, + BH, USE_8_TAPS_SHARP); + + // DO motion search on 4 16x16 sub_blocks. 
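#if 0
/* Aside (guarded out of the build): the loop below revisits the 32x32 block
 * as four 16x16 sub-blocks; each sub-block's source and reference pointers
 * are the 32x32 base pointer plus an (i * stride + j) offset. Sketch: */
static void sub_block_offsets(int stride, int offsets[4]) {
  int i, j, k = 0;
  for (i = 0; i < 32; i += 16)
    for (j = 0; j < 32; j += 16) offsets[k++] = i * stride + j;
  /* k == 4: top-left, top-right, bottom-left, bottom-right */
}
#endif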
+ best_ref_mv1.row = ref_mv->row; + best_ref_mv1.col = ref_mv->col; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + for (i = 0; i < BH; i += SUB_BH) { + for (j = 0; j < BW; j += SUB_BW) { + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf + i * stride + j; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j; + xd->plane[0].pre[0].stride = stride; + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, + step_param, search_method_16, sadpb, + cond_cost_list(cpi, cost_list), &best_ref_mv1, + &blk_mvs[k], 0, 0); + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + blk_bestsme[k] = cpi->find_fractional_mv_step( + x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0, + mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP); + k++; + } + } // Restore input state x->plane[0].src = src; @@ -293,25 +654,24 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, int byte; int frame; int mb_col; - unsigned int filter_weight; - int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; - int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); + int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2; + int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2; + DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]); MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]); uint8_t *predictor; #else - DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]); #endif - const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; - const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + const int mb_uv_height = BH >> mbd->plane[1].subsampling_y; + const int mb_uv_width = BW >> mbd->plane[1].subsampling_x; // Addition of the tile col level offsets - int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start; + int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start; int mb_uv_offset = mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; @@ -334,21 +694,21 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // 8 - VP9_INTERP_EXTEND. // To keep the mv in play for both Y and UV planes the max that it // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). 
- td->mb.mv_limits.row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * VP9_INTERP_EXTEND); for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { int i, j, k; int stride; MV ref_mv; - vp9_zero_array(accumulator, 16 * 16 * 3); - vp9_zero_array(count, 16 * 16 * 3); + vp9_zero_array(accumulator, BLK_PELS * 3); + vp9_zero_array(count, BLK_PELS * 3); - td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * VP9_INTERP_EXTEND); if (cpi->oxcf.content == VP9E_CONTENT_FILM) { unsigned int src_variance; @@ -360,92 +720,130 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { src_variance = - vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); + vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd); } else { - src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); } #else - src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); #endif // CONFIG_VP9_HIGHBITDEPTH - if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + if (src_variance <= 2) { + strength = VPXMAX(0, arnr_filter_data->strength - 2); + } } for (frame = 0; frame < frame_count; frame++) { - const uint32_t thresh_low = 10000; - const uint32_t thresh_high = 20000; + // MVs for 4 16x16 sub blocks. + MV blk_mvs[4]; + // Filter weights for 4 16x16 sub blocks. + int blk_fw[4] = { 0, 0, 0, 0 }; + int use_32x32 = 0; if (frames[frame] == NULL) continue; ref_mv.row = 0; ref_mv.col = 0; + blk_mvs[0] = kZeroMv; + blk_mvs[1] = kZeroMv; + blk_mvs[2] = kZeroMv; + blk_mvs[3] = kZeroMv; if (frame == alt_ref_index) { - filter_weight = 2; + blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2; + use_32x32 = 1; } else { + const int thresh_low = 10000; + const int thresh_high = 20000; + int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + // Find best match in this frame by MC - uint32_t err = temporal_filter_find_matching_mb_c( + int err = temporal_filter_find_matching_mb_c( cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, - &ref_mv); + &ref_mv, blk_mvs, blk_bestsme); + + int err16 = + blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3]; + int max_err = INT_MIN, min_err = INT_MAX; + for (k = 0; k < 4; k++) { + if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k]; + if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k]; + } - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0; + if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) || + ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) { + use_32x32 = 1; + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. 
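
/*
 * A scalar restatement of the use_32x32 decision made above: the single
 * 32x32 weight is kept when splitting into four 16x16 searches barely
 * reduces the error -- the whole-block error must be within 16/15 of the
 * summed sub-block errors (with a moderate error spread), or within 16/14
 * when the sub-block errors are tightly clustered:
 */
#include <limits.h>

static int tf_use_32x32(int err, const int blk_bestsme[4]) {
  const int err16 =
      blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3];
  int max_err = INT_MIN, min_err = INT_MAX, k;
  for (k = 0; k < 4; k++) {
    if (blk_bestsme[k] < min_err) min_err = blk_bestsme[k];
    if (blk_bestsme[k] > max_err) max_err = blk_bestsme[k];
  }
  /* err * 15 < err16 * 16  <=>  err < (16 / 15.0) * err16 */
  return (err * 15 < (err16 << 4) && max_err - min_err < 10000) ||
         (err * 14 < (err16 << 4) && max_err - min_err < 5000);
}
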
+ blk_fw[0] = err < (thresh_low << THR_SHIFT) + ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; + } else { + use_32x32 = 0; + for (k = 0; k < 4; k++) + blk_fw[k] = blk_bestsme[k] < thresh_low + ? 2 + : blk_bestsme[k] < thresh_high ? 1 : 0; + } + + for (k = 0; k < 4; k++) { + switch (abs(frame - alt_ref_index)) { + case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break; + case 2: + case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break; + default: break; + } + } } - if (filter_weight != 0) { + if (blk_fw[0] | blk_fw[1] | blk_fw[2] | blk_fw[3]) { // Construct the predictors temporal_filter_predictors_mb_c( mbd, frames[frame]->y_buffer + mb_y_offset, frames[frame]->u_buffer + mb_uv_offset, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); + mb_col * BW, mb_row * BH, blk_mvs, use_32x32); #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) - vp9_highbd_temporal_filter_apply( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); - vp9_highbd_temporal_filter_apply( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); - vp9_highbd_temporal_filter_apply( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); + vp9_highbd_apply_temporal_filter( + CONVERT_TO_SHORTPTR(f->y_buffer + mb_y_offset), f->y_stride, + CONVERT_TO_SHORTPTR(predictor), BW, + CONVERT_TO_SHORTPTR(f->u_buffer + mb_uv_offset), + CONVERT_TO_SHORTPTR(f->v_buffer + mb_uv_offset), f->uv_stride, + CONVERT_TO_SHORTPTR(predictor + BLK_PELS), + CONVERT_TO_SHORTPTR(predictor + (BLK_PELS << 1)), mb_uv_width, BW, + BH, mbd->plane[1].subsampling_x, mbd->plane[1].subsampling_y, + adj_strength, blk_fw, use_32x32, accumulator, count, + accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 512, - count + 512); + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); } #else // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 256, - count + 256); - 
vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 512, - count + 512); + vp9_apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); #endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -459,8 +857,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1_16 = CONVERT_TO_SHORTPTR(dst1); stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -471,7 +869,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; @@ -480,9 +878,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst2_16 = CONVERT_TO_SHORTPTR(dst2); stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -507,8 +905,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -518,16 +916,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -552,8 +950,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -563,16 +961,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - 
int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -592,7 +990,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, byte += stride - mb_uv_width; } #endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; + mb_y_offset += BW; mb_uv_offset += mb_uv_width; } } @@ -603,10 +1001,10 @@ static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, const int tile_cols = 1 << cm->log2_tile_cols; TileInfo *tile_info = &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; - const int mb_row_start = (tile_info->mi_row_start) >> 1; - const int mb_row_end = (tile_info->mi_row_end + 1) >> 1; - const int mb_col_start = (tile_info->mi_col_start) >> 1; - const int mb_col_end = (tile_info->mi_col_end + 1) >> 1; + const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT; + const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT; + const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT; + const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT; int mb_row; for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { @@ -620,13 +1018,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; int tile_row, tile_col; - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; - vp9_init_tile_data(cpi); for (tile_row = 0; tile_row < tile_rows; ++tile_row) { @@ -634,15 +1025,13 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi) { temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); } } - - // Restore input state - for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; } // Apply buffer limits and context specific adjustments to arnr filter. static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, int *arnr_frames, int *arnr_strength) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const int frames_after_arf = vp9_lookahead_depth(cpi->lookahead) - distance - 1; int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; @@ -696,12 +1085,17 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, } // Adjustments for second level arf in multi arf case. - if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { - strength >>= 1; - } - } + // Leave commented out place holder for possible filtering adjustment with + // new multi-layer arf code. + // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) + // if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1; + + // TODO(jingning): Skip temporal filtering for intermediate frames that will + // be used as show_existing_frame. Need to further explore the possibility to + // apply certain filter. + if (gf_group->arf_src_offset[gf_group->index] < + cpi->rc.baseline_gf_interval - 1) + frames = 1; *arnr_frames = frames; *arnr_strength = strength; @@ -800,8 +1194,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { } // Initialize errorperbit and sabperbit. 
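
/*
 * The pval arithmetic above turns accumulator / count into a multiply and
 * a shift: consistent with the >> 19, fixed_divide[] holds the reciprocals
 * (1 << 19) / i (this is how vp9_temporal_filter_init() builds the table in
 * libvpx). A standalone check that the reciprocal form tracks true rounded
 * division to within one:
 */
#include <assert.h>
#include <stdint.h>

static uint32_t fixed_divide_tbl[512];

static void init_fixed_divide(void) {
  uint32_t i;
  fixed_divide_tbl[0] = 0; /* count == 0 is never divided */
  for (i = 1; i < 512; i++) fixed_divide_tbl[i] = (1u << 19) / i;
}

/* (acc + count / 2) / count via reciprocal multiply, as in the code above. */
static uint32_t normalize_pixel(uint32_t acc, uint16_t count) {
  uint32_t pval = acc + (count >> 1); /* round to nearest */
  pval *= fixed_divide_tbl[count];
  return pval >> 19;
}

int main(void) {
  uint32_t acc;
  uint16_t count;
  init_fixed_divide();
  /* acc <= 255 * count holds by construction in the filter, and keeps the
   * 32-bit multiply above from overflowing. */
  for (count = 1; count < 512; count++) {
    for (acc = 0; acc < 255u * count; acc += 93) {
      const uint32_t exact = (acc + (count >> 1)) / count;
      assert(exact - normalize_pixel(acc, count) <= 1);
    }
  }
  return 0;
}
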
- rdmult = (int)vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); - if (rdmult < 1) rdmult = 1; + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); set_error_per_bit(&cpi->td.mb, rdmult); vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); diff --git a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h index 775e49cc537..553a468280f 100644 --- a/libs/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/libs/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -8,14 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -#define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { #endif #define ARNR_FILT_QINDEX 128 +static const MV kZeroMv = { 0, 0 }; + +// Block size used in temporal filtering +#define TF_BLOCK BLOCK_32X32 +#define BH 32 +#define BH_LOG2 5 +#define BW 32 +#define BW_LOG2 5 +#define BLK_PELS ((BH) * (BW)) // Pixels in the block +#define TF_SHIFT 2 +#define TF_ROUND 3 +#define THR_SHIFT 2 +#define TF_SUB_BLOCK BLOCK_16X16 +#define SUB_BH 16 +#define SUB_BW 16 void vp9_temporal_filter_init(void); void vp9_temporal_filter(VP9_COMP *cpi, int distance); @@ -28,4 +43,4 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#endif // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_tokenize.h b/libs/libvpx/vp9/encoder/vp9_tokenize.h index b2f63ffef5a..6407ff92376 100644 --- a/libs/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libs/libvpx/vp9/encoder/vp9_tokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TOKENIZE_H_ -#define VP9_ENCODER_VP9_TOKENIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_ +#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_ #include "vp9/common/vp9_entropy.h" @@ -127,4 +127,4 @@ static INLINE int vp9_get_token_cost(int v, int16_t *token, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TOKENIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_TOKENIZE_H_ diff --git a/libs/libvpx/vp9/encoder/vp9_treewriter.h b/libs/libvpx/vp9/encoder/vp9_treewriter.h index a8b9c2cd312..86c5fa22447 100644 --- a/libs/libvpx/vp9/encoder/vp9_treewriter.h +++ b/libs/libvpx/vp9/encoder/vp9_treewriter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TREEWRITER_H_ -#define VP9_ENCODER_VP9_TREEWRITER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_ +#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_ #include "vpx_dsp/bitwriter.h" @@ -48,4 +48,4 @@ static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TREEWRITER_H_ +#endif // VPX_VP9_ENCODER_VP9_TREEWRITER_H_ diff --git a/libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c new file mode 100644 index 00000000000..4fa24512c59 --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -0,0 +1,943 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/x86/temporal_filter_constants.h"
+
+// Compute (a-b)**2 for 8 16-bit pixels
+static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
+                                       uint32_t *dst) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
+  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
+
+  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
+  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
+  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
+  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
+
+  __m128i dist_first, dist_second;
+
+  dist_first = _mm_sub_epi32(a_first, b_first);
+  dist_second = _mm_sub_epi32(a_second, b_second);
+  dist_first = _mm_mullo_epi32(dist_first, dist_first);
+  dist_second = _mm_mullo_epi32(dist_second, dist_second);
+
+  _mm_storeu_si128((__m128i *)dst, dist_first);
+  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
+}
+
+// Sum up three neighboring distortions for the pixels
+static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
+  __m128i dist_reg, dist_left, dist_right;
+
+  dist_reg = _mm_loadu_si128((const __m128i *)dist);
+  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
+  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
+
+  *sum = _mm_add_epi32(dist_reg, dist_left);
+  *sum = _mm_add_epi32(*sum, dist_right);
+}
+
+static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
+                                    __m128i *sum_second) {
+  highbd_get_sum_4(dist, sum_first);
+  highbd_get_sum_4(dist + 4, sum_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values, plus
+// however many values from the other Y/UV planes are added in).
+//
+// Add in the rounding factor and shift, clamp to 16, invert and shift.
+// Multiply by weight.
+static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
+                                    const __m128i *mul_constants,
+                                    const int strength, const int rounding,
+                                    const int weight) {
+  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
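
/*
 * A scalar view of the per-pixel weight that highbd_average_4() (opened
 * above) and highbd_average_8() compute: the neighborhood sum of squared
 * differences is scaled by 3 / num_values -- the HIGHBD_NEIGHBOR_CONSTANT_*
 * multipliers are (3 << 32) / num_values, applied as a multiply plus a
 * 32-bit shift -- then rounded, shifted by the strength, clamped to 16,
 * inverted, and multiplied by the block weight:
 */
#include <stdint.h>

static uint32_t tf_pixel_modifier(uint32_t sum_sq_diff, uint32_t num_values,
                                  int strength, uint32_t weight) {
  const uint32_t rounding = ((uint32_t)1 << strength) >> 1;
  uint32_t mod = sum_sq_diff * 3 / num_values;
  mod = (mod + rounding) >> strength;
  if (mod > 16) mod = 16;
  return (16 - mod) * weight;
}
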
+ const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); + const __m128i rounding_u32 = _mm_set1_epi32(rounding); + const __m128i weight_u32 = _mm_set1_epi32(weight); + const __m128i sixteen = _mm_set1_epi32(16); + const __m128i zero = _mm_setzero_si128(); + + // modifier * 3 / index; + const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero); + const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero); + const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero); + const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero); + + const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo); + const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32); + const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi); + const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32); + + // Now we have + // mul_lo: 00 a1 00 a0 + // mul_hi: 00 a3 00 a2 + // Unpack as 64 bit words to get even and odd elements + // unpack_lo: 00 a2 00 a0 + // unpack_hi: 00 a3 00 a1 + // Then we can shift and OR the results to get everything in 32-bits + const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div); + const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4); + const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift); + + // Round + *output = _mm_add_epi32(mul, rounding_u32); + *output = _mm_srl_epi32(*output, strength_u128); + + // Multiply with the weight + *output = _mm_min_epu32(*output, sixteen); + *output = _mm_sub_epi32(sixteen, *output); + *output = _mm_mullo_epi32(*output, weight_u32); +} + +static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1, + const __m128i *sum_0_u32, + const __m128i *sum_1_u32, + const __m128i *mul_constants_0, + const __m128i *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' 
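
/*
 * Scalar equivalent of highbd_accumulate_and_store_8() below, one pixel at
 * a time: the weighted modifier is added to the pixel's count, and
 * modifier * predicted_pixel to the running accumulator that the caller
 * later divides back down by the count (the SSE version saturates the
 * 16-bit count addition with _mm_adds_epu16):
 */
#include <stdint.h>

static void tf_accumulate_pixel(uint32_t modifier, uint16_t pred,
                                uint16_t *count, uint32_t *accum) {
  *count = (uint16_t)(*count + modifier);
  *accum += modifier * pred;
}
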
+static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, + const __m128i sum_second_u32, + const uint16_t *pred, + uint16_t *count, + uint32_t *accumulator) { + // Cast down to 16-bit ints + const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32); + const __m128i zero = _mm_setzero_si128(); + + __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred); + __m128i count_u16 = _mm_loadu_si128((const __m128i *)count); + + __m128i pred_0_u32, pred_1_u32; + __m128i accum_0_u32, accum_1_u32; + + count_u16 = _mm_adds_epu16(count_u16, sum_u16); + _mm_storeu_si128((__m128i *)count, count_u16); + + pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); + + pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); + pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); + accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); + + accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32); + accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32); + + _mm_storeu_si128((__m128i *)accumulator, accum_0_u32); + _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first, + __m128i *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first, + __m128i *u_second, __m128i *v_first, __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+ highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + __m128i u_reg, v_reg; + + highbd_read_dist_4(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi32(u_reg, u_reg); + *u_second = _mm_unpackhi_epi32(u_reg, u_reg); + + highbd_read_dist_4(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi32(v_reg, v_reg); + *v_second = _mm_unpackhi_epi32(v_reg, v_reg); + } +} + +static void vp9_highbd_apply_temporal_filter_luma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist, + const uint32_t *v_dist, const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + 
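
/*
 * The filtering row loop here keeps three rows of horizontal 3-tap sums
 * (sum_row_1/2/3) and shifts them down one row per iteration, so each new
 * row of distortions is summed exactly once. The quantity being built is a
 * 3x3 box sum over the squared-difference plane; a direct scalar form with
 * explicit bounds checks (the SSE path avoids them by padding the plane,
 * hence the y_dist + 1 pointers and the DIST_STRIDE pitch):
 */
#include <stdint.h>

/* Returns the 3x3 sum of dist centered on (r, c) and the number of
 * in-bounds values that contributed: 9 inside, 6 on edges, 4 in corners --
 * the counts behind the NEIGHBOR_CONSTANT_* reciprocals. */
static uint32_t tf_box3x3_sum(const uint32_t *dist, int stride, int w, int h,
                              int r, int c, int *num_values) {
  uint32_t sum = 0;
  int rr, cc, n = 0;
  for (rr = r - 1; rr <= r + 1; rr++) {
    for (cc = c - 1; cc <= c + 1; cc++) {
      if (rr >= 0 && rr < h && cc >= 0 && cc < w) {
        sum += dist[rr * stride + cc];
        n++;
      }
    }
  }
  *num_values = n;
  return sum;
}
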
// Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first); + sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = _mm_add_epi32(sum_row_first, u_first); + sum_row_second = _mm_add_epi32(sum_row_second, u_second); + sum_row_first = _mm_add_epi32(sum_row_first, v_first); + sum_row_second = _mm_add_epi32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first, + &sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
+static void vp9_highbd_apply_temporal_filter_luma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_highbd_apply_temporal_filter_luma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
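
/*
 * Scalar form of highbd_add_luma_dist_to_8_chroma_mod() defined below: each
 * chroma distortion also absorbs the luma distortions of the pixels it
 * covers -- one, two, or four values depending on ss_x/ss_y (the horizontal
 * pairing is what the SSE code performs with _mm_hadd_epi32):
 */
#include <stdint.h>

static uint32_t tf_luma_dist_for_chroma(const uint32_t *y_dist, int y_stride,
                                        int chroma_row, int chroma_col,
                                        int ss_x, int ss_y) {
  const int yr = chroma_row << ss_y;
  const int yc = chroma_col << ss_x;
  uint32_t sum = y_dist[yr * y_stride + yc];
  if (ss_x) sum += y_dist[yr * y_stride + yc + 1];
  if (ss_y) {
    sum += y_dist[(yr + 1) * y_stride + yc];
    if (ss_x) sum += y_dist[(yr + 1) * y_stride + yc + 1];
  }
  return sum;
}
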
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst, + __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) { + __m128i y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst); + y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + __m128i y_fst, y_snd; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_fst = _mm_hadd_epi32(y_fst, y_snd); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + __m128i y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = _mm_add_epi32(y_fst, y_tmp_fst); + y_snd = _mm_add_epi32(y_snd, y_tmp_snd); + } + + y_reg_snd = _mm_hadd_epi32(y_fst, y_snd); + } + + *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst); + *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd); + *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst); + *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void vp9_highbd_apply_temporal_filter_chroma_8( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + __m128i mul_fst, mul_snd; + + __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + __m128i u_sum_row_fst, v_sum_row_fst; + __m128i u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst); + 
v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[1]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + 
highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = _mm_load_si128((const __m128i *)neighbors_fst[0]); + mul_snd = _mm_load_si128((const __m128i *)neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst, + &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. 
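
/*
 * The chroma dispatch below selects the PLUS_1 / PLUS_2 / PLUS_4 neighbor
 * tables by subsampling mode. The divisor for a chroma pixel is its 3x3
 * chroma neighborhood (4, 6, or 9 values by position) plus the 1, 2, or 4
 * luma values folded in above -- a sketch of that count, consistent with
 * the constants (e.g. an interior 4:2:0 pixel gives 9 + 4 = 13):
 */
static int tf_chroma_num_values(int r, int c, int uv_w, int uv_h, int ss_x,
                                int ss_y) {
  const int rows = (r == 0 || r == uv_h - 1) ? 2 : 3;
  const int cols = (c == 0 || c == uv_w - 1) ? 2 : 3;
  return rows * cols + (1 << (ss_x + ss_y));
}
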
+static void vp9_highbd_apply_temporal_filter_chroma( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += 
uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_highbd_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd, + top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_sse4_1( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && 
"block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_highbd_apply_temporal_filter_luma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + vp9_highbd_apply_temporal_filter_chroma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h b/libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h new file mode 100644 index 00000000000..7dcedda192a --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/temporal_filter_constants.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#include "./vpx_config.h" + +// Division using multiplication and shifting. The C implementation does: +// modifier *= 3; +// modifier /= index; +// where 'modifier' is a set of summed values and 'index' is the number of +// summed values. +// +// This equation works out to (m * 3) / i which reduces to: +// m * 3/4 +// m * 1/2 +// m * 1/3 +// +// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): +// m * C / 65536 +// we can create a C to replicate the division. 
+//
+// m * 49152 / 65536 = m * 3/4
+// m * 32768 / 65536 = m * 1/2
+// m * 21846 / 65536 = m * 0.3333
+//
+// These are loaded using an instruction expecting int16_t values but are used
+// with _mm_mulhi_epu16(), which treats them as unsigned.
+#define NEIGHBOR_CONSTANT_4 (int16_t)49152
+#define NEIGHBOR_CONSTANT_5 (int16_t)39322
+#define NEIGHBOR_CONSTANT_6 (int16_t)32768
+#define NEIGHBOR_CONSTANT_7 (int16_t)28087
+#define NEIGHBOR_CONSTANT_8 (int16_t)24576
+#define NEIGHBOR_CONSTANT_9 (int16_t)21846
+#define NEIGHBOR_CONSTANT_10 (int16_t)19661
+#define NEIGHBOR_CONSTANT_11 (int16_t)17874
+#define NEIGHBOR_CONSTANT_13 (int16_t)15124
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
+  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
+  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
+};
+
+DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
+};
+
+DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
+  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
+  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
+};
+
+DECLARE_ALIGNED(16,
static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, + NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = { + NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, + NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10 +}; + +static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_1, 
MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4 +}; + +static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = { + TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4 +}; + +#if CONFIG_VP9_HIGHBITDEPTH +#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U +#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U +#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U +#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U +#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U +#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U +#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U +#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U +#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7, + HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, 
HIGHBD_NEIGHBOR_CONSTANT_6 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8, + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11, + HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10, + HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10 +}; + +DECLARE_ALIGNED(16, static const uint32_t, + HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = { + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13, + HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13 +}; + +static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1 +}; + +static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1 +}; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + 
HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = { + HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = { + HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4 + }; + +static const uint32_t + *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = { + HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4 + }; +#endif // CONFIG_VP9_HIGHBITDEPTH + +#define DIST_STRIDE ((BW) + 2) + +#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c index 460dab65938..437f49f5a0d 100644 --- a/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c +++ b/libs/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -14,96 +14,58 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/x86/temporal_filter_constants.h" -// Division using multiplication and shifting. The C implementation does: -// modifier *= 3; -// modifier /= index; -// where 'modifier' is a set of summed values and 'index' is the number of -// summed values. 'index' may be 4, 6, or 9, representing a block of 9 values -// which may be bound by the edges of the block being filtered. -// -// This equation works out to (m * 3) / i which reduces to: -// m * 3/4 -// m * 1/2 -// m * 1/3 -// -// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16): -// m * C / 65536 -// we can create a C to replicate the division. -// -// m * 49152 / 65536 = m * 3/4 -// m * 32758 / 65536 = m * 1/2 -// m * 21846 / 65536 = m * 0.3333 -// -// These are loaded using an instruction expecting int16_t values but are used -// with _mm_mulhi_epu16(), which treats them as unsigned. -#define NEIGHBOR_CONSTANT_4 (int16_t)49152 -#define NEIGHBOR_CONSTANT_6 (int16_t)32768 -#define NEIGHBOR_CONSTANT_9 (int16_t)21846 - -// Load values from 'a' and 'b'. Compute the difference squared and sum -// neighboring values such that: -// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2 -// Values to the left and right of the row are set to 0. -// The values are returned in sum_0 and sum_1 as *unsigned* 16 bit values. -static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) { - const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a); - const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b); - - const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8); - - const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16); - const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16); - - // Shift all the values one place to the left/right so we can efficiently sum - // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1]. - const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2); - const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2); - - // It becomes necessary to treat the values as unsigned at this point. The - // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point - // forward since the filter is only applied to smooth small pixel changes. - // Once the value has saturated to uint16_t it is well outside the useful - // range. 
- __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); - - *sum = sum_u16; -} +// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the +// difference squared, and store as unsigned 16-bit integer to dst. +static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a); + const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b); -static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, - __m128i *sum_1) { - const __m128i zero = _mm_setzero_si128(); - const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a); - const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b); + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); - const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8); - const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero); - const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8); - const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero); + __m128i dist_first; - const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16); - const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16); - const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16); - const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16); + dist_first = _mm_sub_epi16(a_first, b_first); + dist_first = _mm_mullo_epi16(dist_first, dist_first); - __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2); - // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8]. - __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2); + _mm_storeu_si128((__m128i *)dst, dist_first); +} - __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); +static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b, + uint16_t *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i a_reg = _mm_loadu_si128((const __m128i *)a); + const __m128i b_reg = _mm_loadu_si128((const __m128i *)b); - *sum_0 = sum_u16; + const __m128i a_first = _mm_cvtepu8_epi16(a_reg); + const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero); + const __m128i b_first = _mm_cvtepu8_epi16(b_reg); + const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero); - shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14); - shift_right = _mm_srli_si128(diff_sq_1_u16, 2); + __m128i dist_first, dist_second; - sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left); - sum_u16 = _mm_adds_epu16(sum_u16, shift_right); + dist_first = _mm_sub_epi16(a_first, b_first); + dist_second = _mm_sub_epi16(a_second, b_second); + dist_first = _mm_mullo_epi16(dist_first, dist_first); + dist_second = _mm_mullo_epi16(dist_second, dist_second); - *sum_1 = sum_u16; + _mm_storeu_si128((__m128i *)dst, dist_first); + _mm_storeu_si128((__m128i *)(dst + 8), dist_second); +} + +static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) { + *dist_reg = _mm_loadu_si128((const __m128i *)dist); +} + +static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, + __m128i *reg_second) { + read_dist_8(dist, reg_first); + read_dist_8(dist + 8, reg_second); } // Average the value based on the number of values summed (9 for pixels away @@ -111,17 +73,17 @@ static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0, // // Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply // by weight. 
-static __m128i average_8(__m128i sum, const __m128i mul_constants, - const int strength, const int rounding, - const int weight) { +static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, + const int strength, const int rounding, + const __m128i *weight) { // _mm_srl_epi16 uses the lower 64 bit value for the shift. const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i weight_u16 = *weight; const __m128i sixteen = _mm_set1_epi16(16); // modifier * 3 / index; - sum = _mm_mulhi_epu16(sum, mul_constants); + sum = _mm_mulhi_epu16(sum, *mul_constants); sum = _mm_adds_epu16(sum, rounding_u16); sum = _mm_srl_epi16(sum, strength_u128); @@ -136,34 +98,6 @@ static __m128i average_8(__m128i sum, const __m128i mul_constants, return _mm_mullo_epi16(sum, weight_u16); } -static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, - const __m128i mul_constants_0, - const __m128i mul_constants_1, const int strength, - const int rounding, const int weight) { - const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); - const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = _mm_set1_epi16(weight); - const __m128i sixteen = _mm_set1_epi16(16); - __m128i input_0, input_1; - - input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); - input_0 = _mm_adds_epu16(input_0, rounding_u16); - - input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); - input_1 = _mm_adds_epu16(input_1, rounding_u16); - - input_0 = _mm_srl_epi16(input_0, strength_u128); - input_1 = _mm_srl_epi16(input_1, strength_u128); - - input_0 = _mm_min_epu16(input_0, sixteen); - input_1 = _mm_min_epu16(input_1, sixteen); - input_0 = _mm_sub_epi16(sixteen, input_0); - input_1 = _mm_sub_epi16(sixteen, input_1); - - *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); - *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); -} - // Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, uint16_t *count, uint32_t *accumulator) { @@ -192,10 +126,10 @@ static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32); } -static void accumulate_and_store_16(const __m128i sum_0_u16, - const __m128i sum_1_u16, - const uint8_t *pred, uint16_t *count, - uint32_t *accumulator) { +static INLINE void accumulate_and_store_16(const __m128i sum_0_u16, + const __m128i sum_1_u16, + const uint8_t *pred, uint16_t *count, + uint32_t *accumulator) { const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred); const __m128i zero = _mm_setzero_si128(); __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count), @@ -235,142 +169,768 @@ static void accumulate_and_store_16(const __m128i sum_0_u16, _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32); } -void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, - const uint8_t *b, unsigned int width, - unsigned int height, int strength, - int weight, uint32_t *accumulator, - uint16_t *count) { +// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int. 
+static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) { + __m128i dist_reg, dist_left, dist_right; + + dist_reg = _mm_loadu_si128((const __m128i *)y_dist); + dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1)); + dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1)); + + *sum = _mm_adds_epu16(dist_reg, dist_left); + *sum = _mm_adds_epu16(*sum, dist_right); +} + +// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] + +// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and +// the rest in sum_second. +static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first, + __m128i *sum_second) { + get_sum_8(y_dist, sum_first); + get_sum_8(y_dist + 8, sum_second); +} + +// Read in a row of chroma values that corresponds to a row of 16 luma values. +static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist, + const uint16_t *v_dist, + __m128i *u_first, __m128i *u_second, + __m128i *v_first, + __m128i *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 16 entries from chroma. + read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries. + __m128i u_reg, v_reg; + + read_dist_8(u_dist, &u_reg); + + *u_first = _mm_unpacklo_epi16(u_reg, u_reg); + *u_second = _mm_unpackhi_epi16(u_reg, u_reg); + + read_dist_8(v_dist, &v_reg); + + *v_first = _mm_unpacklo_epi16(v_reg, v_reg); + *v_second = _mm_unpackhi_epi16(v_reg, v_reg); + } +} + +// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit +// ints in dst. +static INLINE void hadd_epu16(__m128i *src, __m128i *dst) { + const __m128i zero = _mm_setzero_si128(); + const __m128i shift_right = _mm_srli_si128(*src, 2); + + const __m128i odd = _mm_blend_epi16(shift_right, zero, 170); + const __m128i even = _mm_blend_epi16(*src, zero, 170); + + *dst = _mm_add_epi32(even, odd); +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + __m128i *u_mod, + __m128i *v_mod) { + __m128i y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + __m128i y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = _mm_adds_epu16(y_reg, y_tmp); + } + } else { + __m128i y_first, y_second; + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + __m128i y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = _mm_adds_epu16(y_first, y_tmp_0); + y_second = _mm_adds_epu16(y_second, y_tmp_1); + } + + hadd_epu16(&y_first, &y_first); + hadd_epu16(&y_second, &y_second); + + y_reg = _mm_packus_epi32(y_first, y_second); + } + + *u_mod = _mm_adds_epu16(*u_mod, y_reg); + *v_mod = _mm_adds_epu16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half.
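To make the weight layout just described concrete, here is a scalar sketch of the selection the 16-wide kernel below performs. pick_weight is an illustrative helper, not part of the patch, and the subblock order {top-left, top-right, bottom-left, bottom-right} is inferred from how the kernel consumes blk_fw:

    /* Weight applied to the pixel at (row, col) of a 16-wide luma column. */
    static int pick_weight(const int *blk_fw, int top_weight, int bottom_weight,
                           unsigned int row, unsigned int col,
                           unsigned int block_height) {
      const int bottom = row >= block_height / 2;
      if (blk_fw) {
        /* Split case: blk_fw[0]/blk_fw[1] cover columns 0-7/8-15 of the
         * top half, blk_fw[2]/blk_fw[3] the bottom half. */
        const int right = col >= 8;
        return blk_fw[2 * bottom + right];
      }
      /* Unsplit case: one weight per half (the callers pass equal halves
       * when use_whole_blk is set). */
      return bottom ? bottom_weight : top_weight;
    }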
+static void vp9_apply_temporal_filter_luma_16( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum, + uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist, + const uint16_t *v_dist, const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + __m128i weight_first, weight_second; + + __m128i mul_first, mul_second; + + __m128i sum_row_1_first, sum_row_1_second; + __m128i sum_row_2_first, sum_row_2_second; + __m128i sum_row_3_first, sum_row_3_second; + + __m128i u_first, u_second; + __m128i v_first, v_second; + + __m128i sum_row_first; + __m128i sum_row_second; + + // Loop variables unsigned int h; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; assert(strength >= 0); assert(strength <= 6); - assert(weight >= 0); - assert(weight <= 2); - - assert(width == 8 || width == 16); - - if (width == 8) { - __m128i sum_row_a, sum_row_b, sum_row_c; - __m128i mul_constants = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_8(a, b, &sum_row_a); - sum_8(a + stride, b + width, &sum_row_b); - sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_c, b, count, accumulator); - - a += stride + stride; - b += width; - count += width; - accumulator += width; - - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - - for (h = 0; h < height - 2; ++h) { - sum_8(a, b + width, &sum_row_c); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c); - sum_row_a = - average_8(sum_row_a, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a = sum_row_b; - sum_row_b = sum_row_c; + assert(block_width == 16); + + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[0]); + weight_second = _mm_set1_epi16(blk_fw[1]); + } else { + weight_first = _mm_set1_epi16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = 
_mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[2]); + weight_second = _mm_set1_epi16(blk_fw[3]); + } else { + weight_first = _mm_set1_epi16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first); + sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; } - mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b); - sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight); - accumulate_and_store_8(sum_row_a, b, count, accumulator); - - } else { // width == 16 - __m128i sum_row_a_0, sum_row_a_1; - __m128i sum_row_b_0, sum_row_b_1; - __m128i sum_row_c_0, sum_row_c_1; - __m128i mul_constants_0 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6), - mul_constants_1 = _mm_setr_epi16( - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - - sum_16(a, b, &sum_row_a_0, &sum_row_a_1); - sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1); - - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); - - a += stride + stride; 
- b += width; - count += width; - accumulator += width; - - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9, - NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6); - for (h = 0; h < height - 2; ++h) { - sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1); - - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1); - - average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator); - - a += stride; - b += width; - count += width; - accumulator += width; - - sum_row_a_0 = sum_row_b_0; - sum_row_a_1 = sum_row_b_1; - sum_row_b_0 = sum_row_c_0; - sum_row_b_1 = sum_row_c_1; + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_src += y_src_stride; + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first); + sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = _mm_adds_epu16(sum_row_first, u_first); + sum_row_second = _mm_adds_epu16(sum_row_second, u_second); + sum_row_first = _mm_adds_epu16(sum_row_first, v_first); + sum_row_second = _mm_adds_epu16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. 
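As context for the per-column neighbor tables the dispatcher below selects: each pixel's summed modifier is divided by the number of values that went into it. A scalar sketch of that count follows; luma_divisor is an illustrative helper, not part of the patch:

    /* Number of values summed into a luma pixel's modifier: its 3x3
     * neighborhood clipped to the block (4 at corners, 6 on edges, 9
     * inside), plus one U and one V distortion. The LUMA_*_COLUMN_NEIGHBORS
     * tables hold NEIGHBOR_CONSTANT_<this count> per lane so that
     * _mm_mulhi_epu16() can divide the summed modifier by it. */
    static int luma_divisor(unsigned int row, unsigned int col,
                            unsigned int width, unsigned int height) {
      const int rows = (row == 0 || row == height - 1) ? 2 : 3;
      const int cols = (col == 0 || col == width - 1) ? 2 : 3;
      return rows * cols + 2; /* + 2 for the U and V values */
    }

For example, an interior pixel has a count of 11, and NEIGHBOR_CONSTANT_11 == 17874 is round(3 * 65536 / 11), so the multiply-and-shift in average_8() reproduces modifier * 3 / 11.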
+static void vp9_apply_temporal_filter_luma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist, + const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // luma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, + block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw); } - mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6); - mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, - NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4); - sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0); - sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1); - - average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1, - strength, rounding, weight); - accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator); + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col <
mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height, + ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col, + y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col, + v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight, + bottom_weight, NULL); + } + + // Right + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + vp9_apply_temporal_filter_luma_16( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
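One more piece of context before the chroma kernel: via add_luma_dist_to_8_chroma_mod() above, each chroma modifier also receives the distortion of its co-located luma pixels, 1, 2, or 4 of them depending on ss_x/ss_y, which is what the PLUS_1/PLUS_2/PLUS_4 table variants account for. A scalar sketch; luma_dist_for_chroma is an illustrative helper, not part of the patch:

    #include <stdint.h>

    /* Total luma distortion folded into the chroma pixel whose co-located
     * luma block starts at (y_row, y_col); DIST_STRIDE as defined in
     * temporal_filter_constants.h. Adds saturate like _mm_adds_epu16(). */
    static uint16_t luma_dist_for_chroma(const uint16_t *y_dist, int y_row,
                                         int y_col, int ss_x, int ss_y) {
      uint32_t sum = y_dist[y_row * DIST_STRIDE + y_col];
      if (ss_x) sum += y_dist[y_row * DIST_STRIDE + y_col + 1];
      if (ss_y) {
        sum += y_dist[(y_row + 1) * DIST_STRIDE + y_col];
        if (ss_x) sum += y_dist[(y_row + 1) * DIST_STRIDE + y_col + 1];
      }
      return sum > 65535 ? 65535 : (uint16_t)sum;
    }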
+static void vp9_apply_temporal_filter_chroma_8( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int uv_block_width, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + __m128i weight; + + __m128i mul; + + __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3; + __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3; + + __m128i u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // Initialize weight + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], + blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); + } else { + weight = _mm_set1_epi16(top_weight); + } + + // First row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = _mm_load_si128((const __m128i *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2], + blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]); + } else { + weight = _mm_set1_epi16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3); + + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); +
accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_src += uv_src_stride; + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_src += uv_src_stride; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_src += y_src_stride * (1 + ss_y); + y_pre += y_pre_stride * (1 + ss_y); + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = _mm_load_si128((const __m128i *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2); + v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void vp9_apply_temporal_filter_chroma( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
+ assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } else { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, + top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = 
CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + vp9_apply_temporal_filter_chroma_8( + y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride, + u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col, + v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y, + strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_sse4_1( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be non-negative"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be non-negative"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be at most 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be at most 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + vp9_apply_temporal_filter_luma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr,
use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + vp9_apply_temporal_filter_chroma( + y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride, + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index dbd243ac103..2188903b173 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -14,6 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" @@ -170,452 +171,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, fadst4_sse2(in); write_buffer_4x4(output, in); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: assert(0); break; - } -} - -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { - __m128i zero; - int pass; - - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - __m128i *in[8]; - int index = 0; - - (void)scan_ptr; - (void)coeff_ptr; - - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - in[0] = &in0; - in[1] = &in1; - in[2] = &in2; - in[3] = &in3; - in[4] = &in4; - in[5] = &in5; - in[6] = &in6; - in[7] = &in7; - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. 
- __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = 
_mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. 
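// The in-register transpose below uses three rounds of pairwise interleaving
// (epi16, then epi32, then epi64 unpacks) -- log2(8) stages of the classic
// radix-2 shuffle network -- to avoid any loads or stores. A plain-C sketch
// of the end result, with an illustrative sketch_ name:
#include <stdint.h>

static void sketch_transpose_8x8(int16_t m[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = r + 1; c < 8; ++c) {
      const int16_t t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}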
- { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - } - - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; - { - 
__m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = *in[0]; - coeff1 = *in[1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - index = 2; - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); - coeff0 = *in[index]; - coeff1 = *in[index + 1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = 
_mm_mullo_epi16(qcoeff1, dequant); - - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - index += 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; } } @@ -1097,14 +659,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: assert(0); break; } } @@ -1963,13 +1525,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: assert(0); break; } } diff --git a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c deleted file mode 100644 index bf874a09ec5..00000000000 --- a/libs/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ /dev/null @@ -1,465 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <assert.h> -#include <tmmintrin.h> // SSSE3 - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" -#include "vpx_dsp/vpx_dsp_common.h" -#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" - -void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; - int pass; - - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - __m128i *in[8]; - int index = 0; - - (void)scan_ptr; - (void)coeff_ptr; - - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - in[0] = &in0; - in[1] = &in1; - in[2] = &in2; - in[3] = &in3; - in[4] = &in4; - in[5] = &in5; - in[6] = &in6; - in[7] = &in7; - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose.
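// Why dual_set_epi16(23170, 23170) above: 23170 / 32768 ~= cos(pi/4), so the
// SSSE3 path folds the multiply-by-cospi_16_64 butterfly into a single
// _mm_mulhrs_epi16, which computes (x * k + 16384) >> 15 per lane -- the same
// rounded product the SSE2 version needs an unpack/madd/add/shift/pack
// sequence for. A scalar sketch assuming cospi_16_64 == 11585 (libvpx's 2^14
// scale, so 23170 == 2 * 11585); the sketch_ names are illustrative only.
#include <stdint.h>

static int16_t sketch_mulhrs(int16_t x, int16_t k) {
  // Matches the per-lane semantics of _mm_mulhrs_epi16.
  return (int16_t)(((int32_t)x * k + (1 << 14)) >> 15);
}

static int16_t sketch_cospi16_sse2_style(int16_t x) {
  // The madd + DCT_CONST_ROUNDING + srai route: (x * 11585 + 8192) >> 14.
  return (int16_t)(((int32_t)x * 11585 + (1 << 13)) >> 14);
}
// The two agree exactly for 16-bit inputs, since
// (2 * x * 11585 + 16384) >> 15 == (x * 11585 + 8192) >> 14.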
- __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_sub_epi16(q6, q5); - const __m128i d1 = _mm_add_epi16(q6, q5); - const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16); - const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16); - - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = 
_mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 
65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - } - - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant, thr; - int16_t nzflag; - { - __m128i coeff0, coeff1; - - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = *in[0]; - coeff1 = *in[1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, 
nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - index = 2; - thr = _mm_srai_epi16(dequant, 1); - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1); - coeff0 = *in[index]; - coeff1 = *in[index + 1]; - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - // Maybe a more efficient way to store 0? - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - index += 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; - } -} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c 
b/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c index 2f3c66c0835..aa46c5889d3 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c @@ -160,7 +160,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, } // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); + v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); // Shift right to keep the sign bit clear, we will use this later // to set the cost to the maximum value. v_outside_d = _mm_srli_epi32(v_outside_d, 1); diff --git a/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c index 91f627c343a..d7aafe7b011 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -11,27 +11,28 @@ #include <emmintrin.h> #include <stdio.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { int i, j, test; uint32_t temp[4]; __m128i max, min, cmp0, cmp1, cmp2, cmp3; int64_t error = 0, sqcoeff = 0; - const int shift = 2 * (bps - 8); + const int shift = 2 * (bd - 8); const int rounding = shift > 0 ? 1 << (shift - 1) : 0; for (i = 0; i < block_size; i += 8) { // Load the data into xmm registers - __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); - __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); - __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); - __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4)); // Check if any values require more than 15 bit max = _mm_set1_epi32(0x3fff); - min = _mm_set1_epi32(0xffffc000); + min = _mm_set1_epi32((int32_t)0xffffc000); cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max), _mm_cmplt_epi32(mm_coeff, min)); cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max), diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 00000000000..8dfdbd50f6c --- /dev/null +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +// Zero fill 8 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as + // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. + const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); +#else + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); +#endif + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + (void)scan; + (void)skip_block; + assert(!skip_block); + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = _mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = load_tran_low(coeff_ptr + n_coeffs); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + } + + eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + 
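// The three instructions above are the whole branch-free quantizer:
// _mm256_abs_epi16 gives |c|, _mm256_adds_epi16 adds the rounding term with
// saturation, and _mm256_mulhi_epi16 keeps the high 16 bits of the product,
// i.e. q = ((|c| + round) * quant) >> 16 per lane; _mm256_sign_epi16 then
// restores the sign of the original coefficient (and zeroes lanes where the
// coefficient was zero). An illustrative scalar equivalent:
//   q = (int16_t)((((int32_t)abs(c) + round) * quant) >> 16);
//   q = (c < 0) ? -q : (c == 0) ? 0 : q;
// scan_eob_256 above turns the result into an end-of-block position by
// masking iscan + 1 with the nonzero lanes and max-reducing, yielding the
// count of coefficients up to and including the last nonzero one.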
store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + eob256 = _mm256_max_epi16( + eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = _mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index ca0ad4407e5..885220a7129 100644 --- a/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libs/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -21,20 +21,20 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i zero; __m128i thr; int16_t nzflag; __m128i eob; __m128i round, quant, dequant; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -100,8 +100,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -175,8 +175,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/libs/libvpx/vp9/vp9_common.mk b/libs/libvpx/vp9/vp9_common.mk index 5bfc0d3599f..c9a55669e16 100644 --- a/libs/libvpx/vp9/vp9_common.mk +++ b/libs/libvpx/vp9/vp9_common.mk @@ -63,30 +63,36 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c -ifeq ($(CONFIG_VP9_POSTPROC),yes) -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm -endif ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += 
common/mips/dspr2/vp9_itrans16_dspr2.c -endif +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH -# common (msa) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h ifeq ($(CONFIG_VP9_POSTPROC),yes) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm endif -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c - ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c +else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/libs/libvpx/vp9/vp9_cx_iface.c b/libs/libvpx/vp9/vp9_cx_iface.c index 881caae78bf..45e03f2def3 100644 --- a/libs/libvpx/vp9/vp9_cx_iface.c +++ b/libs/libvpx/vp9/vp9_cx_iface.c @@ -15,6 +15,7 @@ #include "vpx/vpx_encoder.h" #include "vpx_ports/vpx_once.h" #include "vpx_ports/system_state.h" +#include "vpx_util/vpx_timestamp.h" #include "vpx/internal/vpx_codec_internal.h" #include "./vpx_version.h" #include "vp9/encoder/vp9_encoder.h" @@ -30,6 +31,7 @@ struct vp9_extracfg { unsigned int static_thresh; unsigned int tile_columns; unsigned int tile_rows; + unsigned int enable_tpl_model; unsigned int arnr_max_frames; unsigned int arnr_strength; unsigned int min_gf_interval; @@ -63,6 +65,7 @@ static struct vp9_extracfg default_extra_cfg = { 0, // static_thresh 6, // tile_columns 0, // tile_rows + 1, // enable_tpl_model 7, // arnr_max_frames 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision @@ -92,6 +95,9 @@ struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_enc_cfg_t cfg; struct vp9_extracfg extra_cfg; + vpx_rational64_t timestamp_ratio; + vpx_codec_pts_t pts_offset; + unsigned char pts_offset_initialized; VP9EncoderConfig oxcf; VP9_COMP *cpi; unsigned char *cx_data; @@ -128,10 +134,10 @@ static vpx_codec_err_t update_error_state( return VPX_CODEC_INVALID_PARAM; \ } while (0) -#define RANGE_CHECK(p, memb, lo, hi) \ - do { \ - if (!(((p)->memb == lo || (p)->memb 
> (lo)) && (p)->memb <= hi)) \ - ERROR(#memb " out of range [" #lo ".." #hi "]"); \ +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ } while (0) #define RANGE_CHECK_HI(p, memb, hi) \ @@ -149,6 +155,22 @@ static vpx_codec_err_t update_error_state( if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \ } while (0) +#if defined(_MSC_VER) +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + char compile_time_assert[(boolexp) ? 1 : -1]; \ + (void)compile_time_assert; \ + } while (0) +#else // !_MSC_VER +#define COMPILE_TIME_ASSERT(boolexp) \ + do { \ + struct { \ + unsigned int compile_time_assert : (boolexp) ? 1 : -1; \ + } compile_time_assert; \ + (void)compile_time_assert; \ + } while (0) +#endif // _MSC_VER + static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { @@ -237,22 +259,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("ts_rate_decimator factors are not powers of 2"); } -#if CONFIG_SPATIAL_SVC - - if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) && - cfg->g_pass == VPX_RC_LAST_PASS) { - unsigned int i, alt_ref_sum = 0; - for (i = 0; i < cfg->ss_number_layers; ++i) { - if (cfg->ss_enable_auto_alt_ref[i]) ++alt_ref_sum; - } - if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) - ERROR("Not enough ref buffers for svc alt ref frames"); - if (cfg->ss_number_layers * cfg->ts_number_layers > 3 && - cfg->g_error_resilient == 0) - ERROR("Multiple frame context are not supported for more than 3 layers"); - } -#endif - // VP9 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && @@ -263,8 +269,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, row_mt, 0, 1); RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); - RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -8, 8); + RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, MAX_ARF_LAYERS); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -277,10 +283,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, content, VP9E_CONTENT_DEFAULT, VP9E_CONTENT_INVALID - 1); - // TODO(yaowu): remove this when ssim tuning is implemented for vp9 - if (extra_cfg->tuning == VP8_TUNE_SSIM) - ERROR("Option --tune=ssim is not currently supported in VP9."); - #if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); @@ -560,6 +562,8 @@ static vpx_codec_err_t set_encoder_config( oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + // TODO(yunqing): The dependencies between row tiles cause error in multi- // threaded encoding. For now, tile_rows is forced to be 0 in this case. 
// The further fix can be done by adding synchronizations after a tile row @@ -589,9 +593,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; -#endif for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; @@ -599,9 +600,6 @@ static vpx_codec_err_t set_encoder_config( } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref; -#endif } if (oxcf->ts_number_layers > 1) { for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) { @@ -716,7 +714,10 @@ static vpx_codec_err_t update_extra_cfg(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_cpuused(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; + // Use fastest speed setting (speed 9 or -9) if it's set beyond the range. extra_cfg.cpu_used = CAST(VP8E_SET_CPUUSED, args); + extra_cfg.cpu_used = VPXMIN(9, extra_cfg.cpu_used); + extra_cfg.cpu_used = VPXMAX(-9, extra_cfg.cpu_used); return update_extra_cfg(ctx, &extra_cfg); } @@ -762,6 +763,13 @@ static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(VP9E_SET_TPL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -809,7 +817,7 @@ static vpx_codec_err_t ctrl_set_rc_max_inter_bitrate_pct( vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.rc_max_inter_bitrate_pct = - CAST(VP8E_SET_MAX_INTER_BITRATE_PCT, args); + CAST(VP9E_SET_MAX_INTER_BITRATE_PCT, args); return update_extra_cfg(ctx, &extra_cfg); } @@ -926,6 +934,12 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, res = validate_config(priv, &priv->cfg, &priv->extra_cfg); if (res == VPX_CODEC_OK) { + priv->pts_offset_initialized = 0; + priv->timestamp_ratio.den = priv->cfg.g_timebase.den; + priv->timestamp_ratio.num = (int64_t)priv->cfg.g_timebase.num; + priv->timestamp_ratio.num *= TICKS_PER_SEC; + reduce_ratio(&priv->timestamp_ratio); + set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg); #if CONFIG_VP9_HIGHBITDEPTH priv->oxcf.use_highbitdepth = @@ -962,12 +976,14 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { - const vpx_codec_enc_cfg_t *const cfg = &ctx->cfg; - // Convert duration parameter from stream timebase to microseconds. - const uint64_t duration_us = (uint64_t)duration * 1000000 * - (uint64_t)cfg->g_timebase.num / - (uint64_t)cfg->g_timebase.den; + uint64_t duration_us; + + COMPILE_TIME_ASSERT(TICKS_PER_SEC > 1000000 && + (TICKS_PER_SEC % 1000000) == 0); + + duration_us = duration * (uint64_t)ctx->timestamp_ratio.num / + (ctx->timestamp_ratio.den * (TICKS_PER_SEC / 1000000)); // If the deadline is more that the duration this frame is to be shown, // use good quality mode. Otherwise use realtime mode. 
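// A worked sketch of the timestamp_ratio conversion set up above, assuming
// libvpx's TICKS_PER_SEC is 10000000 (the COMPILE_TIME_ASSERT above only
// guarantees it exceeds and divides evenly by 1000000): the reduced ratio
// num/den maps stream-timebase units to internal ticks, and duration_us
// scales the tick rate back down to microseconds. The sketch_ names are
// illustrative only.
#include <stdint.h>

#define SKETCH_TICKS_PER_SEC 10000000

typedef struct {
  int64_t num;
  int64_t den;
} sketch_rational64_t;

static uint64_t sketch_duration_us(uint64_t duration,
                                   const sketch_rational64_t *ratio) {
  return duration * (uint64_t)ratio->num /
         (uint64_t)(ratio->den * (SKETCH_TICKS_PER_SEC / 1000000));
}
// E.g. a 1/30 s timebase gives num = 10000000, den = 30, reduced to
// num = 1000000, den = 3, so one timebase unit maps to
// 1000000 / (3 * 10) = 33333 microseconds, as expected for 1/30 s.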
@@ -1051,15 +1067,16 @@ static int write_superframe_index(vpx_codec_alg_priv_t *ctx) { return index_sz; } -static int64_t timebase_units_to_ticks(const vpx_rational_t *timebase, +static int64_t timebase_units_to_ticks(const vpx_rational64_t *timestamp_ratio, int64_t n) { - return n * TICKS_PER_SEC * timebase->num / timebase->den; + return n * timestamp_ratio->num / timestamp_ratio->den; } -static int64_t ticks_to_timebase_units(const vpx_rational_t *timebase, +static int64_t ticks_to_timebase_units(const vpx_rational64_t *timestamp_ratio, int64_t n) { - const int64_t round = (int64_t)TICKS_PER_SEC * timebase->num / 2 - 1; - return (n * timebase->den + round) / timebase->num / TICKS_PER_SEC; + int64_t round = timestamp_ratio->num / 2; + if (round > 0) --round; + return (n * timestamp_ratio->den + round) / timestamp_ratio->num; } static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, @@ -1067,12 +1084,11 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, vpx_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame)) + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) flags |= VPX_FRAME_IS_KEY; if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; @@ -1083,37 +1099,26 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, const size_t kMinCompressedSize = 8192; static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, const vpx_image_t *img, - vpx_codec_pts_t pts, + vpx_codec_pts_t pts_val, unsigned long duration, vpx_enc_frame_flags_t enc_flags, unsigned long deadline) { volatile vpx_codec_err_t res = VPX_CODEC_OK; volatile vpx_enc_frame_flags_t flags = enc_flags; + volatile vpx_codec_pts_t pts = pts_val; VP9_COMP *const cpi = ctx->cpi; - const vpx_rational_t *const timebase = &ctx->cfg.g_timebase; + const vpx_rational64_t *const timestamp_ratio = &ctx->timestamp_ratio; size_t data_sz; if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && !cpi->level_constraint.rc_config_updated) { - SVC *const svc = &cpi->svc; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; FIRSTPASS_STATS *stats = &twopass->total_stats; - if (is_two_pass_svc) { - const double frame_rate = 10000000.0 * stats->count / stats->duration; - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); cpi->level_constraint.rc_config_updated = 1; } @@ -1123,7 +1128,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * - (cpi->multi_arf_allowed ? 8 : 2); + (cpi->multi_layer_arf ? 
8 : 2); if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { ctx->cx_data_sz = data_sz; @@ -1136,6 +1141,12 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, } } + if (!ctx->pts_offset_initialized) { + ctx->pts_offset = pts; + ctx->pts_offset_initialized = 1; + } + pts -= ctx->pts_offset; + pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -1168,12 +1179,15 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (res == VPX_CODEC_OK) { unsigned int lib_flags = 0; YV12_BUFFER_CONFIG sd; - int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts); + int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts); int64_t dst_end_time_stamp = - timebase_units_to_ticks(timebase, pts + duration); + timebase_units_to_ticks(timestamp_ratio, pts + duration); size_t size, cx_data_sz; unsigned char *cx_data; + cpi->svc.timebase_fac = timebase_units_to_ticks(timestamp_ratio, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; @@ -1213,34 +1227,31 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { - if (size) { + if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) { vpx_codec_cx_pkt_t pkt; -#if CONFIG_SPATIAL_SVC - if (cpi->use_svc) - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers] - .layer_size += size; -#endif - // Pack invisible frames with the next visible frame if (!cpi->common.show_frame || (cpi->use_svc && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_magnitude |= size; cx_data += size; cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; pkt.data.frame.pts = - ticks_to_timebase_units(timebase, dst_time_stamp); + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timebase, dst_end_time_stamp - dst_time_stamp); + timestamp_ratio, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); pkt.data.frame.buf = ctx->pending_cx_data; pkt.data.frame.sz = size; @@ -1256,13 +1267,19 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // Add the frame packet to the list of returned packets. 
pkt.kind = VPX_CODEC_CX_FRAME_PKT; - pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp); + pkt.data.frame.pts = + ticks_to_timebase_units(timestamp_ratio, dst_time_stamp) + + ctx->pts_offset; pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( - timebase, dst_end_time_stamp - dst_time_stamp); + timestamp_ratio, dst_end_time_stamp - dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; if (ctx->pending_cx_data) { - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_magnitude |= size; ctx->pending_cx_data_sz += size; // write the superframe only for the case when @@ -1288,27 +1305,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { - vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; - int sl; - vp9_zero(pkt_sizes); - vp9_zero(pkt_psnr); - pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; - pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR; - for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - LAYER_CONTEXT *lc = - &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers]; - pkt_sizes.data.layer_sizes[sl] = lc->layer_size; - pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt; - lc->layer_size = 0; - } - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes); - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); - } -#endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. 
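/* Illustrative aside, not part of the patch: the pts_offset bookkeeping in
 * encoder_encode() normalizes input timestamps so internal time starts at
 * zero, then restores the offset on every output packet. A standalone
 * sketch of that round trip (names mirror the patch): */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
  int64_t pts_offset;
  int pts_offset_initialized;
} pts_state;

static int64_t normalize_pts(pts_state *s, int64_t pts) {
  if (!s->pts_offset_initialized) {
    s->pts_offset = pts; /* the first frame's pts becomes the epoch */
    s->pts_offset_initialized = 1;
  }
  return pts - s->pts_offset; /* internal pts now starts at 0 */
}

int main(void) {
  pts_state s = { 0, 0 };
  const int64_t in_pts[] = { 90000, 93000, 96000 }; /* e.g. a 90 kHz clock */
  int i;
  for (i = 0; i < 3; ++i) {
    const int64_t internal = normalize_pts(&s, in_pts[i]);
    /* on output, ticks_to_timebase_units(...) + pts_offset restores it */
    printf("in=%" PRId64 " internal=%" PRId64 " out=%" PRId64 "\n",
           in_pts[i], internal, internal + s.pts_offset);
  }
  return 0;
}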
@@ -1338,9 +1334,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, @@ -1354,9 +1349,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vp9_copy_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, @@ -1364,14 +1358,13 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + const int fb_idx = ctx->cpi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->cpi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; - yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, @@ -1381,9 +1374,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; #else (void)ctx; (void)args; @@ -1405,17 +1397,24 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; - } else { - return NULL; } + return NULL; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { - (void)ctx; - (void)args; + vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); + + if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; - // TODO(yaowu): Need to re-implement and test for VP9. + if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame)) { + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_INVALID_PARAM; } @@ -1427,11 +1426,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, @@ -1442,11 +1440,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, @@ -1458,9 +1455,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, (VPX_SCALING)mode->v_scaling_mode); return (res == 0) ? 
          VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
-  } else {
-    return VPX_CODEC_INVALID_PARAM;
   }
+  return VPX_CODEC_INVALID_PARAM;
 }
 
 static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) {
@@ -1491,22 +1487,23 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx,
   vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *);
   VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi;
   SVC *const svc = &cpi->svc;
+  int sl;
 
-  svc->first_spatial_layer_to_encode = data->spatial_layer_id;
   svc->spatial_layer_to_encode = data->spatial_layer_id;
+  svc->first_spatial_layer_to_encode = data->spatial_layer_id;
+  // TODO(jianj): Deprecated to be removed.
   svc->temporal_layer_id = data->temporal_layer_id;
+  // Allow for setting temporal layer per spatial layer for superframe.
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
+    svc->temporal_layer_id_per_spatial[sl] =
+        data->temporal_layer_id_per_spatial[sl];
+  }
   // Checks on valid layer_id input.
   if (svc->temporal_layer_id < 0 ||
       svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) {
     return VPX_CODEC_INVALID_PARAM;
   }
-  if (svc->first_spatial_layer_to_encode < 0 ||
-      svc->first_spatial_layer_to_encode >= (int)ctx->cfg.ss_number_layers) {
-    return VPX_CODEC_INVALID_PARAM;
-  }
-  // First spatial layer to encode not implemented for two-pass.
-  if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0)
-    return VPX_CODEC_INVALID_PARAM;
+
   return VPX_CODEC_OK;
 }
 
@@ -1546,20 +1543,87 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
+  int sl;
+  for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) {
+    data->update_buffer_slot[sl] = cpi->svc.update_buffer_slot[sl];
+    data->reference_last[sl] = cpi->svc.reference_last[sl];
+    data->reference_golden[sl] = cpi->svc.reference_golden[sl];
+    data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl];
+    data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl];
+    data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl];
+    data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl];
+    // TODO(jianj): Remove these 3, deprecated.
+    data->update_last[sl] = cpi->svc.update_last[sl];
+    data->update_golden[sl] = cpi->svc.update_golden[sl];
+    data->update_alt_ref[sl] = cpi->svc.update_altref[sl];
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
                                                      va_list args) {
   VP9_COMP *const cpi = ctx->cpi;
   vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
   int sl;
+  cpi->svc.use_set_ref_frame_config = 1;
   for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
-    cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl];
-    cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl];
-    cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl];
-    cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl];
+    cpi->svc.update_buffer_slot[sl] = data->update_buffer_slot[sl];
+    cpi->svc.reference_last[sl] = data->reference_last[sl];
+    cpi->svc.reference_golden[sl] = data->reference_golden[sl];
+    cpi->svc.reference_altref[sl] = data->reference_alt_ref[sl];
+    cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl];
+    cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl];
+    cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl];
+    cpi->svc.duration[sl] = data->duration[sl];
   }
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_svc_inter_layer_pred(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  const int data = va_arg(args, int);
+  VP9_COMP *const cpi = ctx->cpi;
+  cpi->svc.disable_inter_layer_pred = data;
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx,
+                                                     va_list args) {
+  VP9_COMP *const cpi = ctx->cpi;
+  vpx_svc_frame_drop_t *data = va_arg(args, vpx_svc_frame_drop_t *);
+  int sl;
+  cpi->svc.framedrop_mode = data->framedrop_mode;
+  for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl)
+    cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl];
+  // Don't allow max_consec_drop values below 1.
+ cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_gf_temporal_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->svc.use_gf_temporal_ref = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_spatial_layer_sync( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_spatial_layer_sync_t *data = + va_arg(args, vpx_svc_spatial_layer_sync_t *); + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.spatial_layer_sync[sl] = data->spatial_layer_sync[sl]; + cpi->svc.set_intra_only_frame = data->base_layer_intra_only; + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp = @@ -1600,13 +1664,21 @@ static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.ext_use_post_encode_drop = data; + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_previewpp }, - { VP8E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_CPUUSED, ctrl_set_cpuused }, @@ -1615,6 +1687,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows }, + { VP9E_SET_TPL, ctrl_set_tpl_model }, { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type }, @@ -1642,7 +1715,12 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop }, { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred }, + { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer }, + { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref }, + { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -1651,6 +1729,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, { -1, NULL }, }; @@ -1659,7 +1738,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, { // NOLINT - 0, // g_usage + 0, // g_usage (unused) 8, // g_threads 0, // g_profile diff --git a/libs/libvpx/vp9/vp9_dx_iface.c b/libs/libvpx/vp9/vp9_dx_iface.c index 657490f4bd3..fa79f7aedc2 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.c +++ b/libs/libvpx/vp9/vp9_dx_iface.c @@ -97,7 +97,7 @@ static vpx_codec_err_t 
decoder_peek_si_internal( const uint8_t *data, unsigned int data_sz, vpx_codec_stream_info_t *si, int *is_intra_only, vpx_decrypt_cb decrypt_cb, void *decrypt_state) { int intra_only_flag = 0; - uint8_t clear_buffer[10]; + uint8_t clear_buffer[11]; if (data + data_sz <= data) return VPX_CODEC_INVALID_PARAM; @@ -158,6 +158,9 @@ static vpx_codec_err_t decoder_peek_si_internal( if (profile > PROFILE_0) { if (!parse_bitdepth_colorspace_sampling(profile, &rb)) return VPX_CODEC_UNSUP_BITSTREAM; + // The colorspace info may cause vp9_read_frame_size() to need 11 + // bytes. + if (data_sz < 11) return VPX_CODEC_UNSUP_BITSTREAM; } rb.bit_offset += REF_FRAMES; // refresh_frame_flags vp9_read_frame_size(&rb, (int *)&si->w, (int *)&si->h); @@ -235,6 +238,19 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == (lo) || (p)->memb > (lo)) && (p)->memb <= (hi))) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->last_show_frame = -1; ctx->need_resync = 1; @@ -251,6 +267,12 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi->max_threads = ctx->cfg.threads; ctx->pbi->inv_tile_order = ctx->invert_tile_order; + RANGE_CHECK(ctx, row_mt, 0, 1); + ctx->pbi->row_mt = ctx->row_mt; + + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; + // If postprocessing was enabled by the application and a // configuration has not been provided, default it. if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) @@ -452,8 +474,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); if (data) { - YV12_BUFFER_CONFIG *fb; - fb = get_ref_frame(&ctx->pbi->common, data->idx); + const int fb_idx = ctx->pbi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->pbi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -632,6 +654,20 @@ static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, int); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -643,6 +679,8 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, + { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, // Getters { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/libs/libvpx/vp9/vp9_dx_iface.h b/libs/libvpx/vp9/vp9_dx_iface.h index 18bc7ab0d62..f60688c4db2 100644 --- a/libs/libvpx/vp9/vp9_dx_iface.h +++ b/libs/libvpx/vp9/vp9_dx_iface.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_VP9_DX_IFACE_H_ -#define VP9_VP9_DX_IFACE_H_ +#ifndef VPX_VP9_VP9_DX_IFACE_H_ +#define VPX_VP9_VP9_DX_IFACE_H_ #include "vp9/decoder/vp9_decoder.h" @@ -45,6 +45,8 @@ struct vpx_codec_alg_priv { // Allow for decoding up to a given spatial layer for SVC stream. int svc_decoding; int svc_spatial_layer; + int row_mt; + int lpf_opt; }; -#endif // VP9_VP9_DX_IFACE_H_ +#endif // VPX_VP9_VP9_DX_IFACE_H_ diff --git a/libs/libvpx/vp9/vp9_iface_common.h b/libs/libvpx/vp9/vp9_iface_common.h index d68872750b9..a1921db636c 100644 --- a/libs/libvpx/vp9/vp9_iface_common.h +++ b/libs/libvpx/vp9/vp9_iface_common.h @@ -7,17 +7,17 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_VP9_IFACE_COMMON_H_ -#define VP9_VP9_IFACE_COMMON_H_ +#ifndef VPX_VP9_VP9_IFACE_COMMON_H_ +#define VPX_VP9_VP9_IFACE_COMMON_H_ #include "vpx_ports/mem.h" static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ int bps; if (!yv12->subsampling_y) { if (!yv12->subsampling_x) { @@ -142,4 +142,4 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { assert(0 && "Invalid Reference Frame"); return VP9_LAST_FLAG; } -#endif // VP9_VP9_IFACE_COMMON_H_ +#endif // VPX_VP9_VP9_IFACE_COMMON_H_ diff --git a/libs/libvpx/vp9/vp9cx.mk b/libs/libvpx/vp9/vp9cx.mk index d633ed1429d..736ff017064 100644 --- a/libs/libvpx/vp9/vp9cx.mk +++ b/libs/libvpx/vp9/vp9cx.mk @@ -64,6 +64,7 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rd.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c +VP9_CX_SRCS-yes += encoder/vp9_partition_models.h VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_speed_features.c @@ -74,6 +75,7 @@ VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c VP9_CX_SRCS-yes += encoder/vp9_resize.c VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c @@ -101,11 +103,14 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -116,7 +121,6 @@ VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_dct_ssse3.c VP9_CX_SRCS-$(HAVE_SSSE3) += 
encoder/x86/vp9_frame_scale_ssse3.c ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes) @@ -129,20 +133,34 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif -VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct4x4_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +endif # !CONFIG_VP9_HIGHBITDEPTH + +VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c # Strip unnecessary files with CONFIG_REALTIME_ONLY VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_360.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_complexity.h VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libs/libvpx/vp9/vp9dx.mk b/libs/libvpx/vp9/vp9dx.mk index 59f612b94c9..93a5f368bdf 100644 --- a/libs/libvpx/vp9/vp9dx.mk +++ b/libs/libvpx/vp9/vp9dx.mk @@ -28,5 +28,7 @@ VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h +VP9_DX_SRCS-yes += decoder/vp9_job_queue.c +VP9_DX_SRCS-yes += decoder/vp9_job_queue.h VP9_DX_SRCS-yes := $(filter-out $(VP9_DX_SRCS_REMOVE-yes),$(VP9_DX_SRCS-yes)) diff --git a/libs/libvpx/vpx/exports_spatial_svc b/libs/libvpx/vpx/exports_spatial_svc deleted file mode 100644 index d258a1d6181..00000000000 --- a/libs/libvpx/vpx/exports_spatial_svc +++ /dev/null @@ -1,6 +0,0 @@ -text vpx_svc_dump_statistics -text vpx_svc_encode -text vpx_svc_get_message -text vpx_svc_init -text vpx_svc_release -text vpx_svc_set_options diff --git a/libs/libvpx/vpx/internal/vpx_codec_internal.h b/libs/libvpx/vpx/internal/vpx_codec_internal.h index 522e5c1684a..9eed85e5de7 100644 --- a/libs/libvpx/vpx/internal/vpx_codec_internal.h +++ b/libs/libvpx/vpx/internal/vpx_codec_internal.h @@ -40,8 +40,8 @@ * Once initialized, the instance is manged using other functions from * the vpx_codec_* family. 
*/ -#ifndef VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ -#define VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#ifndef VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#define VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ #include "../vpx_decoder.h" #include "../vpx_encoder.h" #include @@ -442,4 +442,4 @@ void vpx_internal_error(struct vpx_internal_error_info *info, } // extern "C" #endif -#endif // VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ +#endif // VPX_VPX_INTERNAL_VPX_CODEC_INTERNAL_H_ diff --git a/libs/libvpx/vpx/src/vpx_encoder.c b/libs/libvpx/vpx/src/vpx_encoder.c index 1cf2dca695a..c227ee902d9 100644 --- a/libs/libvpx/vpx/src/vpx_encoder.c +++ b/libs/libvpx/vpx/src/vpx_encoder.c @@ -20,7 +20,7 @@ #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" -#define SAVE_STATUS(ctx, var) (ctx ? (ctx->err = var) : var) +#define SAVE_STATUS(ctx, var) ((ctx) ? ((ctx)->err = (var)) : (var)) static vpx_codec_alg_priv_t *get_alg_priv(vpx_codec_ctx_t *ctx) { return (vpx_codec_alg_priv_t *)ctx->priv; @@ -82,6 +82,9 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( res = VPX_CODEC_INCAPABLE; else { int i; +#if CONFIG_MULTI_RES_ENCODING + int mem_loc_owned = 0; +#endif void *mem_loc = NULL; if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; @@ -101,12 +104,6 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( mr_cfg.mr_down_sampling_factor.num = dsf->num; mr_cfg.mr_down_sampling_factor.den = dsf->den; - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - ctx->iface = iface; ctx->name = iface->name; ctx->priv = NULL; @@ -129,13 +126,17 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( i--; } #if CONFIG_MULTI_RES_ENCODING - assert(mem_loc); - free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); - free(mem_loc); + if (!mem_loc_owned) { + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); + } #endif return SAVE_STATUS(ctx, res); } - +#if CONFIG_MULTI_RES_ENCODING + mem_loc_owned = 1; +#endif ctx++; cfg++; dsf++; @@ -154,7 +155,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, vpx_codec_enc_cfg_map_t *map; int i; - if (!iface || !cfg || usage > INT_MAX) + if (!iface || !cfg || usage != 0) res = VPX_CODEC_INVALID_PARAM; else if (!(iface->caps & VPX_CODEC_CAP_ENCODER)) res = VPX_CODEC_INCAPABLE; @@ -163,12 +164,9 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, for (i = 0; i < iface->enc.cfg_map_count; ++i) { map = iface->enc.cfg_maps + i; - if (map->usage == (int)usage) { - *cfg = map->cfg; - cfg->g_usage = usage; - res = VPX_CODEC_OK; - break; - } + *cfg = map->cfg; + res = VPX_CODEC_OK; + break; } } diff --git a/libs/libvpx/vpx/vp8.h b/libs/libvpx/vpx/vp8.h index 059c9d0f656..f30dafed585 100644 --- a/libs/libvpx/vpx/vp8.h +++ b/libs/libvpx/vpx/vp8.h @@ -10,7 +10,7 @@ /*!\defgroup vp8 VP8 * \ingroup codecs - * VP8 is vpx's newest video compression algorithm that uses motion + * VP8 is a video compression algorithm that uses motion * compensated prediction, Discrete Cosine Transform (DCT) coding of the * prediction error signal and context dependent entropy coding techniques * based on arithmetic principles. It features: @@ -27,8 +27,8 @@ /*!\file * \brief Provides controls common to both the VP8 encoder and decoder. 
*/ -#ifndef VPX_VP8_H_ -#define VPX_VP8_H_ +#ifndef VPX_VPX_VP8_H_ +#define VPX_VPX_VP8_H_ #include "./vpx_codec.h" #include "./vpx_image.h" @@ -47,10 +47,6 @@ enum vp8_com_control_id { VP8_SET_REFERENCE = 1, VP8_COPY_REFERENCE = 2, /**< get a copy of reference frame from the decoder */ VP8_SET_POSTPROC = 3, /**< set the decoder's post processing settings */ - VP8_SET_DBG_COLOR_REF_FRAME = 4, /**< \deprecated */ - VP8_SET_DBG_COLOR_MB_MODES = 5, /**< \deprecated */ - VP8_SET_DBG_COLOR_B_MODES = 6, /**< \deprecated */ - VP8_SET_DBG_DISPLAY_MV = 7, /**< \deprecated */ /* TODO(jkoleszar): The encoder incorrectly reuses some of these values (5+) * for its control ids. These should be migrated to something like the @@ -70,12 +66,7 @@ enum vp8_postproc_level { VP8_DEBLOCK = 1 << 0, VP8_DEMACROBLOCK = 1 << 1, VP8_ADDNOISE = 1 << 2, - VP8_DEBUG_TXT_FRAME_INFO = 1 << 3, /**< print frame information */ - VP8_DEBUG_TXT_MBLK_MODES = - 1 << 4, /**< print macro block modes over each macro block */ - VP8_DEBUG_TXT_DC_DIFF = 1 << 5, /**< print dc diff for each macro block */ - VP8_DEBUG_TXT_RATE_INFO = 1 << 6, /**< print video rate info (encoder only) */ - VP8_MFQE = 1 << 10 + VP8_MFQE = 1 << 3 }; /*!\brief post process flags @@ -132,14 +123,6 @@ VPX_CTRL_USE_TYPE(VP8_COPY_REFERENCE, vpx_ref_frame_t *) #define VPX_CTRL_VP8_COPY_REFERENCE VPX_CTRL_USE_TYPE(VP8_SET_POSTPROC, vp8_postproc_cfg_t *) #define VPX_CTRL_VP8_SET_POSTPROC -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_REF_FRAME, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_REF_FRAME -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_MB_MODES, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_MB_MODES -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_COLOR_B_MODES, int) -#define VPX_CTRL_VP8_SET_DBG_COLOR_B_MODES -VPX_CTRL_USE_TYPE_DEPRECATED(VP8_SET_DBG_DISPLAY_MV, int) -#define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) #define VPX_CTRL_VP9_GET_REFERENCE @@ -150,4 +133,4 @@ VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE, vp9_ref_frame_t *) } // extern "C" #endif -#endif // VPX_VP8_H_ +#endif // VPX_VPX_VP8_H_ diff --git a/libs/libvpx/vpx/vp8cx.h b/libs/libvpx/vpx/vp8cx.h index c21b8b60db9..6e613b7273b 100644 --- a/libs/libvpx/vpx/vp8cx.h +++ b/libs/libvpx/vpx/vp8cx.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP8CX_H_ -#define VPX_VP8CX_H_ +#ifndef VPX_VPX_VP8CX_H_ +#define VPX_VPX_VP8CX_H_ /*!\defgroup vp8_encoder WebM VP8/VP9 Encoder * \ingroup vp8 @@ -125,7 +125,7 @@ extern vpx_codec_iface_t *vpx_codec_vp9_cx(void); enum vp8e_enc_control_id { /*!\brief Codec control function to pass an ROI map to encoder. * - * Supported in codecs: VP8, VP9 + * Supported in codecs: VP8 */ VP8E_SET_ROI_MAP = 8, @@ -148,13 +148,16 @@ enum vp8e_enc_control_id { * speed at the expense of quality. * * \note Valid range for VP8: -16..16 - * \note Valid range for VP9: -8..8 + * \note Valid range for VP9: -9..9 * * Supported in codecs: VP8, VP9 */ VP8E_SET_CPUUSED = 13, - /*!\brief Codec control function to enable automatic set and use alf frames. + /*!\brief Codec control function to enable automatic use of arf frames. + * + * \note Valid range for VP8: 0..1 + * \note Valid range for VP9: 0..6 * * Supported in codecs: VP8, VP9 */ @@ -169,7 +172,10 @@ enum vp8e_enc_control_id { */ VP8E_SET_NOISE_SENSITIVITY, - /*!\brief Codec control function to set sharpness. 
+ /*!\brief Codec control function to set higher sharpness at the expense + * of a lower PSNR. + * + * \note Valid range: 0..7 * * Supported in codecs: VP8, VP9 */ @@ -227,8 +233,8 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set constrained quality level. * - * \attention For this value to be used vpx_codec_enc_cfg_t::g_usage must be - * set to #VPX_CQ. + * \attention For this value to be used vpx_codec_enc_cfg_t::rc_end_usage must + * be set to #VPX_CQ * \note Valid range: 0..63 * * Supported in codecs: VP8, VP9 @@ -423,6 +429,12 @@ enum vp8e_enc_control_id { */ VP9E_SET_SVC, + /*!\brief Codec control function to pass an ROI map to encoder. + * + * Supported in codecs: VP9 + */ + VP9E_SET_ROI_MAP, + /*!\brief Codec control function to set parameters for SVC. * \note Parameters contain min_q, max_q, scaling factor for each of the * SVC layers. @@ -529,7 +541,7 @@ enum vp8e_enc_control_id { * struct #vpx_svc_ref_frame_config defined below. * * Supported in codecs: VP9 - */ + */ VP9E_SET_SVC_REF_FRAME_CONFIG, /*!\brief Codec control function to set intended rendering image size. @@ -550,11 +562,11 @@ enum vp8e_enc_control_id { VP9E_SET_TARGET_LEVEL, /*!\brief Codec control function to set row level multi-threading. - * - * 0 : off, 1 : on - * - * Supported in codecs: VP9 - */ + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ VP9E_SET_ROW_MT, /*!\brief Codec control function to get bitstream level. @@ -574,18 +586,18 @@ enum vp8e_enc_control_id { VP9E_SET_ALT_REF_AQ, /*!\brief Boost percentage for Golden Frame in CBR mode. - * - * This value controls the amount of boost given to Golden Frame in - * CBR mode. It is expressed as a percentage of the average - * per-frame bitrate, with the special (and default) value 0 meaning - * the feature is off, i.e., no golden frame boost in CBR mode and - * average bitrate target is used. - * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame - * than average frame, set this to 100. - * - * Supported in codecs: VP8 - */ + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. It is expressed as a percentage of the average + * per-frame bitrate, with the special (and default) value 0 meaning + * the feature is off, i.e., no golden frame boost in CBR mode and + * average bitrate target is used. + * + * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * than average frame, set this to 100. + * + * Supported in codecs: VP8 + */ VP8E_SET_GF_CBR_BOOST_PCT, /*!\brief Codec control function to enable the extreme motion vector unit test @@ -596,6 +608,74 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, + + /*!\brief Codec control function to constrain the inter-layer prediction + * (prediction of lower spatial resolution) in VP9 SVC. + * + * 0 : inter-layer prediction on, 1 : off, 2 : off only on non-key frames + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_INTER_LAYER_PRED, + + /*!\brief Codec control function to set mode and thresholds for frame + * dropping in SVC. Drop frame thresholds are set per-layer. Mode is set as: + * 0 : layer-dependent dropping, 1 : constrained dropping, current layer drop + * forces drop on all upper layers. Default mode is 0. + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_FRAME_DROP_LAYER, + + /*!\brief Codec control function to get the refresh and reference flags and + * the buffer indices, up to the last encoded spatial layer. 
+ * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_REF_FRAME_CONFIG, + + /*!\brief Codec control function to enable/disable use of golden reference as + * a second temporal reference for SVC. Only used when inter-layer prediction + * is disabled on INTER frames. + * + * 0: Off, 1: Enabled (default) + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_GF_TEMPORAL_REF, + + /*!\brief Codec control function to enable spatial layer sync frame, for any + * spatial layer. Enabling it for layer k means spatial layer k will disable + * all temporal prediction, but keep the inter-layer prediction. It will + * refresh any temporal reference buffer for that layer, and reset the + * temporal layer for the superframe to 0. Setting the layer sync for base + * spatial layer forces a key frame. Default is off (0) for all spatial + * layers. Spatial layer sync flag is reset to 0 after each encoded layer, + * so when control is invoked it is only used for the current superframe. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + + /*!\brief Codec control function to enable temporal dependency model. + * + * Vp9 allows the encoder to run temporal dependency model and use it to + * improve the compression performance. To enable, set this parameter to be + * 1. The default value is set to be 1. + */ + VP9E_SET_TPL, + + /*!\brief Codec control function to enable postencode frame drop. + * + * This will allow encoder to drop frame after it's encoded. + * + * 0: Off (default), 1: Enabled + * + * Supported in codecs: VP9 + */ + VP9E_SET_POSTENCODE_DROP, }; /*!\brief vpx 1-D scaling mode @@ -643,16 +723,20 @@ typedef enum vp9e_temporal_layering_mode { */ typedef struct vpx_roi_map { - /*! An id between 0 and 3 for each 16x16 region within a frame. */ + /*! If ROI is enabled. */ + uint8_t enabled; + /*! An id between 0-3 (0-7 for vp9) for each 16x16 (8x8 for VP9) + * region within a frame. */ unsigned char *roi_map; unsigned int rows; /**< Number of rows. */ unsigned int cols; /**< Number of columns. */ - // TODO(paulwilkins): broken for VP9 which has 8 segments - // q and loop filter deltas for each segment - // (see MAX_MB_SEGMENTS) - int delta_q[4]; /**< Quantizer deltas. */ - int delta_lf[4]; /**< Loop filter deltas. */ - /*! Static breakout threshold for each segment. */ + /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */ + int delta_q[8]; /**< Quantizer deltas. */ + int delta_lf[8]; /**< Loop filter deltas. */ + /*! skip and ref frame segment is only used in VP9. */ + int skip[8]; /**< Skip this block. */ + int ref_frame[8]; /**< Reference frame for this block. */ + /*! Static breakout threshold for each segment. Only used in VP8. */ unsigned int static_threshold[4]; } vpx_roi_map_t; @@ -716,11 +800,13 @@ typedef enum { VP8_TUNE_PSNR, VP8_TUNE_SSIM } vp8e_tuning; * */ typedef struct vpx_svc_layer_id { - int spatial_layer_id; /**< Spatial layer id number. */ + int spatial_layer_id; /**< First spatial layer to start encoding. */ + // TODO(jianj): Deprecated, to be removed. int temporal_layer_id; /**< Temporal layer id number. */ + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; /**< Temp layer id. */ } vpx_svc_layer_id_t; -/*!\brief vp9 svc frame flag parameters. +/*!\brief vp9 svc frame flag parameters. * * This defines the frame flags and buffer indices for each spatial layer for * svc encoding. 
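/* Illustrative aside, not part of the patch: driving the new SVC controls
 * from an application. Assumes `codec` is an initialized VP9 SVC encoder
 * context; the buffer-slot assignments are arbitrary choices for the
 * example, and update_buffer_slot is treated as a bitmask over the eight
 * reference slots. */
#include <string.h>
#include "vpx/vp8cx.h"

static void configure_svc_controls(vpx_codec_ctx_t *codec, int n_layers) {
  vpx_svc_ref_frame_config_t ref_cfg;
  vpx_svc_frame_drop_t drop;
  int sl;

  memset(&ref_cfg, 0, sizeof(ref_cfg));
  for (sl = 0; sl < n_layers; ++sl) {
    ref_cfg.lst_fb_idx[sl] = sl;     /* LAST for layer sl lives in slot sl */
    ref_cfg.gld_fb_idx[sl] = sl + 2; /* GOLDEN in the slots above */
    ref_cfg.reference_last[sl] = 1;  /* predict from LAST only */
    ref_cfg.update_buffer_slot[sl] = 1 << ref_cfg.lst_fb_idx[sl];
  }
  vpx_codec_control(codec, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_cfg);

  /* Per-layer dropping: threshold of 30 per layer, any spatial layer may
   * drop, but never more than 5 frames in a row. */
  memset(&drop, 0, sizeof(drop));
  drop.framedrop_mode = LAYER_DROP;
  for (sl = 0; sl < n_layers; ++sl) drop.framedrop_thresh[sl] = 30;
  drop.max_consec_drop = 5;
  vpx_codec_control(codec, VP9E_SET_SVC_FRAME_DROP_LAYER, &drop);
}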
@@ -729,12 +815,56 @@ typedef struct vpx_svc_layer_id { * */ typedef struct vpx_svc_ref_frame_config { - int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */ - int lst_fb_idx[VPX_TS_MAX_LAYERS]; /**< Last buffer index. */ - int gld_fb_idx[VPX_TS_MAX_LAYERS]; /**< Golden buffer index. */ - int alt_fb_idx[VPX_TS_MAX_LAYERS]; /**< Altref buffer index. */ + int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */ + int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */ + int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */ + int update_buffer_slot[VPX_SS_MAX_LAYERS]; /**< Update reference frames. */ + // TODO(jianj): Remove update_last/golden/alt_ref, these are deprecated. + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ + int64_t duration[VPX_SS_MAX_LAYERS]; /**< Duration per spatial layer. */ } vpx_svc_ref_frame_config_t; +/*!\brief VP9 svc frame dropping mode. + * + * This defines the frame drop mode for SVC. + * + */ +typedef enum { + CONSTRAINED_LAYER_DROP, + /**< Upper layers are constrained to drop if current layer drops. */ + LAYER_DROP, /**< Any spatial layer can drop. */ + FULL_SUPERFRAME_DROP, /**< Only full superframe can drop. */ +} SVC_LAYER_DROP_MODE; + +/*!\brief vp9 svc frame dropping parameters. + * + * This defines the frame drop thresholds for each spatial layer, and + * the frame dropping mode: 0 = layer based frame dropping (default), + * 1 = constrained dropping where current layer drop forces all upper + * spatial layers to drop. + */ +typedef struct vpx_svc_frame_drop { + int framedrop_thresh[VPX_SS_MAX_LAYERS]; /**< Frame drop thresholds */ + SVC_LAYER_DROP_MODE + framedrop_mode; /**< Layer-based or constrained dropping. */ + int max_consec_drop; /**< Maximum consecutive drops, for any layer. */ +} vpx_svc_frame_drop_t; + +/*!\brief vp9 svc spatial layer sync parameters. + * + * This defines the spatial layer sync flag, defined per spatial layer. 
+ * + */ +typedef struct vpx_svc_spatial_layer_sync { + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; /**< Sync layer flags */ + int base_layer_intra_only; /**< Flag for setting Intra-only frame on base */ +} vpx_svc_spatial_layer_sync_t; + /*!\cond */ /*!\brief VP8 encoder control function parameter type * @@ -749,6 +879,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int) #define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *) #define VPX_CTRL_VP8E_SET_ROI_MAP +VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *) +#define VPX_CTRL_VP9E_SET_ROI_MAP VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *) #define VPX_CTRL_VP8E_SET_ACTIVEMAP VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *) @@ -792,6 +924,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int) VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int) #define VPX_CTRL_VP9E_SET_TILE_ROWS +VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int) +#define VPX_CTRL_VP9E_SET_TPL + VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *) #define VPX_CTRL_VP8E_GET_LAST_QUANTIZER VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *) @@ -801,8 +936,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *) VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT -VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) -#define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int) +#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT @@ -867,10 +1002,29 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int) #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED + +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) +#define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER + +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG + +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int) +#define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF + +VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, + vpx_svc_spatial_layer_sync_t *) +#define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC + +VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int) +#define VPX_CTRL_VP9E_SET_POSTENCODE_DROP + /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_VP8CX_H_ +#endif // VPX_VPX_VP8CX_H_ diff --git a/libs/libvpx/vpx/vp8dx.h b/libs/libvpx/vpx/vp8dx.h index 398c6702209..af92f21ae30 100644 --- a/libs/libvpx/vpx/vp8dx.h +++ b/libs/libvpx/vpx/vp8dx.h @@ -17,8 +17,8 @@ * \brief Provides definitions for using VP8 or VP9 within the vpx Decoder * interface. */ -#ifndef VPX_VP8DX_H_ -#define VPX_VP8DX_H_ +#ifndef VPX_VPX_VP8DX_H_ +#define VPX_VPX_VP8DX_H_ #ifdef __cplusplus extern "C" { @@ -124,6 +124,24 @@ enum vp8_dec_control_id { */ VPXD_GET_LAST_QUANTIZER, + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9D_SET_ROW_MT, + + /*!\brief Codec control function to set loopfilter optimization. + * + * 0 : off, Loop filter is done after all tiles have been decoded + * 1 : on, Loop filter is done immediately after decode without + * waiting for all threads to sync. 
+ * + * Supported in codecs: VP9 + */ + VP9D_SET_LOOP_FILTER_OPT, + VP8_DECODER_CTRL_ID_MAX }; @@ -145,10 +163,6 @@ typedef struct vpx_decrypt_init { void *decrypt_state; } vpx_decrypt_init; -/*!\brief A deprecated alias for vpx_decrypt_init. - */ -typedef vpx_decrypt_init vp8_decrypt_init; - /*!\cond */ /*!\brief VP8 decoder control function parameter type * @@ -181,6 +195,10 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) #define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) +#define VPX_CTRL_VP9_DECODE_SET_ROW_MT +VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int) +#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT +VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ @@ -189,4 +207,4 @@ VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) } // extern "C" #endif -#endif // VPX_VP8DX_H_ +#endif // VPX_VPX_VP8DX_H_ diff --git a/libs/libvpx/vpx/vpx_codec.h b/libs/libvpx/vpx/vpx_codec.h index ad05f4c74e9..6371a6ca281 100644 --- a/libs/libvpx/vpx/vpx_codec.h +++ b/libs/libvpx/vpx/vpx_codec.h @@ -35,8 +35,8 @@ * Once initialized, the instance is manged using other functions from * the vpx_codec_* family. */ -#ifndef VPX_VPX_CODEC_H_ -#define VPX_VPX_CODEC_H_ +#ifndef VPX_VPX_VPX_CODEC_H_ +#define VPX_VPX_VPX_CODEC_H_ #ifdef __cplusplus extern "C" { @@ -241,11 +241,11 @@ typedef enum vpx_bit_depth { */ int vpx_codec_version(void); #define VPX_VERSION_MAJOR(v) \ - ((v >> 16) & 0xff) /**< extract major from packed version */ + (((v) >> 16) & 0xff) /**< extract major from packed version */ #define VPX_VERSION_MINOR(v) \ - ((v >> 8) & 0xff) /**< extract minor from packed version */ + (((v) >> 8) & 0xff) /**< extract minor from packed version */ #define VPX_VERSION_PATCH(v) \ - ((v >> 0) & 0xff) /**< extract patch from packed version */ + (((v) >> 0) & 0xff) /**< extract patch from packed version */ /*!\brief Return the version major number */ #define vpx_codec_version_major() ((vpx_codec_version() >> 16) & 0xff) @@ -465,4 +465,4 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); #ifdef __cplusplus } #endif -#endif // VPX_VPX_CODEC_H_ +#endif // VPX_VPX_VPX_CODEC_H_ diff --git a/libs/libvpx/vpx/vpx_codec.mk b/libs/libvpx/vpx/vpx_codec.mk index b77f45817b0..4ed77ad6d9d 100644 --- a/libs/libvpx/vpx/vpx_codec.mk +++ b/libs/libvpx/vpx/vpx_codec.mk @@ -15,10 +15,6 @@ API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h -ifeq ($(CONFIG_VP9_ENCODER),yes) - API_SRCS-$(CONFIG_SPATIAL_SVC) += src/svc_encodeframe.c - API_SRCS-$(CONFIG_SPATIAL_SVC) += svc_context.h -endif API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h diff --git a/libs/libvpx/vpx/vpx_decoder.h b/libs/libvpx/vpx/vpx_decoder.h index 2ff12112bcf..f113f7196b6 100644 --- a/libs/libvpx/vpx/vpx_decoder.h +++ b/libs/libvpx/vpx/vpx_decoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_VPX_DECODER_H_ -#define VPX_VPX_DECODER_H_ +#ifndef VPX_VPX_VPX_DECODER_H_ +#define VPX_VPX_VPX_DECODER_H_ /*!\defgroup decoder Decoder Algorithm Interface * \ingroup codec @@ -362,4 +362,4 @@ vpx_codec_err_t vpx_codec_set_frame_buffer_functions( #ifdef __cplusplus } #endif -#endif // VPX_VPX_DECODER_H_ +#endif // VPX_VPX_VPX_DECODER_H_ diff --git a/libs/libvpx/vpx/vpx_encoder.h b/libs/libvpx/vpx/vpx_encoder.h index 464bc408c88..c18de703fb8 100644 --- a/libs/libvpx/vpx/vpx_encoder.h +++ b/libs/libvpx/vpx/vpx_encoder.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_ENCODER_H_ -#define VPX_VPX_ENCODER_H_ +#ifndef VPX_VPX_VPX_ENCODER_H_ +#define VPX_VPX_VPX_ENCODER_H_ /*!\defgroup encoder Encoder Algorithm Interface * \ingroup codec @@ -39,15 +39,9 @@ extern "C" { /*! Temporal Scalability: Maximum number of coding layers */ #define VPX_TS_MAX_LAYERS 5 -/*!\deprecated Use #VPX_TS_MAX_PERIODICITY instead. */ -#define MAX_PERIODICITY VPX_TS_MAX_PERIODICITY - /*! Temporal+Spatial Scalability: Maximum number of coding layers */ #define VPX_MAX_LAYERS 12 // 3 temporal + 4 spatial layers are allowed. -/*!\deprecated Use #VPX_MAX_LAYERS instead. */ -#define MAX_LAYERS VPX_MAX_LAYERS // 3 temporal + 4 spatial layers allowed. - /*! Spatial Scalability: Maximum number of coding layers */ #define VPX_SS_MAX_LAYERS 5 @@ -63,7 +57,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (14 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -150,15 +144,10 @@ typedef uint32_t vpx_codec_er_flags_t; * extend this list to provide additional functionality. */ enum vpx_codec_cx_pkt_kind { - VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ - VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ - VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ - VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed. -#if defined(VPX_TEST_SPATIAL_SVC) - VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ - VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ -#endif + VPX_CODEC_CX_FRAME_PKT, /**< Compressed video frame */ + VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ + VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ + VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions */ }; @@ -182,6 +171,13 @@ typedef struct vpx_codec_cx_pkt { * Only applicable when "output partition" mode is enabled. First * partition has id 0.*/ int partition_id; + /*!\brief Width and height of frames in this packet. VP8 will only use the + * first one.*/ + unsigned int width[VPX_SS_MAX_LAYERS]; /**< frame width */ + unsigned int height[VPX_SS_MAX_LAYERS]; /**< frame height */ + /*!\brief Flag to indicate if spatial layer frame in this packet is + * encoded or dropped. 
VP8 will always be set to 1.*/ + uint8_t spatial_layer_encoded[VPX_SS_MAX_LAYERS]; } frame; /**< data for compressed frame packet */ vpx_fixed_buf_t twopass_stats; /**< data for two-pass packet */ vpx_fixed_buf_t firstpass_mb_stats; /**< first pass mb packet */ @@ -191,11 +187,6 @@ typedef struct vpx_codec_cx_pkt { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed. -#if defined(VPX_TEST_SPATIAL_SVC) - size_t layer_sizes[VPX_SS_MAX_LAYERS]; - struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; -#endif /* This packet size is fixed to allow codecs to extend this * interface without having to manage storage for raw packets, @@ -211,8 +202,6 @@ typedef struct vpx_codec_cx_pkt { * This callback function, when registered, returns with packets when each * spatial layer is encoded. */ -// putting the definitions here for now. (agrange: find if there -// is a better place for this) typedef void (*vpx_codec_enc_output_cx_pkt_cb_fn_t)(vpx_codec_cx_pkt_t *pkt, void *user_data); @@ -281,12 +270,9 @@ typedef struct vpx_codec_enc_cfg { * generic settings (g) */ - /*!\brief Algorithm specific "usage" value + /*!\brief Deprecated: Algorithm specific "usage" value * - * Algorithms may define multiple values for usage, which may convey the - * intent of how the application intends to use the stream. If this value - * is non-zero, consult the documentation for the codec to determine its - * meaning. + * This value must be zero. */ unsigned int g_usage; @@ -397,9 +383,6 @@ typedef struct vpx_codec_enc_cfg { * trade-off is often acceptable, but for many applications is not. It can * be disabled in these cases. * - * Note that not all codecs support this feature. All vpx VPx codecs do. - * For other codecs, consult the documentation for that algorithm. - * * This threshold is described as a percentage of the target data buffer. * When the data buffer falls below this percentage of fullness, a * dropped frame is indicated. Set the threshold to zero (0) to disable @@ -485,8 +468,7 @@ typedef struct vpx_codec_enc_cfg { * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. + * values to use. */ unsigned int rc_min_quantizer; @@ -495,8 +477,7 @@ typedef struct vpx_codec_enc_cfg { * The quantizer is the most direct control over the quality of the * encoded image. The range of valid values for the quantizer is codec * specific. Consult the documentation for the codec to determine the - * values to use. To determine the range programmatically, call - * vpx_codec_enc_config_default() with a usage value of 0. + * values to use. */ unsigned int rc_max_quantizer; @@ -512,7 +493,7 @@ typedef struct vpx_codec_enc_cfg { * be subtracted from the target bitrate in order to compensate * for prior overshoot. * VP9: Expressed as a percentage of the target bitrate, a threshold - * undershoot level (current rate vs target) beyond which more agressive + * undershoot level (current rate vs target) beyond which more aggressive * corrective measures are taken. * * * Valid values in the range VP8:0-1000 VP9: 0-100. 
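/* Illustrative aside, not part of the patch: consuming the per-spatial-layer
 * width/height and spatial_layer_encoded fields added to the frame packet
 * above. Assumes `codec` is an encoding VP9 SVC context and n_layers is the
 * configured number of spatial layers. */
#include <stdio.h>
#include "vpx/vpx_encoder.h"

static void report_layer_sizes(vpx_codec_ctx_t *codec, int n_layers) {
  vpx_codec_iter_t iter = NULL;
  const vpx_codec_cx_pkt_t *pkt;
  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
    int sl;
    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) continue;
    for (sl = 0; sl < n_layers; ++sl) {
      if (pkt->data.frame.spatial_layer_encoded[sl])
        printf("layer %d: %ux%u, pts %lld\n", sl, pkt->data.frame.width[sl],
               pkt->data.frame.height[sl], (long long)pkt->data.frame.pts);
      else
        printf("layer %d: dropped\n", sl);
    }
  }
}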
@@ -527,7 +508,7 @@ typedef struct vpx_codec_enc_cfg { * be added to the target bitrate in order to compensate for * prior undershoot. * VP9: Expressed as a percentage of the target bitrate, a threshold - * overshoot level (current rate vs target) beyond which more agressive + * overshoot level (current rate vs target) beyond which more aggressive * corrective measures are taken. * * Valid values in the range VP8:0-1000 VP9: 0-100. @@ -596,10 +577,10 @@ typedef struct vpx_codec_enc_cfg { unsigned int rc_2pass_vbr_maxsection_pct; /*!\brief Two-pass corpus vbr mode complexity control - * Used only in VP9: A value representing the corpus midpoint complexity - * for corpus vbr mode. This value defaults to 0 which disables corpus vbr - * mode in favour of normal vbr mode. - */ + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. + */ unsigned int rc_2pass_vbr_corpus_complexity; /* @@ -682,7 +663,7 @@ typedef struct vpx_codec_enc_cfg { * membership of frames to temporal layers. For example, if the * ts_periodicity = 8, then the frames are assigned to coding layers with a * repeated sequence of length 8. - */ + */ unsigned int ts_periodicity; /*!\brief Template defining the membership of frames to temporal layers. @@ -691,7 +672,7 @@ typedef struct vpx_codec_enc_cfg { * For a 2-layer encoding that assigns even numbered frames to one temporal * layer (0) and odd numbered frames to a second temporal layer (1) with * ts_periodicity=8, then ts_layer_id = (0,1,0,1,0,1,0,1). - */ + */ unsigned int ts_layer_id[VPX_TS_MAX_PERIODICITY]; /*!\brief Target bitrate for each spatial/temporal layer. @@ -802,7 +783,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( * * \param[in] iface Pointer to the algorithm interface to use. * \param[out] cfg Configuration buffer to populate. - * \param[in] reserved Must set to 0 for VP8 and VP9. + * \param[in] usage Must be set to 0. * * \retval #VPX_CODEC_OK * The configuration was populated. @@ -813,7 +794,7 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( */ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, vpx_codec_enc_cfg_t *cfg, - unsigned int reserved); + unsigned int usage); /*!\brief Set or change configuration * @@ -862,7 +843,7 @@ vpx_fixed_buf_t *vpx_codec_get_global_headers(vpx_codec_ctx_t *ctx); * implicit that limiting the available time to encode will degrade the * output quality. The encoder can be given an unlimited time to produce the * best possible frame by specifying a deadline of '0'. This deadline - * supercedes the VPx notion of "best quality, good quality, realtime". + * supersedes the VPx notion of "best quality, good quality, realtime". * Applications that wish to map these former settings to the new deadline * based system can use the symbols #VPX_DL_REALTIME, #VPX_DL_GOOD_QUALITY, * and #VPX_DL_BEST_QUALITY. @@ -984,4 +965,4 @@ const vpx_image_t *vpx_codec_get_preview_frame(vpx_codec_ctx_t *ctx); #ifdef __cplusplus } #endif -#endif // VPX_VPX_ENCODER_H_ +#endif // VPX_VPX_VPX_ENCODER_H_ diff --git a/libs/libvpx/vpx/vpx_frame_buffer.h b/libs/libvpx/vpx/vpx_frame_buffer.h index ad70cdd572b..fc8320017be 100644 --- a/libs/libvpx/vpx/vpx_frame_buffer.h +++ b/libs/libvpx/vpx/vpx_frame_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_VPX_FRAME_BUFFER_H_ -#define VPX_VPX_FRAME_BUFFER_H_ +#ifndef VPX_VPX_VPX_FRAME_BUFFER_H_ +#define VPX_VPX_VPX_FRAME_BUFFER_H_ /*!\file * \brief Describes the decoder external frame buffer interface. @@ -52,12 +52,12 @@ typedef struct vpx_codec_frame_buffer { * data. The callback is triggered when the decoder needs a frame buffer to * decode a compressed image into. This function may be called more than once * for every call to vpx_codec_decode. The application may set fb->priv to - * some data which will be passed back in the ximage and the release function - * call. |fb| is guaranteed to not be NULL. On success the callback must - * return 0. Any failure the callback must return a value less than 0. + * some data which will be passed back in the vpx_image_t and the release + * function call. |fb| is guaranteed to not be NULL. On success the callback + * must return 0. Any failure the callback must return a value less than 0. * * \param[in] priv Callback's private data - * \param[in] new_size Size in bytes needed by the buffer + * \param[in] min_size Size in bytes needed by the buffer * \param[in,out] fb Pointer to vpx_codec_frame_buffer_t */ typedef int (*vpx_get_frame_buffer_cb_fn_t)(void *priv, size_t min_size, @@ -80,4 +80,4 @@ typedef int (*vpx_release_frame_buffer_cb_fn_t)(void *priv, } // extern "C" #endif -#endif // VPX_VPX_FRAME_BUFFER_H_ +#endif // VPX_VPX_VPX_FRAME_BUFFER_H_ diff --git a/libs/libvpx/vpx/vpx_image.h b/libs/libvpx/vpx/vpx_image.h index d6d3166d2ff..83cb19126aa 100644 --- a/libs/libvpx/vpx/vpx_image.h +++ b/libs/libvpx/vpx/vpx_image.h @@ -12,8 +12,8 @@ * \brief Describes the vpx image descriptor and associated operations * */ -#ifndef VPX_VPX_IMAGE_H_ -#define VPX_VPX_IMAGE_H_ +#ifndef VPX_VPX_VPX_IMAGE_H_ +#define VPX_VPX_VPX_IMAGE_H_ #ifdef __cplusplus extern "C" { @@ -167,21 +167,21 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, * storage for descriptor has been allocated elsewhere, and a descriptor is * desired to "wrap" that storage. * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. - * \param[in] img_data Storage to use for the image + * \param[in] img Pointer to storage for descriptor. If this + * parameter is NULL, the storage for the descriptor + * will be allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] stride_align Alignment, in bytes, of each row in the image. + * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. 
*/ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, - unsigned int d_h, unsigned int align, + unsigned int d_h, unsigned int stride_align, unsigned char *img_data); /*!\brief Set the rectangle identifying the displayed portion of the image @@ -221,4 +221,4 @@ void vpx_img_free(vpx_image_t *img); } // extern "C" #endif -#endif // VPX_VPX_IMAGE_H_ +#endif // VPX_VPX_VPX_IMAGE_H_ diff --git a/libs/libvpx/vpx/vpx_integer.h b/libs/libvpx/vpx/vpx_integer.h index 09bad9222d4..4129d156f8a 100644 --- a/libs/libvpx/vpx/vpx_integer.h +++ b/libs/libvpx/vpx/vpx_integer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VPX_INTEGER_H_ -#define VPX_VPX_INTEGER_H_ +#ifndef VPX_VPX_VPX_INTEGER_H_ +#define VPX_VPX_VPX_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ #include <stddef.h> @@ -18,27 +18,12 @@ #define VPX_FORCE_INLINE __forceinline #define VPX_INLINE __inline #else -#define VPX_FORCE_INLINE __inline__ __attribute__(always_inline) +#define VPX_FORCE_INLINE __inline__ __attribute__((always_inline)) // TODO(jbb): Allow a way to force inline off for older compilers. #define VPX_INLINE inline #endif -#if defined(VPX_EMULATE_INTTYPES) -typedef signed char int8_t; -typedef signed short int16_t; -typedef signed int int32_t; - -typedef unsigned char uint8_t; -typedef unsigned short uint16_t; -typedef unsigned int uint32_t; - -#ifndef _UINTPTR_T_DEFINED -typedef size_t uintptr_t; -#endif - -#else - -/* Most platforms have the C99 standard integer types. */ +/* Assume platforms have the C99 standard integer types. */ #if defined(__cplusplus) #if !defined(__STDC_FORMAT_MACROS) @@ -49,15 +34,7 @@ typedef size_t uintptr_t; #endif #endif // __cplusplus -#include <stdint.h> - -#endif - -/* VS2010 defines stdint.h, but not inttypes.h */ -#if defined(_MSC_VER) && _MSC_VER < 1800 -#define PRId64 "I64d" -#else #include <inttypes.h> -#endif +#include <stdint.h> -#endif // VPX_VPX_INTEGER_H_ +#endif // VPX_VPX_VPX_INTEGER_H_ diff --git a/libs/libvpx/vpx_dsp/add_noise.c b/libs/libvpx/vpx_dsp/add_noise.c index cda6ae8814a..6839e979284 100644 --- a/libs/libvpx/vpx_dsp/add_noise.c +++ b/libs/libvpx/vpx_dsp/add_noise.c @@ -52,6 +52,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) { const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i)); if (a_i) { for (j = 0; j < a_i; ++j) { + if (next + j >= 256) goto set_noise; char_dist[next + j] = (int8_t)i; } next = next + j; @@ -63,6 +64,7 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size) { char_dist[next] = 0; } +set_noise: for (i = 0; i < size; ++i) { noise[i] = char_dist[rand() & 0xff]; // NOLINT } diff --git a/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c index 1370ec2d2ea..5afdece0aba 100644 --- a/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c +++ b/libs/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -17,8 +17,8 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { - int x, y; - for (y = 0; y < height; ++y) { + int x, y = height; + do { for (x = 0; x < width; x += 16) { const uint8x16_t p = vld1q_u8(pred + x); const uint8x16_t r = vld1q_u8(ref + x); @@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, comp += width; pred += width; ref += ref_stride; - } + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + 
ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); } else { - int i; - for (i = 0; i < width * height; i += 16) { + int i = width * height; + assert(width == 4); + do { const uint8x16_t p = vld1q_u8(pred); uint8x16_t r; - if (width == 4) { - r = load_unaligned_u8q(ref, ref_stride); - ref += 4 * ref_stride; - } else { - const uint8x8_t r_0 = vld1_u8(ref); - const uint8x8_t r_1 = vld1_u8(ref + ref_stride); - assert(width == 8); - r = vcombine_u8(r_0, r_1); - ref += 2 * ref_stride; - } + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; r = vrhaddq_u8(r, p); vst1q_u8(comp, r); pred += 16; comp += 16; - } + i -= 16; + } while (i); } } diff --git a/libs/libvpx/vpx_dsp/arm/deblock_neon.c b/libs/libvpx/vpx_dsp/arm/deblock_neon.c index 1fb41d29920..7efce327359 100644 --- a/libs/libvpx/vpx_dsp/arm/deblock_neon.c +++ b/libs/libvpx/vpx_dsp/arm/deblock_neon.c @@ -91,11 +91,6 @@ void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, int row; int col; - // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for - // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 - // (for U/V). - assert((size == 8 || size == 16) && cols % 8 == 0); - // While columns of length 16 can be processed, load them. for (col = 0; col < cols - 8; col += 16) { uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; diff --git a/libs/libvpx/vpx_dsp/arm/fdct_neon.c b/libs/libvpx/vpx_dsp/arm/fdct_neon.c index 04646ed2e0e..3708cbb11fe 100644 --- a/libs/libvpx/vpx_dsp/arm/fdct_neon.c +++ b/libs/libvpx/vpx_dsp/arm/fdct_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" diff --git a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index 8049277b132..374a262b93e 100644 --- a/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/libs/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/arm/idct_neon.h" diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 5358839b538..654ab42ca40 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -11,61 +11,37 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t, - int32x4x2_t *const d0, - int32x4x2_t *const d1) { - int32x2x2_t t32[4]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS); - t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS); - t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS); - t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS); - d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]); - d1->val[1] = 
vcombine_s32(t32[3].val[0], t32[3].val[1]); +static INLINE int32x4_t dct_const_round_shift_high_4(const int64x2x2_t in) { + int32x2x2_t t32; + + t32.val[0] = vrshrn_n_s64(in.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(in.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); } -static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t, - int32x4_t *const d0, - int32x4_t *const d1) { - int32x2x2_t t32[2]; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]); - *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]); +static INLINE void dct_const_round_shift_high_4_dual( + const int64x2x2_t *const in, int32x4_t *const d0, int32x4_t *const d1) { + *d0 = dct_const_round_shift_high_4(in[0]); + *d1 = dct_const_round_shift_high_4(in[1]); } static INLINE int32x4x2_t -highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) { - int32x2x2_t t32[2]; - int32x4x2_t d; - - t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); - t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); - t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); - t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); - d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); - d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); - return d; +dct_const_round_shift_high_4x2_int64x2x2(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = dct_const_round_shift_high_4(in[0]); + out.val[1] = dct_const_round_shift_high_4(in[1]); + return out; } -static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) { - int32x2x2_t t32; - - t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS); - t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS); - return vcombine_s32(t32.val[0], t32.val[1]); +static INLINE void dct_const_round_shift_high_4x2x2(const int64x2x2_t *const in, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + *d0 = dct_const_round_shift_high_4x2_int64x2x2(in + 0); + *d1 = dct_const_round_shift_high_4x2_int64x2x2(in + 2); } static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, @@ -107,7 +83,7 @@ static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, vget_low_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, @@ -149,7 +125,7 @@ static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, vget_low_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, @@ -191,7 +167,7 @@ static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, vget_low_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_low_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, @@ -233,7 +209,7 @@ static INLINE void highbd_idct_cospi_10_22(const 
int32x4x2_t s0, vget_high_s32(cospi_2_30_10_22), 0); t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_2_30_10_22), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, @@ -275,7 +251,7 @@ static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, vget_high_s32(cospi_4_12_20N_28), 0); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_4_12_20N_28), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, @@ -317,7 +293,7 @@ static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, vget_high_s32(cospi_6_26N_14_18N), 1); t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_6_26N_14_18N), 1); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_q_kernel( @@ -386,7 +362,7 @@ static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, int64x2x2_t t[4]; highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, @@ -397,7 +373,7 @@ static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, int64x2x2_t t[2]; highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, @@ -412,7 +388,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]); t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, @@ -425,7 +401,7 @@ static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, @@ -459,7 +435,7 @@ static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); + dct_const_round_shift_high_4x2x2(t, d0, d1); } static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, @@ -481,7 +457,7 @@ static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, vget_high_s32(cospi_0_8_16_24), 0); t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 0); - highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); + dct_const_round_shift_high_4_dual(t, d0, d1); } static INLINE void highbd_idct16x16_add_stage7_dual( @@ -540,62 +516,9 @@ static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, out[15] = vsubq_s32(step2[0], step2[15]); } -static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t 
*const out, - int32_t *output) { - // Save the result into output - vst1q_s32(output + 0, out[0].val[0]); - vst1q_s32(output + 4, out[0].val[1]); - output += 16; - vst1q_s32(output + 0, out[1].val[0]); - vst1q_s32(output + 4, out[1].val[1]); - output += 16; - vst1q_s32(output + 0, out[2].val[0]); - vst1q_s32(output + 4, out[2].val[1]); - output += 16; - vst1q_s32(output + 0, out[3].val[0]); - vst1q_s32(output + 4, out[3].val[1]); - output += 16; - vst1q_s32(output + 0, out[4].val[0]); - vst1q_s32(output + 4, out[4].val[1]); - output += 16; - vst1q_s32(output + 0, out[5].val[0]); - vst1q_s32(output + 4, out[5].val[1]); - output += 16; - vst1q_s32(output + 0, out[6].val[0]); - vst1q_s32(output + 4, out[6].val[1]); - output += 16; - vst1q_s32(output + 0, out[7].val[0]); - vst1q_s32(output + 4, out[7].val[1]); - output += 16; - vst1q_s32(output + 0, out[8].val[0]); - vst1q_s32(output + 4, out[8].val[1]); - output += 16; - vst1q_s32(output + 0, out[9].val[0]); - vst1q_s32(output + 4, out[9].val[1]); - output += 16; - vst1q_s32(output + 0, out[10].val[0]); - vst1q_s32(output + 4, out[10].val[1]); - output += 16; - vst1q_s32(output + 0, out[11].val[0]); - vst1q_s32(output + 4, out[11].val[1]); - output += 16; - vst1q_s32(output + 0, out[12].val[0]); - vst1q_s32(output + 4, out[12].val[1]); - output += 16; - vst1q_s32(output + 0, out[13].val[0]); - vst1q_s32(output + 4, out[13].val[1]); - output += 16; - vst1q_s32(output + 0, out[14].val[0]); - vst1q_s32(output + 4, out[14].val[1]); - output += 16; - vst1q_s32(output + 0, out[15].val[0]); - vst1q_s32(output + 4, out[15].val[1]); -} - -static void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, - int32_t *output, uint16_t *dest, - const int stride, - const int bd) { +void vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd) { const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); @@ -815,7 +738,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, @@ -824,7 +747,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); - return highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, @@ -835,7 +758,7 @@ static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); - return highbd_idct16x16_add_wrap_low_8x1(t); + return dct_const_round_shift_high_4x2_int64x2x2(t); } static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, @@ -844,7 +767,7 @@ static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); - return 
highbd_idct16x16_add_wrap_low_4x1(t); + return dct_const_round_shift_high_4(t); } static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, @@ -1003,8 +926,8 @@ static void vpx_highbd_idct16x16_38_add_half1d(const int32_t *input, } } -void vpx_highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, - int32_t *output) { +static void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); @@ -1142,10 +1065,11 @@ void vpx_highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, vst1q_s32(output, out[15]); } -void vpx_highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, - int32_t *const output, - uint16_t *const dest, - const int stride, const int bd) { +static void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, + const int bd) { const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); @@ -1366,16 +1290,16 @@ void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, // pass 1 // Parallel idct on the upper 8 rows - vpx_highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); // pass 2 // Parallel idct to get the left 8 columns - vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, - stride, bd); + highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, + bd); // Parallel idct to get the right 8 columns - vpx_highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, - dest + 8, stride, bd); + highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); } } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c index 96a55c472f6..5b36f733678 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c @@ -124,83 +124,77 @@ static INLINE void do_butterfly(const int32x4x2_t qIn0, const int32x4x2_t qIn1, vrshrn_n_s64(q[3].val[1], DCT_CONST_BITS)); } -static INLINE void load_s32x4q_dual( - const int32_t *in, int32x4x2_t *const s0, int32x4x2_t *const s1, - int32x4x2_t *const s2, int32x4x2_t *const s3, int32x4x2_t *const s4, - int32x4x2_t *const s5, int32x4x2_t *const s6, int32x4x2_t *const s7) { - s0->val[0] = vld1q_s32(in); - s0->val[1] = vld1q_s32(in + 4); +static INLINE void load_s32x4q_dual(const int32_t *in, int32x4x2_t *const s) { + s[0].val[0] = vld1q_s32(in); + s[0].val[1] = vld1q_s32(in + 4); in += 32; - s1->val[0] = vld1q_s32(in); - s1->val[1] = vld1q_s32(in + 4); + s[1].val[0] = vld1q_s32(in); + s[1].val[1] = vld1q_s32(in + 4); in += 32; - s2->val[0] = vld1q_s32(in); - s2->val[1] = vld1q_s32(in + 4); + s[2].val[0] = vld1q_s32(in); + s[2].val[1] = vld1q_s32(in + 4); in += 32; - s3->val[0] = vld1q_s32(in); - s3->val[1] = vld1q_s32(in + 4); + s[3].val[0] = vld1q_s32(in); + s[3].val[1] = vld1q_s32(in + 4); in += 32; - s4->val[0] = vld1q_s32(in); - s4->val[1] = vld1q_s32(in + 4); + s[4].val[0] = vld1q_s32(in); + s[4].val[1] = vld1q_s32(in + 4); in += 32; - s5->val[0] = vld1q_s32(in); - s5->val[1] = vld1q_s32(in + 4); + s[5].val[0] = vld1q_s32(in); + 
s[5].val[1] = vld1q_s32(in + 4); in += 32; - s6->val[0] = vld1q_s32(in); - s6->val[1] = vld1q_s32(in + 4); + s[6].val[0] = vld1q_s32(in); + s[6].val[1] = vld1q_s32(in + 4); in += 32; - s7->val[0] = vld1q_s32(in); - s7->val[1] = vld1q_s32(in + 4); + s[7].val[0] = vld1q_s32(in); + s[7].val[1] = vld1q_s32(in + 4); } -static INLINE void transpose_and_store_s32_8x8(int32x4x2_t a0, int32x4x2_t a1, - int32x4x2_t a2, int32x4x2_t a3, - int32x4x2_t a4, int32x4x2_t a5, - int32x4x2_t a6, int32x4x2_t a7, +static INLINE void transpose_and_store_s32_8x8(int32x4x2_t *const a, int32_t **out) { - transpose_s32_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + transpose_s32_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - vst1q_s32(*out, a0.val[0]); + vst1q_s32(*out, a[0].val[0]); *out += 4; - vst1q_s32(*out, a0.val[1]); + vst1q_s32(*out, a[0].val[1]); *out += 4; - vst1q_s32(*out, a1.val[0]); + vst1q_s32(*out, a[1].val[0]); *out += 4; - vst1q_s32(*out, a1.val[1]); + vst1q_s32(*out, a[1].val[1]); *out += 4; - vst1q_s32(*out, a2.val[0]); + vst1q_s32(*out, a[2].val[0]); *out += 4; - vst1q_s32(*out, a2.val[1]); + vst1q_s32(*out, a[2].val[1]); *out += 4; - vst1q_s32(*out, a3.val[0]); + vst1q_s32(*out, a[3].val[0]); *out += 4; - vst1q_s32(*out, a3.val[1]); + vst1q_s32(*out, a[3].val[1]); *out += 4; - vst1q_s32(*out, a4.val[0]); + vst1q_s32(*out, a[4].val[0]); *out += 4; - vst1q_s32(*out, a4.val[1]); + vst1q_s32(*out, a[4].val[1]); *out += 4; - vst1q_s32(*out, a5.val[0]); + vst1q_s32(*out, a[5].val[0]); *out += 4; - vst1q_s32(*out, a5.val[1]); + vst1q_s32(*out, a[5].val[1]); *out += 4; - vst1q_s32(*out, a6.val[0]); + vst1q_s32(*out, a[6].val[0]); *out += 4; - vst1q_s32(*out, a6.val[1]); + vst1q_s32(*out, a[6].val[1]); *out += 4; - vst1q_s32(*out, a7.val[0]); + vst1q_s32(*out, a[7].val[0]); *out += 4; - vst1q_s32(*out, a7.val[1]); + vst1q_s32(*out, a[7].val[1]); *out += 4; } static INLINE void idct32_transpose_pair(const int32_t *input, int32_t *t_buf) { int i; - int32x4x2_t s0, s1, s2, s3, s4, s5, s6, s7; + int32x4x2_t s[8]; for (i = 0; i < 4; i++, input += 8) { - load_s32x4q_dual(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); - transpose_and_store_s32_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + load_s32x4q_dual(input, s); + transpose_and_store_s32_8x8(s, &t_buf); } } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c index 3970a5a8613..6750c1a426d 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c index 5d9063b15dc..f05932cec3a 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 1418a75a15b..7be1dad1d31 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ 
b/libs/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -11,27 +11,10 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { - const uint16x4_t a0 = vld1_u16(*dest); - const uint16x4_t a1 = vld1_u16(*dest + stride); - const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); - // Note: In some profile tests, res is quite close to +/-32767. - // We use saturating addition. - const int16x8_t b = vqaddq_s16(res, a); - const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1_u16(*dest, vget_low_u16(d)); - *dest += stride; - vst1_u16(*dest, vget_high_u16(d)); - *dest += stride; -} - // res is in reverse row order static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride, @@ -65,109 +48,42 @@ void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); } -static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); - b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); - b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); - b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); - b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); - b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); - b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); - b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); - b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); - b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - -static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, - int32x4_t *const a0, - int32x4_t *const a1, - int32x4_t *const a2, - int32x4_t *const a3) { - int32x4_t b0, b1, b2, b3; - int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; - - transpose_s32_4x4(a0, a1, a2, a3); - b0 = vaddq_s32(*a0, *a2); - b1 = vsubq_s32(*a0, *a2); - c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); - c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); - c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); - c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); - c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); - c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); - c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); - c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); - c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); - c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); - c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); - c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); - c4 = vsubq_s64(c4, c8); - c5 = vsubq_s64(c5, c9); - c6 = vaddq_s64(c6, c10); - c7 = vaddq_s64(c7, c11); - b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), - vrshrn_n_s64(c1, DCT_CONST_BITS)); - b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), - vrshrn_n_s64(c3, DCT_CONST_BITS)); - b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), - vrshrn_n_s64(c5, DCT_CONST_BITS)); - b3 = 
vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), - vrshrn_n_s64(c7, DCT_CONST_BITS)); - *a0 = vaddq_s32(b0, b3); - *a1 = vaddq_s32(b1, b2); - *a2 = vsubq_s32(b1, b2); - *a3 = vsubq_s32(b0, b3); -} - void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - int32x4_t c0 = vld1q_s32(input); - int32x4_t c1 = vld1q_s32(input + 4); - int32x4_t c2 = vld1q_s32(input + 8); - int32x4_t c3 = vld1q_s32(input + 12); - int16x8_t a0, a1; + int16x8_t a[2]; + int32x4_t c[4]; - if (bd == 8) { - const int16x4_t cospis = vld1_s16(kCospi); + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + if (bd == 8) { // Rows - a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); - a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); } else { - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); - idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); } - a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); - a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[3], 4), vqrshrn_n_s32(c[2], 4)); } - highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); - highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a[1], max); } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index dd90134a6e0..bed3227ca7d 100644 --- a/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" @@ -127,7 +128,7 @@ static INLINE void idct8x8_12_half1d_bd12( int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t input1l, input1h, input3l, input3h; int32x2_t step1l[2], step1h[2]; int32x4_t step1[8], step2[8]; int64x2_t t64[8]; @@ -136,23 +137,23 @@ static INLINE void idct8x8_12_half1d_bd12( transpose_s32_4x4(io0, io1, io2, io3); // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = 
vget_high_s32(*io3); step1l[0] = vget_low_s32(*io0); step1h[0] = vget_high_s32(*io0); step1l[1] = vget_low_s32(*io2); step1h[1] = vget_high_s32(*io2); - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); @@ -222,82 +223,15 @@ static INLINE void idct8x8_12_half1d_bd12( *io7 = vsubq_s32(step1[0], step2[7]); } -static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint16_t *dest, - const int stride, const int bd) { - const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const uint16_t *dst = dest; - uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; - - d0 = vld1q_u16(dst); - dst += stride; - d1 = vld1q_u16(dst); - dst += stride; - d2 = vld1q_u16(dst); - dst += stride; - d3 = vld1q_u16(dst); - dst += stride; - d4 = vld1q_u16(dst); - dst += stride; - d5 = vld1q_u16(dst); - dst += stride; - d6 = vld1q_u16(dst); - dst += stride; - d7 = vld1q_u16(dst); - - d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); - d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); - d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); - d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); - d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); - d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); - d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); - d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); - - d0_s16 = vminq_s16(d0_s16, max); - d1_s16 = vminq_s16(d1_s16, max); - d2_s16 = vminq_s16(d2_s16, max); - d3_s16 = vminq_s16(d3_s16, max); - d4_s16 = vminq_s16(d4_s16, max); - d5_s16 = vminq_s16(d5_s16, max); - d6_s16 = vminq_s16(d6_s16, max); - d7_s16 = vminq_s16(d7_s16, max); - d0_u16 = vqshluq_n_s16(d0_s16, 0); - d1_u16 = vqshluq_n_s16(d1_s16, 0); - d2_u16 = vqshluq_n_s16(d2_s16, 0); - d3_u16 = vqshluq_n_s16(d3_s16, 0); - d4_u16 = vqshluq_n_s16(d4_s16, 0); - d5_u16 = vqshluq_n_s16(d5_s16, 0); - d6_u16 = vqshluq_n_s16(d6_s16, 0); - d7_u16 = vqshluq_n_s16(d7_s16, 0); - - vst1q_u16(dest, d0_u16); - dest += stride; - vst1q_u16(dest, d1_u16); - dest += stride; - vst1q_u16(dest, d2_u16); - dest += stride; - vst1q_u16(dest, d3_u16); - dest += stride; - vst1q_u16(dest, d4_u16); - dest += stride; - vst1q_u16(dest, d5_u16); - dest += stride; - vst1q_u16(dest, d6_u16); - dest += stride; - vst1q_u16(dest, d7_u16); -} - 
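Editor's note: the highbd_add8x8() helper removed above (it is re-added in the new highbd_idct_neon.h header later in this patch) saturating-adds the idct residual to the destination pixels and clamps each result to the bit-depth maximum. A scalar sketch of the same pattern follows, with a hypothetical name, for readers unfamiliar with the NEON intrinsics; it is an illustration only, not part of the patch:

#include <stdint.h>

/* Hypothetical scalar reference: add an 8x8 block of 16-bit residuals to
 * 16-bit pixels, clamping each result to [0, (1 << bd) - 1], mirroring what
 * the NEON version does with vqaddq_s16/vminq_s16/vqshluq_n_s16. */
static void highbd_add8x8_scalar(const int16_t *res, uint16_t *dest,
                                 int stride, int bd) {
  const int32_t max = (1 << bd) - 1;
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      int32_t v = (int32_t)dest[c] + res[r * 8 + c];
      if (v < 0) v = 0;     /* vqshluq_n_s16 saturates negatives to 0 */
      if (v > max) v = max; /* vminq_s16 against (1 << bd) - 1 */
      dest[c] = (uint16_t)v;
    }
    dest += stride;
  }
}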
void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 8); - int32x4_t a2 = vld1q_s32(input + 16); - int32x4_t a3 = vld1q_s32(input + 24); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 8); + a[2] = vld1q_s32(input + 16); + a[3] = vld1q_s32(input + 24); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); @@ -305,327 +239,133 @@ void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t b0 = vmovn_s32(a0); - int16x4_t b1 = vmovn_s32(a1); - int16x4_t b2 = vmovn_s32(a2); - int16x4_t b3 = vmovn_s32(a3); - int16x4_t b4, b5, b6, b7; - - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, - &b5, &b6, &b7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, - b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); - c0 = vrshrq_n_s16(c0, 5); - c1 = vrshrq_n_s16(c1, 5); - c2 = vrshrq_n_s16(c2, 5); - c3 = vrshrq_n_s16(c3, 5); - c4 = vrshrq_n_s16(c4, 5); - c5 = vrshrq_n_s16(c5, 5); - c6 = vrshrq_n_s16(c6, 5); - c7 = vrshrq_n_s16(c7, 5); + int16x4_t b[8]; + + b[0] = vmovn_s32(a[0]); + b[1] = vmovn_s32(a[1]); + b[2] = vmovn_s32(a[2]); + b[3] = vmovn_s32(a[3]); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, b); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b, c); + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; if (bd == 10) { - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } else { - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, - &a10, &a11); - idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, - &a14, &a15); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[8], &a[9], &a[10], &a[11]); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a[4], &a[5], &a[6], &a[7], + &a[12], &a[13], &a[14], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c3 = 
vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); -} - -static INLINE void idct8x8_64_half1d_bd10( - const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, - int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, - int32x4_t *const io7) { - int32x4_t step1[8], step2[8]; - - transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); - - // stage 1 - step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); - step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); - step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); - step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); - - step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); - step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); - step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); - step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); - - step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); - step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); - step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); - step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); - - // stage 2 - step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); - step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); - step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); - - step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); - step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); - step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); - step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); - - step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); - step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); - step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); - step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); - - step2[4] = vaddq_s32(step1[4], step1[5]); - step2[5] = vsubq_s32(step1[4], step1[5]); - step2[6] = vsubq_s32(step1[7], step1[6]); - step2[7] = vaddq_s32(step1[7], step1[6]); - - // stage 3 - step1[0] = vaddq_s32(step2[0], step2[3]); - step1[1] = vaddq_s32(step2[1], step2[2]); - step1[2] = vsubq_s32(step2[1], step2[2]); - step1[3] = vsubq_s32(step2[0], step2[3]); - - step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); - step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); - step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); - - // stage 4 - *io0 = vaddq_s32(step1[0], step2[7]); - *io1 = 
vaddq_s32(step1[1], step1[6]); - *io2 = vaddq_s32(step1[2], step1[5]); - *io3 = vaddq_s32(step1[3], step2[4]); - *io4 = vsubq_s32(step1[3], step2[4]); - *io5 = vsubq_s32(step1[2], step1[5]); - *io6 = vsubq_s32(step1[1], step1[6]); - *io7 = vsubq_s32(step1[0], step2[7]); -} - -static INLINE void idct8x8_64_half1d_bd12( - const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, - int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, - int32x4_t *const io7) { - int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; - int32x2_t step1l[4], step1h[4]; - int32x4_t step1[8], step2[8]; - int64x2_t t64[8]; - int32x2_t t32[8]; - - transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); - - // stage 1 - input_1l = vget_low_s32(*io1); - input_1h = vget_high_s32(*io1); - input_3l = vget_low_s32(*io3); - input_3h = vget_high_s32(*io3); - input_5l = vget_low_s32(*io5); - input_5h = vget_high_s32(*io5); - input_7l = vget_low_s32(*io7); - input_7h = vget_high_s32(*io7); - step1l[0] = vget_low_s32(*io0); - step1h[0] = vget_high_s32(*io0); - step1l[1] = vget_low_s32(*io2); - step1h[1] = vget_high_s32(*io2); - step1l[2] = vget_low_s32(*io4); - step1h[2] = vget_high_s32(*io4); - step1l[3] = vget_low_s32(*io6); - step1h[3] = vget_high_s32(*io6); - - t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); - t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); - t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); - t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); - t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); - t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); - t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); - t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); - t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); - t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); - t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); - t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); - t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); - t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); - t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); - t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); - t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); - t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); - t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); - t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); - t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); - t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); - t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); - step1[4] = vcombine_s32(t32[0], t32[1]); - step1[5] = vcombine_s32(t32[2], t32[3]); - step1[6] = vcombine_s32(t32[4], t32[5]); - step1[7] = vcombine_s32(t32[6], t32[7]); - - // stage 2 - t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); - t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); - t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); - t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); - t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); - t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); - t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); - t64[1] = vmlal_lane_s32(t64[3], step1h[2], 
vget_high_s32(cospis0), 0); - t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); - t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); - t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); - t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); - t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); - t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); - t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); - t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); - t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); - t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); - t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); - t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); - t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); - t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); - step2[0] = vcombine_s32(t32[0], t32[1]); - step2[1] = vcombine_s32(t32[2], t32[3]); - step2[2] = vcombine_s32(t32[4], t32[5]); - step2[3] = vcombine_s32(t32[6], t32[7]); - - step2[4] = vaddq_s32(step1[4], step1[5]); - step2[5] = vsubq_s32(step1[4], step1[5]); - step2[6] = vsubq_s32(step1[7], step1[6]); - step2[7] = vaddq_s32(step1[7], step1[6]); - - // stage 3 - step1[0] = vaddq_s32(step2[0], step2[3]); - step1[1] = vaddq_s32(step2[1], step2[2]); - step1[2] = vsubq_s32(step2[1], step2[2]); - step1[3] = vsubq_s32(step2[0], step2[3]); - - t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); - t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); - t64[0] = - vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); - t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), - vget_high_s32(cospis0), 0); - t64[2] = - vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); - t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), - vget_high_s32(cospis0), 0); - t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); - t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); - t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); - t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); - step1[5] = vcombine_s32(t32[0], t32[1]); - step1[6] = vcombine_s32(t32[2], t32[3]); - - // stage 4 - *io0 = vaddq_s32(step1[0], step2[7]); - *io1 = vaddq_s32(step1[1], step1[6]); - *io2 = vaddq_s32(step1[2], step1[5]); - *io3 = vaddq_s32(step1[3], step2[4]); - *io4 = vsubq_s32(step1[3], step2[4]); - *io5 = vsubq_s32(step1[2], step1[5]); - *io6 = vsubq_s32(step1[1], step1[6]); - *io7 = vsubq_s32(step1[0], step2[7]); + highbd_add8x8(c, dest, stride, bd); } void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - int32x4_t a0 = vld1q_s32(input); - int32x4_t a1 = vld1q_s32(input + 4); - int32x4_t a2 = vld1q_s32(input + 8); - int32x4_t a3 = vld1q_s32(input + 12); - int32x4_t a4 = vld1q_s32(input + 16); - int32x4_t a5 = vld1q_s32(input + 20); - int32x4_t a6 = vld1q_s32(input + 24); - int32x4_t a7 = vld1q_s32(input + 28); - int32x4_t a8 = vld1q_s32(input + 32); - int32x4_t a9 = vld1q_s32(input + 36); - int32x4_t a10 = vld1q_s32(input + 40); - int32x4_t a11 = vld1q_s32(input + 44); - int32x4_t a12 = vld1q_s32(input + 48); - int32x4_t a13 = vld1q_s32(input + 52); - int32x4_t a14 = vld1q_s32(input + 56); - int32x4_t a15 = vld1q_s32(input + 60); - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = 
vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); if (bd == 8) { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); - int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); - int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); - int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); - int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); - int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); - int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); - int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); - - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - - c0 = vrshrq_n_s16(b0, 5); - c1 = vrshrq_n_s16(b1, 5); - c2 = vrshrq_n_s16(b2, 5); - c3 = vrshrq_n_s16(b3, 5); - c4 = vrshrq_n_s16(b4, 5); - c5 = vrshrq_n_s16(b5, 5); - c6 = vrshrq_n_s16(b6, 5); - c7 = vrshrq_n_s16(b7, 5); + int16x8_t b[8]; + + b[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + b[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + b[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + b[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + b[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + b[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + b[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + b[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + idct8x8_64_1d_bd8(cospis0, cospis1, b); + idct8x8_64_1d_bd8(cospis0, cospis1, b); + + c[0] = vrshrq_n_s16(b[0], 5); + c[1] = vrshrq_n_s16(b[1], 5); + c[2] = vrshrq_n_s16(b[2], 5); + c[3] = vrshrq_n_s16(b[3], 5); + c[4] = vrshrq_n_s16(b[4], 5); + c[5] = vrshrq_n_s16(b[5], 5); + c[6] = vrshrq_n_s16(b[6], 5); + c[7] = vrshrq_n_s16(b[7], 5); } else { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - &a7, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } else { - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, - &a6, &a7); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, - &a14, &a15); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, - &a3, &a11); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, - 
&a7, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); } - c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); - c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); - c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); - c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); - c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); - c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); - c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); - c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } - highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); + highbd_add8x8(c, dest, stride, bd); } diff --git a/libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h b/libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h new file mode 100644 index 00000000000..518ef4336e8 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/highbd_idct_neon.h @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. 
+ const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(a[1], vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(a[1], vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, a[3], vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, a[3], vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a) { + int32x4_t b0, b1, b2, b3; + int64x2_t c[12]; + + transpose_s32_4x4(&a[0], &a[1], &a[2], &a[3]); + b0 = vaddq_s32(a[0], a[2]); + b1 = vsubq_s32(a[0], a[2]); + c[0] = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c[1] = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c[2] = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c[3] = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c[4] = vmull_lane_s32(vget_low_s32(a[1]), vget_high_s32(cospis), 1); + c[5] = vmull_lane_s32(vget_high_s32(a[1]), vget_high_s32(cospis), 1); + c[6] = vmull_lane_s32(vget_low_s32(a[1]), vget_low_s32(cospis), 1); + c[7] = vmull_lane_s32(vget_high_s32(a[1]), vget_low_s32(cospis), 1); + c[8] = vmull_lane_s32(vget_low_s32(a[3]), vget_low_s32(cospis), 1); + c[9] = vmull_lane_s32(vget_high_s32(a[3]), vget_low_s32(cospis), 1); + c[10] = vmull_lane_s32(vget_low_s32(a[3]), vget_high_s32(cospis), 1); + c[11] = vmull_lane_s32(vget_high_s32(a[3]), vget_high_s32(cospis), 1); + c[4] = vsubq_s64(c[4], c[8]); + c[5] = vsubq_s64(c[5], c[9]); + c[6] = vaddq_s64(c[6], c[10]); + c[7] = vaddq_s64(c[7], c[11]); + b0 = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), + vrshrn_n_s64(c[1], DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), + vrshrn_n_s64(c[3], DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c[4], DCT_CONST_BITS), + vrshrn_n_s64(c[5], DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c[6], DCT_CONST_BITS), + vrshrn_n_s64(c[7], DCT_CONST_BITS)); + a[0] = vaddq_s32(b0, b3); + a[1] = vaddq_s32(b1, b2); + a[2] = vsubq_s32(b1, b2); + a[3] = vsubq_s32(b0, b3); +} + +static INLINE void highbd_add8x8(int16x8_t *const a, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a[0], 
vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a[1], vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a[2], vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a[3], vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a[4], vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a[5], vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a[6], vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a[7], vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], 
step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input1l = vget_low_s32(*io1); + input1h = vget_high_s32(*io1); + input3l = vget_low_s32(*io3); + input3h = vget_high_s32(*io3); + input5l = vget_low_s32(*io5); + input5h = vget_high_s32(*io5); + input7l = vget_low_s32(*io7); + input7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = 
vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + 
vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +void 
vpx_highbd_idct16x16_256_add_half1d(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, + const int bd); + +#endif // VPX_VPX_DSP_ARM_HIGHBD_IDCT_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index 5c5963d277e..fc7f4a7747c 100644 --- a/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -63,65 +63,6 @@ static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, wrap_low_4x2(t32, d0, d1); } -static INLINE void idct16x16_add_store(const int16x8_t *const out, - uint8_t *dest, const int stride) { - // Add the result to dest - idct16x16_add8x1(out[0], &dest, stride); - idct16x16_add8x1(out[1], &dest, stride); - idct16x16_add8x1(out[2], &dest, stride); - idct16x16_add8x1(out[3], &dest, stride); - idct16x16_add8x1(out[4], &dest, stride); - idct16x16_add8x1(out[5], &dest, stride); - idct16x16_add8x1(out[6], &dest, stride); - idct16x16_add8x1(out[7], &dest, stride); - idct16x16_add8x1(out[8], &dest, stride); - idct16x16_add8x1(out[9], &dest, stride); - idct16x16_add8x1(out[10], &dest, stride); - idct16x16_add8x1(out[11], &dest, stride); - idct16x16_add8x1(out[12], &dest, stride); - idct16x16_add8x1(out[13], &dest, stride); - idct16x16_add8x1(out[14], &dest, stride); - idct16x16_add8x1(out[15], &dest, stride); -} - -static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, - const int stride) { - // Add the result to dest - const int16x8_t max = vdupq_n_s16((1 << 8) - 1); - out[0] = vrshrq_n_s16(out[0], 6); - out[1] = vrshrq_n_s16(out[1], 6); - out[2] = vrshrq_n_s16(out[2], 6); - out[3] = vrshrq_n_s16(out[3], 6); - out[4] = vrshrq_n_s16(out[4], 6); - out[5] = vrshrq_n_s16(out[5], 6); - out[6] = vrshrq_n_s16(out[6], 6); - out[7] = vrshrq_n_s16(out[7], 6); - out[8] = vrshrq_n_s16(out[8], 6); - out[9] = vrshrq_n_s16(out[9], 6); - out[10] = vrshrq_n_s16(out[10], 6); - out[11] = vrshrq_n_s16(out[11], 6); - out[12] = vrshrq_n_s16(out[12], 6); - out[13] = vrshrq_n_s16(out[13], 6); - out[14] = vrshrq_n_s16(out[14], 6); - out[15] = vrshrq_n_s16(out[15], 6); - highbd_idct16x16_add8x1(out[0], max, &dest, stride); - highbd_idct16x16_add8x1(out[1], max, &dest, stride); - highbd_idct16x16_add8x1(out[2], max, &dest, stride); - highbd_idct16x16_add8x1(out[3], max, &dest, stride); - highbd_idct16x16_add8x1(out[4], max, &dest, stride); - highbd_idct16x16_add8x1(out[5], max, &dest, stride); - highbd_idct16x16_add8x1(out[6], max, &dest, stride); - highbd_idct16x16_add8x1(out[7], max, &dest, stride); - highbd_idct16x16_add8x1(out[8], max, &dest, stride); - highbd_idct16x16_add8x1(out[9], max, &dest, stride); - highbd_idct16x16_add8x1(out[10], max, &dest, stride); - highbd_idct16x16_add8x1(out[11], max, &dest, stride); - highbd_idct16x16_add8x1(out[12], max, &dest, stride); - highbd_idct16x16_add8x1(out[13], max, &dest, stride); - highbd_idct16x16_add8x1(out[14], max, &dest, stride); - highbd_idct16x16_add8x1(out[15], max, &dest, stride); -} - void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, void *const dest, const int stride, const int highbd_flag) { diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c index 021211bc990..057731ad924 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -650,14 +650,10 @@ void vpx_idct32_16_neon(const int16_t *const input, void *const output, 
highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c index f3c336fa31f..f570547e449 100644 --- a/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -490,14 +490,10 @@ void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; - add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], outputT, stride); - add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], outputT + (8 * stride), stride); - add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], outputT + (16 * stride), stride); - add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], outputT + (24 * stride), stride); + add_and_store_u8_s16(out + 0, outputT, stride); + add_and_store_u8_s16(out + 8, outputT + (8 * stride), stride); + add_and_store_u8_s16(out + 16, outputT + (16 * stride), stride); + add_and_store_u8_s16(out + 24, outputT + (24 * stride), stride); } } diff --git a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 673a36840e3..8192ee4cf87 100644 --- a/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -19,44 +19,41 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const uint8_t *dst = dest; - const int16x4_t cospis = vld1_s16(kCospi); - uint8x8_t dest01_u8; - uint32x2_t dest32_u32 = vdup_n_u32(0); - int16x8_t a0, a1; - uint8x8_t d01, d32; - uint16x8_t d01_u16, d32_u16; + uint32x2_t s32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s, d[2]; + uint16x8_t sum[2]; assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % sizeof(uint32_t))); // Rows - a0 = load_tran_low_to_s16q(input); - a1 = load_tran_low_to_s16q(input + 8); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_idct4x4_16_bd8(a); // Columns - a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); - idct4x4_16_kernel_bd8(cospis, &a0, &a1); - a0 = vrshrq_n_s16(a0, 4); - a1 = vrshrq_n_s16(a1, 4); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_idct4x4_16_bd8(a); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); - dest01_u8 = load_u8(dst, stride); + s = load_u8(dst, stride); dst += 2 * stride; // The elements are loaded in reverse order. 
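/*
 * Standalone illustration (hypothetical pixel values) of the reverse-order
 * lane loads used below: the two 4-byte destination rows are packed into a
 * single 64-bit vector with the later row in lane 0, mirroring the reversed
 * store order at the bottom of the function.
 */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  _Alignas(4) const uint8_t row2[4] = { 20, 21, 22, 23 };
  _Alignas(4) const uint8_t row3[4] = { 30, 31, 32, 33 };
  uint8_t packed[8];
  uint32x2_t s32 = vdup_n_u32(0);
  int i;

  /* Row 2 goes into lane 1 (high half), then row 3 into lane 0 (low half). */
  s32 = vld1_lane_u32((const uint32_t *)row2, s32, 1);
  s32 = vld1_lane_u32((const uint32_t *)row3, s32, 0);
  vst1_u8(packed, vreinterpret_u8_u32(s32));
  for (i = 0; i < 8; i++) printf("%u ", packed[i]);
  printf("\n"); /* prints: 30 31 32 33 20 21 22 23 */
  return 0;
}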
- dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 1); dst += stride; - dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + s32 = vld1_lane_u32((const uint32_t *)dst, s32, 0); - d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8); - d32_u16 = - vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); - d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); - d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), vreinterpret_u8_u32(s32)); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); - store_u8(dest, stride, d01); + store_u8(dest, stride, d[0]); dest += 2 * stride; // The elements are stored in reverse order. - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 1); dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d[1]), 0); } diff --git a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 1121ade2796..7471387e476 100644 --- a/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/libs/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -17,91 +17,25 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, - int16x8_t a3, int16x8_t a4, int16x8_t a5, - int16x8_t a6, int16x8_t a7, uint8_t *dest, - const int stride) { - const uint8_t *dst = dest; - uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; - - a0 = vrshrq_n_s16(a0, 5); - a1 = vrshrq_n_s16(a1, 5); - a2 = vrshrq_n_s16(a2, 5); - a3 = vrshrq_n_s16(a3, 5); - a4 = vrshrq_n_s16(a4, 5); - a5 = vrshrq_n_s16(a5, 5); - a6 = vrshrq_n_s16(a6, 5); - a7 = vrshrq_n_s16(a7, 5); - - d0 = vld1_u8(dst); - dst += stride; - d1 = vld1_u8(dst); - dst += stride; - d2 = vld1_u8(dst); - dst += stride; - d3 = vld1_u8(dst); - dst += stride; - d4 = vld1_u8(dst); - dst += stride; - d5 = vld1_u8(dst); - dst += stride; - d6 = vld1_u8(dst); - dst += stride; - d7 = vld1_u8(dst); - - d0_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), d0); - d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); - d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); - d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); - d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); - d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); - d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); - d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); - - d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); - d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); - d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); - d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); - d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); - d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); - d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); - d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); - - vst1_u8(dest, d0); - dest += stride; - vst1_u8(dest, d1); - dest += stride; - vst1_u8(dest, d2); - dest += stride; - vst1_u8(dest, d3); - dest += stride; - vst1_u8(dest, d4); - dest += stride; - vst1_u8(dest, d5); - dest += stride; - vst1_u8(dest, d6); - dest += stride; - vst1_u8(dest, d7); -} - void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16x8_t 
cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 - int16x8_t a0 = load_tran_low_to_s16q(input); - int16x8_t a1 = load_tran_low_to_s16q(input + 8); - int16x8_t a2 = load_tran_low_to_s16q(input + 16); - int16x8_t a3 = load_tran_low_to_s16q(input + 24); - int16x8_t a4 = load_tran_low_to_s16q(input + 32); - int16x8_t a5 = load_tran_low_to_s16q(input + 40); - int16x8_t a6 = load_tran_low_to_s16q(input + 48); - int16x8_t a7 = load_tran_low_to_s16q(input + 56); - - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + a[2] = load_tran_low_to_s16q(input + 16); + a[3] = load_tran_low_to_s16q(input + 24); + a[4] = load_tran_low_to_s16q(input + 32); + a[5] = load_tran_low_to_s16q(input + 40); + a[6] = load_tran_low_to_s16q(input + 48); + a[7] = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_64_1d_bd8(cospis0, cospis1, a); + idct8x8_add8x8_neon(a, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, @@ -111,17 +45,15 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 - int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; - int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x4_t a[8]; + int16x8_t b[8]; - a0 = load_tran_low_to_s16d(input); - a1 = load_tran_low_to_s16d(input + 8); - a2 = load_tran_low_to_s16d(input + 16); - a3 = load_tran_low_to_s16d(input + 24); + a[0] = load_tran_low_to_s16d(input); + a[1] = load_tran_low_to_s16d(input + 8); + a[2] = load_tran_low_to_s16d(input + 16); + a[3] = load_tran_low_to_s16d(input + 24); - idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, - &a5, &a6, &a7); - idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, - a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); - add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, a); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a, b); + idct8x8_add8x8_neon(b, dest, stride); } diff --git a/libs/libvpx/vpx_dsp/arm/idct_neon.h b/libs/libvpx/vpx_dsp/arm/idct_neon.h index 6ed02af5acc..c02311326bd 100644 --- a/libs/libvpx/vpx_dsp/arm/idct_neon.h +++ b/libs/libvpx/vpx_dsp/arm/idct_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_ARM_IDCT_NEON_H_ -#define VPX_DSP_ARM_IDCT_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_VPX_DSP_ARM_IDCT_NEON_H_ #include <arm_neon.h> @@ -78,6 +78,28 @@ static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, //------------------------------------------------------------------------------ +static INLINE int16x8_t dct_const_round_shift_low_8(const int32x4_t *const in) { + return vcombine_s16(vrshrn_n_s32(in[0], DCT_CONST_BITS), + vrshrn_n_s32(in[1], DCT_CONST_BITS)); +} + +static INLINE void dct_const_round_shift_low_8_dual(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + *d0 = dct_const_round_shift_low_8(t32 + 0); + *d1 = dct_const_round_shift_low_8(t32 + 2); +} + +static INLINE int32x4x2_t +dct_const_round_shift_high_4x2(const int64x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[2], DCT_CONST_BITS), + vrshrn_n_s64(in[3], DCT_CONST_BITS)); + return out; +} + // Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { @@ -102,24 +124,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16( // input) this function can not use vaddq_s16. // In order to match existing behavior and intentionally out of range tests, // expand the addition up to 32 bits to prevent truncation. - int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( const int16x8_t a, const int16x8_t b, const int16_t ab_const) { - int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); - int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); - temp_low = vmulq_n_s32(temp_low, ab_const); - temp_high = vmulq_n_s32(temp_high, ab_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + t[1] = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + t[0] = vmulq_n_s32(t[0], ab_const); + t[1] = vmulq_n_s32(t[1], ab_const); + return dct_const_round_shift_low_8(t); } // Multiply a by a_const and b by b_const, then accumulate.
Shift and narrow by @@ -127,12 +149,12 @@ static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( const int16x8_t a, const int16_t a_const, const int16x8_t b, const int16_t b_const) { - int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); - int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); - temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); - temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); - return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS), - vrshrn_n_s32(temp_high, DCT_CONST_BITS)); + int32x4_t t[2]; + t[0] = vmull_n_s16(vget_low_s16(a), a_const); + t[1] = vmull_n_s16(vget_high_s16(a), a_const); + t[0] = vmlal_n_s16(t[0], vget_low_s16(b), b_const); + t[1] = vmlal_n_s16(t[1], vget_high_s16(b), b_const); + return dct_const_round_shift_low_8(t); } //------------------------------------------------------------------------------ @@ -145,53 +167,43 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( static INLINE int32x4x2_t multiply_shift_and_narrow_s32_dual(const int32x4x2_t a, const int32_t a_const) { int64x2_t b[4]; - int32x4x2_t c; + b[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); b[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); b[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); b[3] = vmull_n_s32(vget_high_s32(a.val[1]), a_const); - c.val[0] = vcombine_s32(vrshrn_n_s64(b[0], DCT_CONST_BITS), - vrshrn_n_s64(b[1], DCT_CONST_BITS)); - c.val[1] = vcombine_s32(vrshrn_n_s64(b[2], DCT_CONST_BITS), - vrshrn_n_s64(b[3], DCT_CONST_BITS)); - return c; + return dct_const_round_shift_high_4x2(b); } // Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. static INLINE int32x4x2_t add_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vaddq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vaddq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vaddq_s32(a.val[0], b.val[0]); + t[1] = vaddq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Subtract b from a, then multiply by ab_const. Shift and narrow by // DCT_CONST_BITS. 
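/*
 * Scalar reference sketch for the widen/multiply/round/narrow pattern that
 * the *_multiply_shift_and_narrow_* helpers implement with vmull_n_s32 and
 * vrshrn_n_s64 (or their 16-bit counterparts): multiply in double width,
 * add half of the divisor, then shift back down. This assumes libvpx's
 * DCT_CONST_BITS value of 14; the constant is redefined locally so the
 * sketch stays standalone.
 */
#include <stdint.h>

#define SKETCH_DCT_CONST_BITS 14

static int32_t scalar_mul_round_shift(int32_t a, int32_t ab_const) {
  const int64_t product = (int64_t)a * ab_const; /* vmull_n_s32 */
  const int64_t rounded =
      product + ((int64_t)1 << (SKETCH_DCT_CONST_BITS - 1)); /* rounding bias */
  return (int32_t)(rounded >> SKETCH_DCT_CONST_BITS); /* vrshrn_n_s64 */
}

/* Per 32-bit lane, sub_multiply_shift_and_narrow_s32_dual() below matches
 * scalar_mul_round_shift(a[i] - b[i], ab_const). */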
static INLINE int32x4x2_t sub_multiply_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32x4x2_t b, const int32_t ab_const) { - const int32x4_t temp_low = vsubq_s32(a.val[0], b.val[0]); - const int32x4_t temp_high = vsubq_s32(a.val[1], b.val[1]); + int32x4_t t[2]; int64x2_t c[4]; - int32x4x2_t d; - c[0] = vmull_n_s32(vget_low_s32(temp_low), ab_const); - c[1] = vmull_n_s32(vget_high_s32(temp_low), ab_const); - c[2] = vmull_n_s32(vget_low_s32(temp_high), ab_const); - c[3] = vmull_n_s32(vget_high_s32(temp_high), ab_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + + t[0] = vsubq_s32(a.val[0], b.val[0]); + t[1] = vsubq_s32(a.val[1], b.val[1]); + c[0] = vmull_n_s32(vget_low_s32(t[0]), ab_const); + c[1] = vmull_n_s32(vget_high_s32(t[0]), ab_const); + c[2] = vmull_n_s32(vget_low_s32(t[1]), ab_const); + c[3] = vmull_n_s32(vget_high_s32(t[1]), ab_const); + return dct_const_round_shift_high_4x2(c); } // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by @@ -200,7 +212,6 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( const int32x4x2_t a, const int32_t a_const, const int32x4x2_t b, const int32_t b_const) { int64x2_t c[4]; - int32x4x2_t d; c[0] = vmull_n_s32(vget_low_s32(a.val[0]), a_const); c[1] = vmull_n_s32(vget_high_s32(a.val[0]), a_const); c[2] = vmull_n_s32(vget_low_s32(a.val[1]), a_const); @@ -209,72 +220,66 @@ static INLINE int32x4x2_t multiply_accumulate_shift_and_narrow_s32_dual( c[1] = vmlal_n_s32(c[1], vget_high_s32(b.val[0]), b_const); c[2] = vmlal_n_s32(c[2], vget_low_s32(b.val[1]), b_const); c[3] = vmlal_n_s32(c[3], vget_high_s32(b.val[1]), b_const); - d.val[0] = vcombine_s32(vrshrn_n_s64(c[0], DCT_CONST_BITS), - vrshrn_n_s64(c[1], DCT_CONST_BITS)); - d.val[1] = vcombine_s32(vrshrn_n_s64(c[2], DCT_CONST_BITS), - vrshrn_n_s64(c[3], DCT_CONST_BITS)); - return d; + return dct_const_round_shift_high_4x2(c); } // Shift the output down by 6 and add it to the destination buffer. 
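/*
 * Per-lane sketch (standalone, illustrative values) of the
 * vrsraq_n_s16(b, a, 6) step used in add_and_store_u8_s16() below: add a
 * rounding shift right by 6 of the residual to the widened destination
 * pixel, i.e. the "c = b + (a >> 6)" comment with round-to-nearest made
 * explicit.
 */
#include <stdint.h>
#include <stdio.h>

static int16_t round_shift_6_accumulate(int16_t pixel, int16_t residual) {
  return (int16_t)(pixel + ((residual + (1 << 5)) >> 6));
}

int main(void) {
  /* 96 + 32 = 128 and 128 >> 6 = 2, so the pixel moves from 100 to 102. */
  printf("%d\n", round_shift_6_accumulate(100, 96));
  return 0;
}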
-static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, - const int16x8_t a2, const int16x8_t a3, - const int16x8_t a4, const int16x8_t a5, - const int16x8_t a6, const int16x8_t a7, - uint8_t *b, const int b_stride) { - uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; - int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; - b0 = vld1_u8(b); - b += b_stride; - b1 = vld1_u8(b); - b += b_stride; - b2 = vld1_u8(b); - b += b_stride; - b3 = vld1_u8(b); - b += b_stride; - b4 = vld1_u8(b); - b += b_stride; - b5 = vld1_u8(b); - b += b_stride; - b6 = vld1_u8(b); - b += b_stride; - b7 = vld1_u8(b); - b -= (7 * b_stride); +static INLINE void add_and_store_u8_s16(const int16x8_t *const a, uint8_t *d, + const int stride) { + uint8x8_t b[8]; + int16x8_t c[8]; + + b[0] = vld1_u8(d); + d += stride; + b[1] = vld1_u8(d); + d += stride; + b[2] = vld1_u8(d); + d += stride; + b[3] = vld1_u8(d); + d += stride; + b[4] = vld1_u8(d); + d += stride; + b[5] = vld1_u8(d); + d += stride; + b[6] = vld1_u8(d); + d += stride; + b[7] = vld1_u8(d); + d -= (7 * stride); // c = b + (a >> 6) - c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); - c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); - c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); - c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); - c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); - c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); - c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); - c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); - - b0 = vqmovun_s16(c0); - b1 = vqmovun_s16(c1); - b2 = vqmovun_s16(c2); - b3 = vqmovun_s16(c3); - b4 = vqmovun_s16(c4); - b5 = vqmovun_s16(c5); - b6 = vqmovun_s16(c6); - b7 = vqmovun_s16(c7); - - vst1_u8(b, b0); - b += b_stride; - vst1_u8(b, b1); - b += b_stride; - vst1_u8(b, b2); - b += b_stride; - vst1_u8(b, b3); - b += b_stride; - vst1_u8(b, b4); - b += b_stride; - vst1_u8(b, b5); - b += b_stride; - vst1_u8(b, b6); - b += b_stride; - vst1_u8(b, b7); + c[0] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[0])), a[0], 6); + c[1] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[1])), a[1], 6); + c[2] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[2])), a[2], 6); + c[3] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[3])), a[3], 6); + c[4] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[4])), a[4], 6); + c[5] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[5])), a[5], 6); + c[6] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[6])), a[6], 6); + c[7] = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b[7])), a[7], 6); + + b[0] = vqmovun_s16(c[0]); + b[1] = vqmovun_s16(c[1]); + b[2] = vqmovun_s16(c[2]); + b[3] = vqmovun_s16(c[3]); + b[4] = vqmovun_s16(c[4]); + b[5] = vqmovun_s16(c[5]); + b[6] = vqmovun_s16(c[6]); + b[7] = vqmovun_s16(c[7]); + + vst1_u8(d, b[0]); + d += stride; + vst1_u8(d, b[1]); + d += stride; + vst1_u8(d, b[2]); + d += stride; + vst1_u8(d, b[3]); + d += stride; + vst1_u8(d, b[4]); + d += stride; + vst1_u8(d, b[5]); + d += stride; + vst1_u8(d, b[6]); + d += stride; + vst1_u8(d, b[7]); } static INLINE uint8x16_t create_dcq(const int16_t dc) { @@ -283,56 +288,53 @@ static INLINE uint8x16_t create_dcq(const int16_t dc) { return vdupq_n_u8((uint8_t)t); } -static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, - int16x8_t *const a0, - int16x8_t *const a1) { - int16x4_t b0, b1, b2, b3; - int32x4_t c0, c1, c2, c3; - int16x8_t d0, d1; - - transpose_s16_4x4q(a0, a1); - b0 = vget_low_s16(*a0); - b1 = 
vget_high_s16(*a0); - b2 = vget_low_s16(*a1); - b3 = vget_high_s16(*a1); - c0 = vmull_lane_s16(b0, cospis, 2); - c2 = vmull_lane_s16(b1, cospis, 2); - c1 = vsubq_s32(c0, c2); - c0 = vaddq_s32(c0, c2); - c2 = vmull_lane_s16(b2, cospis, 3); - c3 = vmull_lane_s16(b2, cospis, 1); - c2 = vmlsl_lane_s16(c2, b3, cospis, 1); - c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); - b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); - b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); - b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); - d0 = vcombine_s16(b0, b1); - d1 = vcombine_s16(b3, b2); - *a0 = vaddq_s16(d0, d1); - *a1 = vsubq_s16(d0, d1); -} - -static INLINE void idct8x8_12_pass1_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, - int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, - int16x4_t *const io6, int16x4_t *const io7) { +static INLINE void idct4x4_16_kernel_bd8(int16x8_t *const a) { + const int16x4_t cospis = vld1_s16(kCospi); + int16x4_t b[4]; + int32x4_t c[4]; + int16x8_t d[2]; + + b[0] = vget_low_s16(a[0]); + b[1] = vget_high_s16(a[0]); + b[2] = vget_low_s16(a[1]); + b[3] = vget_high_s16(a[1]); + c[0] = vmull_lane_s16(b[0], cospis, 2); + c[2] = vmull_lane_s16(b[1], cospis, 2); + c[1] = vsubq_s32(c[0], c[2]); + c[0] = vaddq_s32(c[0], c[2]); + c[3] = vmull_lane_s16(b[2], cospis, 3); + c[2] = vmull_lane_s16(b[2], cospis, 1); + c[3] = vmlsl_lane_s16(c[3], b[3], cospis, 1); + c[2] = vmlal_lane_s16(c[2], b[3], cospis, 3); + dct_const_round_shift_low_8_dual(c, &d[0], &d[1]); + a[0] = vaddq_s16(d[0], d[1]); + a[1] = vsubq_s16(d[0], d[1]); +} + +static INLINE void transpose_idct4x4_16_bd8(int16x8_t *const a) { + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); +} + +static INLINE void idct8x8_12_pass1_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + int16x4_t *const io) { int16x4_t step1[8], step2[8]; int32x4_t t32[2]; - transpose_s16_4x4d(io0, io1, io2, io3); + transpose_s16_4x4d(&io[0], &io[1], &io[2], &io[3]); // stage 1 - step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); - step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); - step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); - step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + step1[4] = vqrdmulh_lane_s16(io[1], cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(io[3], cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(io[3], cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(io[1], cospisd1, 0); // stage 2 - step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); - step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); - step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + step2[1] = vqrdmulh_lane_s16(io[0], cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(io[2], cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(io[2], cospisd0, 1); step2[4] = vadd_s16(step1[4], step1[5]); step2[5] = vsub_s16(step1[4], step1[5]); @@ -352,32 +354,27 @@ static INLINE void idct8x8_12_pass1_bd8( step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); // stage 4 - *io0 = vadd_s16(step1[0], step2[7]); - *io1 = vadd_s16(step1[1], step1[6]); - *io2 = vadd_s16(step1[2], step1[5]); - *io3 = vadd_s16(step1[3], step2[4]); - *io4 = vsub_s16(step1[3], step2[4]); - *io5 = vsub_s16(step1[2], step1[5]); - *io6 = vsub_s16(step1[1], step1[6]); - *io7 = vsub_s16(step1[0], step2[7]); -} - -static INLINE void idct8x8_12_pass2_bd8( - const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, - const int16x4_t input0, const int16x4_t input1, const 
int16x4_t input2, - const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, - const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, - int16x8_t *const output1, int16x8_t *const output2, - int16x8_t *const output3, int16x8_t *const output4, - int16x8_t *const output5, int16x8_t *const output6, - int16x8_t *const output7) { + io[0] = vadd_s16(step1[0], step2[7]); + io[1] = vadd_s16(step1[1], step1[6]); + io[2] = vadd_s16(step1[2], step1[5]); + io[3] = vadd_s16(step1[3], step2[4]); + io[4] = vsub_s16(step1[3], step2[4]); + io[5] = vsub_s16(step1[2], step1[5]); + io[6] = vsub_s16(step1[1], step1[6]); + io[7] = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8(const int16x4_t cospis0, + const int16x4_t cospisd0, + const int16x4_t cospisd1, + const int16x4_t *const input, + int16x8_t *const output) { int16x8_t in[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, - input7, &in[0], &in[1], &in[2], &in[3]); + transpose_s16_4x8(input[0], input[1], input[2], input[3], input[4], input[5], + input[6], input[7], &in[0], &in[1], &in[2], &in[3]); // stage 1 step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); @@ -407,86 +404,64 @@ static INLINE void idct8x8_12_pass2_bd8( t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *output0 = vaddq_s16(step1[0], step2[7]); - *output1 = vaddq_s16(step1[1], step1[6]); - *output2 = vaddq_s16(step1[2], step1[5]); - *output3 = vaddq_s16(step1[3], step2[4]); - *output4 = vsubq_s16(step1[3], step2[4]); - *output5 = vsubq_s16(step1[2], step1[5]); - *output6 = vsubq_s16(step1[1], step1[6]); - *output7 = vsubq_s16(step1[0], step2[7]); -} - -static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, - const int16x4_t cospis1, - int16x8_t *const io0, int16x8_t *const io1, - int16x8_t *const io2, int16x8_t *const io3, - int16x8_t *const io4, int16x8_t *const io5, - int16x8_t *const io6, - int16x8_t *const io7) { - int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, - input_7l, input_7h; + output[0] = vaddq_s16(step1[0], step2[7]); + output[1] = vaddq_s16(step1[1], step1[6]); + output[2] = vaddq_s16(step1[2], step1[5]); + output[3] = vaddq_s16(step1[3], step2[4]); + output[4] = vsubq_s16(step1[3], step2[4]); + output[5] = vsubq_s16(step1[2], step1[5]); + output[6] = vsubq_s16(step1[1], step1[6]); + output[7] = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8_kernel(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + int16x4_t input1l, input1h, input3l, input3h, input5l, input5h, input7l, + input7h; int16x4_t step1l[4], step1h[4]; int16x8_t step1[8], step2[8]; int32x4_t t32[8]; - int16x4_t t16[8]; - - transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); // stage 1 - input_1l = vget_low_s16(*io1); - input_1h = vget_high_s16(*io1); - input_3l = vget_low_s16(*io3); - input_3h = vget_high_s16(*io3); - input_5l = vget_low_s16(*io5); - input_5h = 
vget_high_s16(*io5); - input_7l = vget_low_s16(*io7); - input_7h = vget_high_s16(*io7); - step1l[0] = vget_low_s16(*io0); - step1h[0] = vget_high_s16(*io0); - step1l[1] = vget_low_s16(*io2); - step1h[1] = vget_high_s16(*io2); - step1l[2] = vget_low_s16(*io4); - step1h[2] = vget_high_s16(*io4); - step1l[3] = vget_low_s16(*io6); - step1h[3] = vget_high_s16(*io6); - - t32[0] = vmull_lane_s16(input_1l, cospis1, 3); - t32[1] = vmull_lane_s16(input_1h, cospis1, 3); - t32[2] = vmull_lane_s16(input_3l, cospis1, 2); - t32[3] = vmull_lane_s16(input_3h, cospis1, 2); - t32[4] = vmull_lane_s16(input_3l, cospis1, 1); - t32[5] = vmull_lane_s16(input_3h, cospis1, 1); - t32[6] = vmull_lane_s16(input_1l, cospis1, 0); - t32[7] = vmull_lane_s16(input_1h, cospis1, 0); - t32[0] = vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); - t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); - t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); - t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); - t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); - t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); - t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); - t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step1[4] = vcombine_s16(t16[0], t16[1]); - step1[5] = vcombine_s16(t16[2], t16[3]); - step1[6] = vcombine_s16(t16[4], t16[5]); - step1[7] = vcombine_s16(t16[6], t16[7]); + input1l = vget_low_s16(io[1]); + input1h = vget_high_s16(io[1]); + input3l = vget_low_s16(io[3]); + input3h = vget_high_s16(io[3]); + input5l = vget_low_s16(io[5]); + input5h = vget_high_s16(io[5]); + input7l = vget_low_s16(io[7]); + input7h = vget_high_s16(io[7]); + step1l[0] = vget_low_s16(io[0]); + step1h[0] = vget_high_s16(io[0]); + step1l[1] = vget_low_s16(io[2]); + step1h[1] = vget_high_s16(io[2]); + step1l[2] = vget_low_s16(io[4]); + step1h[2] = vget_high_s16(io[4]); + step1l[3] = vget_low_s16(io[6]); + step1h[3] = vget_high_s16(io[6]); + + t32[0] = vmull_lane_s16(input1l, cospis1, 3); + t32[1] = vmull_lane_s16(input1h, cospis1, 3); + t32[2] = vmull_lane_s16(input3l, cospis1, 2); + t32[3] = vmull_lane_s16(input3h, cospis1, 2); + t32[4] = vmull_lane_s16(input3l, cospis1, 1); + t32[5] = vmull_lane_s16(input3h, cospis1, 1); + t32[6] = vmull_lane_s16(input1l, cospis1, 0); + t32[7] = vmull_lane_s16(input1h, cospis1, 0); + t32[0] = vmlsl_lane_s16(t32[0], input7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input7h, cospis1, 3); + dct_const_round_shift_low_8_dual(&t32[0], &step1[4], &step1[5]); + dct_const_round_shift_low_8_dual(&t32[4], &step1[6], &step1[7]); // stage 2 t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); @@ -503,18 +478,8 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); 
t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); - t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); - t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); - t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); - step2[0] = vcombine_s16(t16[0], t16[1]); - step2[1] = vcombine_s16(t16[2], t16[3]); - step2[2] = vcombine_s16(t16[4], t16[5]); - step2[3] = vcombine_s16(t16[6], t16[7]); + dct_const_round_shift_low_8_dual(&t32[0], &step2[0], &step2[1]); + dct_const_round_shift_low_8_dual(&t32[4], &step2[2], &step2[3]); step2[4] = vaddq_s16(step1[4], step1[5]); step2[5] = vsubq_s16(step1[4], step1[5]); @@ -533,35 +498,25 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - step1[5] = vcombine_s16(t16[0], t16[1]); - step1[6] = vcombine_s16(t16[2], t16[3]); + dct_const_round_shift_low_8_dual(t32, &step1[5], &step1[6]); // stage 4 - *io0 = vaddq_s16(step1[0], step2[7]); - *io1 = vaddq_s16(step1[1], step1[6]); - *io2 = vaddq_s16(step1[2], step1[5]); - *io3 = vaddq_s16(step1[3], step2[4]); - *io4 = vsubq_s16(step1[3], step2[4]); - *io5 = vsubq_s16(step1[2], step1[5]); - *io6 = vsubq_s16(step1[1], step1[6]); - *io7 = vsubq_s16(step1[0], step2[7]); + io[0] = vaddq_s16(step1[0], step2[7]); + io[1] = vaddq_s16(step1[1], step1[6]); + io[2] = vaddq_s16(step1[2], step1[5]); + io[3] = vaddq_s16(step1[3], step2[4]); + io[4] = vsubq_s16(step1[3], step2[4]); + io[5] = vsubq_s16(step1[2], step1[5]); + io[6] = vsubq_s16(step1[1], step1[6]); + io[7] = vsubq_s16(step1[0], step2[7]); } -static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, - int16x8_t *const d0, - int16x8_t *const d1) { - int16x4_t t16[4]; - - t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); - t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); - t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); - t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); - *d0 = vcombine_s16(t16[0], t16[1]); - *d1 = vcombine_s16(t16[2], t16[3]); +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io) { + transpose_s16_8x8(&io[0], &io[1], &io[2], &io[3], &io[4], &io[5], &io[6], + &io[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, io); } static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, @@ -584,7 +539,7 @@ static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, int32x4_t t32[4]; idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, @@ -596,7 +551,7 @@ static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); t32[2] = vnegq_s32(t32[2]); t32[3] = vnegq_s32(t32[3]); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void 
idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, @@ -611,7 +566,7 @@ static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, @@ -627,7 +582,7 @@ static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, @@ -643,7 +598,7 @@ static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, @@ -659,7 +614,7 @@ static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 1); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 1); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 1); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, @@ -675,7 +630,7 @@ static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, @@ -691,7 +646,7 @@ static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, @@ -707,7 +662,7 @@ static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26N_14_18N, 3); t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26N_14_18N, 3); t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26N_14_18N, 3); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + dct_const_round_shift_low_8_dual(t32, d0, d1); } static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, @@ -786,129 +741,153 @@ static INLINE void idct16x16_store_pass1(const int16x8_t *const out, 
vst1q_s16(output, out[15]); } -static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, - const int stride) { - uint8x8_t d = vld1_u8(*dest); - uint16x8_t q; - - res = vrshrq_n_s16(res, 6); - q = vaddw_u8(vreinterpretq_u16_s16(res), d); - d = vqmovun_s16(vreinterpretq_s16_u16(q)); +static INLINE void idct8x8_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 5); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); vst1_u8(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, - uint16_t **dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); +static INLINE void idct8x8_add8x8_neon(int16x8_t *const out, uint8_t *dest, + const int stride) { + idct8x8_add8x1(out[0], &dest, stride); + idct8x8_add8x1(out[1], &dest, stride); + idct8x8_add8x1(out[2], &dest, stride); + idct8x8_add8x1(out[3], &dest, stride); + idct8x8_add8x1(out[4], &dest, stride); + idct8x8_add8x1(out[5], &dest, stride); + idct8x8_add8x1(out[6], &dest, stride); + idct8x8_add8x1(out[7], &dest, stride); +} - res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); - res = vminq_s16(res, max); - d = vqshluq_n_s16(res, 0); +static INLINE void idct16x16_add8x1(const int16x8_t a, uint8_t **const dest, + const int stride) { + const uint8x8_t s = vld1_u8(*dest); + const int16x8_t res = vrshrq_n_s16(a, 6); + const uint16x8_t q = vaddw_u8(vreinterpretq_u16_s16(res), s); + const uint8x8_t d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1(const int16x8_t a, + const int16x8_t max, + uint16_t **const dest, + const int stride) { + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res0 = vqaddq_s16(a, vreinterpretq_s16_u16(s)); + const int16x8_t res1 = vminq_s16(res0, max); + const uint16x8_t d = vqshluq_n_s16(res1, 0); vst1q_u16(*dest, d); *dest += stride; } -static INLINE void highbd_idct16x16_add8x1_bd8(int16x8_t res, uint16_t **dest, +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 6); + 
out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, &dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} + +static INLINE void highbd_idct16x16_add8x1_bd8(const int16x8_t a, + uint16_t **const dest, const int stride) { - uint16x8_t d = vld1q_u16(*dest); - - res = vrsraq_n_s16(vreinterpretq_s16_u16(d), res, 6); - d = vmovl_u8(vqmovun_s16(res)); + const uint16x8_t s = vld1q_u16(*dest); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), a, 6); + const uint16x8_t d = vmovl_u8(vqmovun_s16(res)); vst1q_u16(*dest, d); *dest += stride; } static INLINE void highbd_add_and_store_bd8(const int16x8_t *const a, - uint16_t *out, const int b_stride) { - highbd_idct16x16_add8x1_bd8(a[0], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[1], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[2], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[3], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[4], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[5], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[6], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[7], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[8], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[9], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[10], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[11], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[12], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[13], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[14], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[15], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[16], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[17], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[18], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[19], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[20], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[21], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[22], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[23], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[24], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[25], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[26], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[27], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[28], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[29], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[30], &out, b_stride); - highbd_idct16x16_add8x1_bd8(a[31], &out, b_stride); -} - -static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, - uint16_t *dest, const int stride, - const int bd) { - // Add the result to dest - const int16x8_t max = vdupq_n_s16((1 << 
bd) - 1); - int16x8_t o[16]; - o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), - vrshrn_n_s32(out[0].val[1], 6)); - o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), - vrshrn_n_s32(out[1].val[1], 6)); - o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), - vrshrn_n_s32(out[2].val[1], 6)); - o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), - vrshrn_n_s32(out[3].val[1], 6)); - o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), - vrshrn_n_s32(out[4].val[1], 6)); - o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), - vrshrn_n_s32(out[5].val[1], 6)); - o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), - vrshrn_n_s32(out[6].val[1], 6)); - o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), - vrshrn_n_s32(out[7].val[1], 6)); - o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), - vrshrn_n_s32(out[8].val[1], 6)); - o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), - vrshrn_n_s32(out[9].val[1], 6)); - o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), - vrshrn_n_s32(out[10].val[1], 6)); - o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), - vrshrn_n_s32(out[11].val[1], 6)); - o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), - vrshrn_n_s32(out[12].val[1], 6)); - o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), - vrshrn_n_s32(out[13].val[1], 6)); - o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), - vrshrn_n_s32(out[14].val[1], 6)); - o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), - vrshrn_n_s32(out[15].val[1], 6)); - highbd_idct16x16_add8x1(o[0], max, &dest, stride); - highbd_idct16x16_add8x1(o[1], max, &dest, stride); - highbd_idct16x16_add8x1(o[2], max, &dest, stride); - highbd_idct16x16_add8x1(o[3], max, &dest, stride); - highbd_idct16x16_add8x1(o[4], max, &dest, stride); - highbd_idct16x16_add8x1(o[5], max, &dest, stride); - highbd_idct16x16_add8x1(o[6], max, &dest, stride); - highbd_idct16x16_add8x1(o[7], max, &dest, stride); - highbd_idct16x16_add8x1(o[8], max, &dest, stride); - highbd_idct16x16_add8x1(o[9], max, &dest, stride); - highbd_idct16x16_add8x1(o[10], max, &dest, stride); - highbd_idct16x16_add8x1(o[11], max, &dest, stride); - highbd_idct16x16_add8x1(o[12], max, &dest, stride); - highbd_idct16x16_add8x1(o[13], max, &dest, stride); - highbd_idct16x16_add8x1(o[14], max, &dest, stride); - highbd_idct16x16_add8x1(o[15], max, &dest, stride); + uint16_t *out, const int stride) { + highbd_idct16x16_add8x1_bd8(a[0], &out, stride); + highbd_idct16x16_add8x1_bd8(a[1], &out, stride); + highbd_idct16x16_add8x1_bd8(a[2], &out, stride); + highbd_idct16x16_add8x1_bd8(a[3], &out, stride); + highbd_idct16x16_add8x1_bd8(a[4], &out, stride); + highbd_idct16x16_add8x1_bd8(a[5], &out, stride); + highbd_idct16x16_add8x1_bd8(a[6], &out, stride); + highbd_idct16x16_add8x1_bd8(a[7], &out, stride); + highbd_idct16x16_add8x1_bd8(a[8], &out, stride); + highbd_idct16x16_add8x1_bd8(a[9], &out, stride); + highbd_idct16x16_add8x1_bd8(a[10], &out, stride); + highbd_idct16x16_add8x1_bd8(a[11], &out, stride); + highbd_idct16x16_add8x1_bd8(a[12], &out, stride); + highbd_idct16x16_add8x1_bd8(a[13], &out, stride); + highbd_idct16x16_add8x1_bd8(a[14], &out, stride); + highbd_idct16x16_add8x1_bd8(a[15], &out, stride); + highbd_idct16x16_add8x1_bd8(a[16], &out, stride); + highbd_idct16x16_add8x1_bd8(a[17], &out, stride); + highbd_idct16x16_add8x1_bd8(a[18], &out, stride); + highbd_idct16x16_add8x1_bd8(a[19], &out, stride); + highbd_idct16x16_add8x1_bd8(a[20], &out, stride); + highbd_idct16x16_add8x1_bd8(a[21], &out, stride); + highbd_idct16x16_add8x1_bd8(a[22], &out, stride); + 
highbd_idct16x16_add8x1_bd8(a[23], &out, stride); + highbd_idct16x16_add8x1_bd8(a[24], &out, stride); + highbd_idct16x16_add8x1_bd8(a[25], &out, stride); + highbd_idct16x16_add8x1_bd8(a[26], &out, stride); + highbd_idct16x16_add8x1_bd8(a[27], &out, stride); + highbd_idct16x16_add8x1_bd8(a[28], &out, stride); + highbd_idct16x16_add8x1_bd8(a[29], &out, stride); + highbd_idct16x16_add8x1_bd8(a[30], &out, stride); + highbd_idct16x16_add8x1_bd8(a[31], &out, stride); } void vpx_idct16x16_256_add_half1d(const void *const input, int16_t *output, @@ -937,4 +916,4 @@ void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, const int highbd_flag); -#endif // VPX_DSP_ARM_IDCT_NEON_H_ +#endif // VPX_VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c index fb1fa6b681d..38e275834b0 100644 --- a/libs/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/libs/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -667,8 +667,6 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } -// ----------------------------------------------------------------------------- - #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm index a042d40acb2..a81a9d10132 100644 --- a/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm +++ b/libs/libvpx/vpx_dsp/arm/loopfilter_8_neon.asm @@ -201,7 +201,7 @@ str lr, [sp, #16] ; thresh1 add sp, #4 pop {r0-r1, lr} - add r0, r1, lsl #3 ; s + 8 * pitch + add r0, r0, r1, lsl #3 ; s + 8 * pitch b vpx_lpf_vertical_8_neon ENDP ; |vpx_lpf_vertical_8_dual_neon| diff --git a/libs/libvpx/vpx_dsp/arm/mem_neon.h b/libs/libvpx/vpx_dsp/arm/mem_neon.h index 4efad5333e3..943865b3c28 100644 --- a/libs/libvpx/vpx_dsp/arm/mem_neon.h +++ b/libs/libvpx/vpx_dsp/arm/mem_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_ARM_MEM_NEON_H_ -#define VPX_DSP_ARM_MEM_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_MEM_NEON_H_ +#define VPX_VPX_DSP_ARM_MEM_NEON_H_ #include #include @@ -19,6 +19,21 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +static INLINE int16x4_t create_s16x4_neon(const int16_t c0, const int16_t c1, + const int16_t c2, const int16_t c3) { + return vcreate_s16((uint16_t)c0 | ((uint32_t)c1 << 16) | + ((int64_t)(uint16_t)c2 << 32) | ((int64_t)c3 << 48)); +} + +static INLINE int32x2_t create_s32x2_neon(const int32_t c0, const int32_t c1) { + return vcreate_s32((uint32_t)c0 | ((int64_t)(uint32_t)c1 << 32)); +} + +static INLINE int32x4_t create_s32x4_neon(const int32_t c0, const int32_t c1, + const int32_t c2, const int32_t c3) { + return vcombine_s32(create_s32x2_neon(c0, c1), create_s32x2_neon(c2, c3)); +} + // Helper functions used to load tran_low_t into int16, narrowing if necessary. 
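// (Editorial aside, illustrative only -- not part of the patch.) The
// create_*_neon builders added above pack their constants into a single
// 64-bit immediate, so the vector is materialized with one vcreate/vcombine
// instead of four separate lane inserts. A hypothetical test driver,
// assuming <arm_neon.h>, <stdio.h>, and the helpers above are in scope:

static void create_s16x4_neon_demo(void) {
  const int16x4_t v = create_s16x4_neon(1, -2, 3, -4);
  int16_t lanes[4];
  vst1_s16(lanes, v);
  // Prints "1 -2 3 -4": each constant lands in its own lane, including the
  // negative values, whose sign bits are confined by the casts above.
  printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}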
static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH @@ -86,9 +101,9 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1_lane_u32(&a, a_u32, 0); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); - a_u32 = vld1_lane_u32(&a, a_u32, 1); + a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } @@ -112,16 +127,16 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 0); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 1); + a_u32 = vsetq_lane_u32(a, a_u32, 1); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 2); + a_u32 = vsetq_lane_u32(a, a_u32, 2); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 3); + a_u32 = vsetq_lane_u32(a, a_u32, 3); return vreinterpretq_u8_u32(a_u32); } @@ -166,4 +181,4 @@ static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) { buf += stride; vst1_lane_u32((uint32_t *)buf, a_u32, 1); } -#endif // VPX_DSP_ARM_MEM_NEON_H_ +#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/quantize_neon.c b/libs/libvpx/vpx_dsp/arm/quantize_neon.c index a0a1e6dd5ad..adef5f6e151 100644 --- a/libs/libvpx/vpx_dsp/arm/quantize_neon.c +++ b/libs/libvpx/vpx_dsp/arm/quantize_neon.c @@ -15,17 +15,33 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff) { + const int32x4_t dqcoeff_0 = + vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + const int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + +#if CONFIG_VP9_HIGHBITDEPTH + vst1q_s32(dqcoeff, dqcoeff_0); + vst1q_s32(dqcoeff + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -38,8 +54,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -65,17 +81,15 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff = vandq_s16(qcoeff, zbin_mask); // Set non-zero elements to -1 and use that to extract values for eob. 
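// (Editorial aside, illustrative only -- not part of the patch.) Unpacking
// that comment: vtstq_s16 against neg_one (all bits set) produces 0xFFFF in
// every lane whose quantized coefficient is non-zero; ANDing with the
// iscan-plus-one vector keeps the 1-based scan position of each surviving
// coefficient, and the running vmaxq_u16 tracks the largest such position,
// which is exactly the eob. A scalar model of the same computation
// (hypothetical helper, assumes <stdint.h>):

static uint16_t eob_scalar_model(const int16_t *qcoeff, const int16_t *iscan,
                                 int n) {
  uint16_t eob = 0;
  int i;
  for (i = 0; i < n; ++i) {
    const uint16_t mask = (qcoeff[i] != 0) ? 0xFFFF : 0;   // vtstq_s16
    const uint16_t pos = mask & (uint16_t)(iscan[i] + 1);  // vandq_u16
    if (pos > eob) eob = pos;                              // vmaxq_u16
  }
  return eob;
}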
- eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - qcoeff = vmulq_s16(qcoeff, dequant); - - store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } @@ -90,8 +104,8 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, do { // Add one because the eob is not its index. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -118,23 +132,24 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // Set non-zero elements to -1 and use that to extract values for eob. eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - qcoeff = vmulq_s16(qcoeff, dequant); - - store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; n_coeffs -= 8; } while (n_coeffs > 0); } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else { const uint16x4_t eob_max_0 = vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); @@ -142,25 +157,50 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } +#endif // __aarch64__ } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); } +static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff, + const int16x8_t dequant, + tran_low_t *dqcoeff) { + int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + int32x4_t dqcoeff_1 = + vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + +#if CONFIG_VP9_HIGHBITDEPTH + dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1); + dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1); + vst1q_s32(dqcoeff, dqcoeff_0); + vst1q_s32(dqcoeff + 4, dqcoeff_1); +#else + vst1q_s16(dqcoeff, + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1))); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. 
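// (Editorial aside, illustrative only -- not part of the patch.) The
// "divided by 2" above is why calculate_dqcoeff_and_store_32x32() adds
// extract_sign_bit() before shifting: an arithmetic right shift rounds
// toward negative infinity, whereas the C reference code divides, which
// truncates toward zero; adding 1 to negative values first makes the two
// agree. A standalone check (assumes <assert.h>, <stdint.h>, and an
// arithmetic >> on signed values, as on the targets here):

static int32_t div2_toward_zero(int32_t x) {
  const int32_t sign = (int32_t)((uint32_t)x >> 31);  // 1 iff x < 0
  return (x + sign) >> 1;                             // equals x / 2 in C
}

static void div2_toward_zero_check(void) {
  assert(div2_toward_zero(-3) == -1);  // plain -3 >> 1 would give -2
  assert(div2_toward_zero(3) == 1);
  assert(div2_toward_zero(-4) == -2);
}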
-void vpx_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; - (void)scan_ptr; + (void)scan; (void)n_coeffs; // Because we will always calculate 32*32. (void)skip_block; assert(!skip_block); @@ -174,8 +214,8 @@ void vpx_quantize_b_32x32_neon( const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); const int16x8_t dequant = vld1q_s16(dequant_ptr); // Add one because the eob does not index from 0. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -188,8 +228,6 @@ void vpx_quantize_b_32x32_neon( // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - int16x8_t dqcoeff; - int32x4_t dqcoeff_0, dqcoeff_1; qcoeff = vaddq_s16(qcoeff, rounded); @@ -203,25 +241,15 @@ void vpx_quantize_b_32x32_neon( qcoeff = vandq_s16(qcoeff, zbin_mask); // Set non-zero elements to -1 and use that to extract values for eob. - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); - dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); - - // Add 1 if negative to round towards zero because the C uses division. - dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); - dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); - - dqcoeff = - vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); - - store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } @@ -234,8 +262,8 @@ void vpx_quantize_b_32x32_neon( for (i = 1; i < 32 * 32 / 8; ++i) { // Add one because the eob is not its index. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); @@ -248,8 +276,6 @@ void vpx_quantize_b_32x32_neon( // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); - int16x8_t dqcoeff; - int32x4_t dqcoeff_0, dqcoeff_1; qcoeff = vaddq_s16(qcoeff, rounded); @@ -264,28 +290,22 @@ void vpx_quantize_b_32x32_neon( // Set non-zero elements to -1 and use that to extract values for eob. 
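// (Editorial aside, illustrative only -- not part of the patch.) Once every
// block has been folded into eob_max, the patch finishes with a horizontal
// maximum: the #ifdef __aarch64__ hunks in this file use a single
// vmaxvq_u16, while AArch32 keeps the spelled-out pairwise reduction. An
// equivalent standalone helper for the fallback (assumes <arm_neon.h>):

static uint16_t horizontal_max_u16x8(const uint16x8_t v) {
  // 8 -> 4 lanes: elementwise max of the low and high halves.
  uint16x4_t m = vmax_u16(vget_low_u16(v), vget_high_u16(v));
  // 4 -> 2 -> 1 lanes: pairwise maxima; lane 0 ends up holding the result.
  m = vpmax_u16(m, m);
  m = vpmax_u16(m, m);
  return vget_lane_u16(m, 0);
}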
eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); coeff_ptr += 8; - iscan_ptr += 8; + iscan += 8; store_s16q_to_tran_low(qcoeff_ptr, qcoeff); qcoeff_ptr += 8; - dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); - dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); - - dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); - dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); - - dqcoeff = - vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); - - store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr); dqcoeff_ptr += 8; } } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else { const uint16x4_t eob_max_0 = vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); @@ -293,4 +313,5 @@ void vpx_quantize_b_32x32_neon( const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } +#endif // __aarch64__ } diff --git a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c index b04de3aff26..06443c69956 100644 --- a/libs/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libs/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -10,233 +10,371 @@ #include +#include #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { +static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, + const void *const buf1) { + uint32_t a; + uint32x2_t aa = vdup_n_u32(0); + memcpy(&a, buf0, 4); + aa = vset_lane_u32(a, aa, 0); + memcpy(&a, buf1, 4); + aa = vset_lane_u32(a, aa, 1); + return vreinterpret_u8_u32(aa); +} + +static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, + const uint8_t *const ref_array[4], + const int ref_stride, const int height, + uint32_t *const res) { int i; - const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + uint16x4_t a[2]; + uint32x4_t r; + + assert(!((intptr_t)src_ptr % sizeof(uint32_t))); + assert(!(src_stride % sizeof(uint32_t))); + + for (i = 0; i < height; ++i) { + const uint8x8_t s = vreinterpret_u8_u32( + vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); + const uint8x8_t ref01 = load_unaligned_2_buffers( + ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); + const uint8x8_t ref23 = load_unaligned_2_buffers( + ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); + abs[0] = vabal_u8(abs[0], s, ref01); + abs[1] = vabal_u8(abs[1], s, ref23); } + + a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); + a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); + r = vpaddlq_u16(vcombine_u16(a[0], a[1])); + vst1q_u32(res, r); } -void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - 
int i; - const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); - const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); - const uint8x16_t ref_1 = - load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); - abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); - abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); - abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res); +} + +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res); +} + +//////////////////////////////////////////////////////////////////////////////// + +// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) +static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); + vst1q_u32(res, r); } -static INLINE void sad8x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +// Can handle 1024 pixels' sad sum (such as 32x32) +static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); + const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); + const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) +static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); + const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); + const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); + const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); + const uint32x2_t c0 = vpadd_u32(b0, b1); + const uint32x2_t c1 = vpadd_u32(b2, b3); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 4096 pixels' sad sum (such as 64x64) +static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t 
a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + vst1q_u32(res, vcombine_u32(d0, d1)); +} + +static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - a += a_stride; + const uint8x8_t s = vld1_u8(src_ptr); + src_ptr += src_stride; for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], a_u8, b_u8); + const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); + ref_loop[j] += ref_stride; + sum[j] = vabal_u8(sum[j], s, b_u8); } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } -void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 4); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4); } -void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 8); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); } -void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad8x_4d(src, src_stride, ref, ref_stride, res, 16); + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); } -static INLINE void sad16x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, + uint16x8_t *const sum) { + const uint8x16_t r = vld1q_u8(ref_ptr); + *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r)); + *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); +} + +static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int 
ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - a += a_stride; + const uint8x16_t s = vld1q_u8(src_ptr); + src_ptr += src_stride; for (j = 0; j < 4; ++j) { - const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + sad16_neon(ref_loop[j], s, &sum[j]); + ref_loop[j] += ref_stride; } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } -void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 8); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); } -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 16); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); } -void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad16x_4d(src, src_stride, ref, ref_stride, res, 32); + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); } -static INLINE void sad32x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { - int i, j; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + const int height, uint16x8_t *const sum) { + int i; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + + sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - a += a_stride; - for (j = 0; j < 4; ++j) { - const uint8x16_t b_0 = vld1q_u8(b_loop[j]); - const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); - } - } + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - for (j = 0; j < 4; ++j) { - result[j] = 
vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } } -void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 16); + uint16x8_t sum[4]; + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); + sad_512_pel_final_neon(sum, res); } -void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 32); + uint16x8_t sum[4]; + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); + sad_1024_pel_final_neon(sum, res); } -void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 64); + uint16x8_t sum[4]; + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); + sad_2048_pel_final_neon(sum, res); } -static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, - const uint8x16_t b_0, const uint8x16_t b_1, - uint16x8_t *sum) { - *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); - *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); - *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); - *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); -} +//////////////////////////////////////////////////////////////////////////////// -static INLINE void sad64x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t *res) { int i; - uint16x8_t sum_0 = vdupq_n_u16(0); - uint16x8_t sum_1 = vdupq_n_u16(0); - uint16x8_t sum_2 = vdupq_n_u16(0); - uint16x8_t sum_3 = vdupq_n_u16(0); - uint16x8_t sum_4 = vdupq_n_u16(0); - uint16x8_t sum_5 = vdupq_n_u16(0); - uint16x8_t sum_6 = vdupq_n_u16(0); - uint16x8_t sum_7 = vdupq_n_u16(0); - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - a += a_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); - sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), - &sum_1); - b_loop[0] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); - sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), 
- &sum_3); - b_loop[1] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); - sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), - &sum_5); - b_loop[2] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); - sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), - &sum_7); - b_loop[3] += b_stride; + for (i = 0; i < 32; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } - result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); - result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); - result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); - result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); + sad_2048_pel_final_neon(sum, res); } -void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 32); -} + int i; + const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], + ref_array[3] }; + uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 64); + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src_ptr + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src_ptr + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src_ptr + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + 
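// (Editorial aside, illustrative only -- not part of the patch.) Note that
// 64x64 spreads each reference's sum across two uint16x8 accumulators
// (sum[0]/sum[1], sum[2]/sum[3], ...): each accumulator covers 32 of the 64
// columns, so every 16-bit lane absorbs 4 absolute differences per row and
// 256 over 64 rows, at most 256 * 255 = 65280, which still fits in 16 bits.
// A single accumulator per reference would see 512 terms and could
// overflow, which is also why sad_4096_pel_final_neon() widens to 32 bits
// right away. Bound check (hypothetical, assumes <assert.h>):

static void sad64x64_accumulator_bound_check(void) {
  const unsigned terms_per_lane = (64 / 2 / 8) * 64;  // 4 per row * 64 rows
  assert(terms_per_lane * 255u <= 65535u);      // 65280: fits in a lane
  assert(2 * terms_per_lane * 255u > 65535u);   // one accumulator would not
}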
+ src_ptr += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_4096_pel_final_neon(sum, res); } diff --git a/libs/libvpx/vpx_dsp/arm/sad_neon.c b/libs/libvpx/vpx_dsp/arm/sad_neon.c index 9518a166bbf..c4a49e366d5 100644 --- a/libs/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libs/libvpx/vpx_dsp/arm/sad_neon.c @@ -11,6 +11,7 @@ #include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" @@ -73,128 +74,132 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, return vget_lane_u32(horizontal_add_uint16x8(abs), 0); } -static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, const int height) { +static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - const uint8x8_t b_u8 = vld1_u8(b); - a += a_stride; - b += b_stride; + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, a_u8, b_u8); } return abs; } -static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - const uint8x8_t b_u8 = vld1_u8(b); - const uint8x8_t c_u8 = vld1_u8(c); + const uint8x8_t a_u8 = vld1_u8(src_ptr); + const uint8x8_t b_u8 = vld1_u8(ref_ptr); + const uint8x8_t c_u8 = vld1_u8(second_pred); const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - a += a_stride; - b += b_stride; - c += 8; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 8; abs = vabal_u8(abs, a_u8, avg); } return abs; } -#define sad8xN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad8xN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad8xN(4); sad8xN(8); sad8xN(16); -static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int 
ref_stride, const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - const uint8x16_t b_u8 = vld1q_u8(b); - a += a_stride; - b += b_stride; + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); } return abs; } -static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - const uint8x16_t b_u8 = vld1q_u8(b); - const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t a_u8 = vld1q_u8(src_ptr); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr); + const uint8x16_t c_u8 = vld1q_u8(second_pred); const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - a += a_stride; - b += b_stride; - c += 16; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); } return abs; } -#define sad16xN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = \ + sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad16xN(8); sad16xN(16); sad16xN(32); -static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(a); - const uint8x16_t a_hi = vld1q_u8(a + 16); - const uint8x16_t b_lo = vld1q_u8(b); - const uint8x16_t b_hi = vld1q_u8(b + 16); - a += a_stride; - b += b_stride; + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + src_ptr += src_stride; + ref_ptr += ref_stride; abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); abs = vabal_u8(abs, 
vget_low_u8(a_hi), vget_low_u8(b_hi)); @@ -203,24 +208,25 @@ static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, return abs; } -static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(a); - const uint8x16_t a_hi = vld1q_u8(a + 16); - const uint8x16_t b_lo = vld1q_u8(b); - const uint8x16_t b_hi = vld1q_u8(b + 16); - const uint8x16_t c_lo = vld1q_u8(c); - const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t a_lo = vld1q_u8(src_ptr); + const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); + const uint8x16_t b_lo = vld1q_u8(ref_ptr); + const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); + const uint8x16_t c_lo = vld1q_u8(second_pred); + const uint8x16_t c_hi = vld1q_u8(second_pred + 16); const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - a += a_stride; - b += b_stride; - c += 32; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 32; abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); @@ -229,43 +235,44 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, return abs; } -#define sad32xN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint16x8_t abs = \ + sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } sad32xN(16); sad32xN(32); sad32xN(64); -static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const int height) { int i; uint16x8_t abs_0 = vdupq_n_u16(0); uint16x8_t abs_1 = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - const uint8x16_t b_0 = vld1q_u8(b); - const uint8x16_t b_1 = vld1q_u8(b + 16); - const uint8x16_t b_2 = vld1q_u8(b + 32); - const uint8x16_t b_3 = vld1q_u8(b + 48); - a += a_stride; - b += b_stride; + const uint8x16_t a_0 = vld1q_u8(src_ptr); + 
const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + src_ptr += src_stride; + ref_ptr += ref_stride; abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); @@ -282,33 +289,34 @@ static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, } } -static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - const uint8_t *c, const int height) { +static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred, + const int height) { int i; uint16x8_t abs_0 = vdupq_n_u16(0); uint16x8_t abs_1 = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - const uint8x16_t b_0 = vld1q_u8(b); - const uint8x16_t b_1 = vld1q_u8(b + 16); - const uint8x16_t b_2 = vld1q_u8(b + 32); - const uint8x16_t b_3 = vld1q_u8(b + 48); - const uint8x16_t c_0 = vld1q_u8(c); - const uint8x16_t c_1 = vld1q_u8(c + 16); - const uint8x16_t c_2 = vld1q_u8(c + 32); - const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t a_0 = vld1q_u8(src_ptr); + const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); + const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); + const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); + const uint8x16_t b_0 = vld1q_u8(ref_ptr); + const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); + const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); + const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + const uint8x16_t c_0 = vld1q_u8(second_pred); + const uint8x16_t c_1 = vld1q_u8(second_pred + 16); + const uint8x16_t c_2 = vld1q_u8(second_pred + 32); + const uint8x16_t c_3 = vld1q_u8(second_pred + 48); const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - a += a_stride; - b += b_stride; - c += 64; + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 64; abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); @@ -325,19 +333,20 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, } } -#define sad64xN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ - return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + const uint32x4_t abs = \ + sad64x(src_ptr, src_stride, ref_ptr, 
ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ } sad64xN(32); diff --git a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c index 4f58a7832a5..37bfd1cd1fd 100644 --- a/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -97,30 +97,30 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_varianceNxM(n, m) \ - uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ +#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else { \ + var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[y_offset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \ } sub_pixel_varianceNxM(4, 4); @@ -139,34 +139,34 @@ sub_pixel_varianceNxM(64, 64); // 4xM filter writes an extra row to fdata because it processes two rows at a // time. -#define sub_pixel_avg_varianceNxM(n, m) \ - uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint8_t temp0[n * (m + (n == 4 ? 
2 : 1))]; \ - uint8_t temp1[n * m]; \ - \ - if (n == 4) { \ - var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else if (n == 8) { \ - var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - \ - vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ - \ - return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \ +#define sub_pixel_avg_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[y_offset]); \ + } else { \ + var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \ + bilinear_filters[x_offset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[y_offset]); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \ } sub_pixel_avg_varianceNxM(4, 4); diff --git a/libs/libvpx/vpx_dsp/arm/subtract_neon.c b/libs/libvpx/vpx_dsp/arm/subtract_neon.c index ce81fb630f2..612897e247c 100644 --- a/libs/libvpx/vpx_dsp/arm/subtract_neon.c +++ b/libs/libvpx/vpx_dsp/arm/subtract_neon.c @@ -9,71 +9,73 @@ */ #include +#include #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; + int r = rows, c; if (cols > 16) { - for (r = 0; r < rows; ++r) { + do { for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + const uint8x16_t s0 = vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); 
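
An aside on the subtract rewrite in progress here: the loops only change shape. Counted for-loops become do-while loops, and the 4-wide tail further down switches from a scalar per-pixel loop to load_unaligned_u8 over two rows per iteration. A scalar sketch of what every path computes, essentially the per-pixel loop the old 4-wide case used; this is a hedged reference, not the library's C fallback:

    static void subtract_block_scalar(int rows, int cols, int16_t *diff,
                                      ptrdiff_t diff_stride, const uint8_t *src,
                                      ptrdiff_t src_stride, const uint8_t *pred,
                                      ptrdiff_t pred_stride) {
      int r, c;
      for (r = 0; r < rows; ++r) {
        /* Differences fit in [-255, 255], so the NEON widening subtract
           (vsubl_u8) followed by a reinterpret to s16 is exact. */
        for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
        diff += diff_stride;
        src += src_stride;
        pred += pred_stride;
      }
    }
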
+ const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, (int)src_stride); + const uint8x8_t p = load_unaligned_u8(pred, (int)pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } diff --git a/libs/libvpx/vpx_dsp/arm/sum_neon.h b/libs/libvpx/vpx_dsp/arm/sum_neon.h index d74fe0cde42..9e6833aad3d 100644 --- a/libs/libvpx/vpx_dsp/arm/sum_neon.h +++ b/libs/libvpx/vpx_dsp/arm/sum_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
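
The sum_neon.h hunk below is the header-guard rename plus the removal of horizontal_add_long_uint16x8, presumably dead after the SAD rewrite earlier in this patch. The surviving helpers fold a vector to a scalar with widening pairwise adds (vpaddlq), so no intermediate lane can overflow while reducing 8 -> 4 -> 2 -> 1 lanes. A scalar sketch of horizontal_add_uint16x8 (hypothetical name, for illustration only):

    /* Sum eight u16 lanes into one u32. The NEON helper leaves the total
       in lane 0 of a uint32x2_t, which callers extract via vget_lane_u32. */
    static uint32_t horizontal_add_u16x8_scalar(const uint16_t lanes[8]) {
      uint32_t sum = 0;
      int i;
      for (i = 0; i < 8; ++i) sum += lanes[i];
      return sum;
    }
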
*/ -#ifndef VPX_DSP_ARM_SUM_NEON_H_ -#define VPX_DSP_ARM_SUM_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_SUM_NEON_H_ +#define VPX_VPX_DSP_ARM_SUM_NEON_H_ #include @@ -30,18 +30,9 @@ static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { vreinterpret_u32_u64(vget_high_u64(c))); } -static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a, - const uint16x8_t b) { - const uint32x4_t c = vpaddlq_u16(a); - const uint32x4_t d = vpadalq_u16(c, b); - const uint64x2_t e = vpaddlq_u32(d); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)), - vreinterpret_u32_u64(vget_high_u64(e))); -} - static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { const uint64x2_t b = vpaddlq_u32(a); return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), vreinterpret_u32_u64(vget_high_u64(b))); } -#endif // VPX_DSP_ARM_SUM_NEON_H_ +#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/sum_squares_neon.c b/libs/libvpx/vpx_dsp/arm/sum_squares_neon.c new file mode 100644 index 00000000000..cfefad99388 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/sum_squares_neon.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include +#include "./vpx_dsp_rtcd.h" + +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { + uint64x1_t s2; + + if (size == 4) { + int16x4_t s[4]; + int32x4_t s0; + uint32x2_t s1; + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + s0 = vmull_s16(s[0], s[0]); + s0 = vmlal_s16(s0, s[1], s[1]); + s0 = vmlal_s16(s0, s[2], s[2]); + s0 = vmlal_s16(s0, s[3], s[3]); + s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), + vget_high_u32(vreinterpretq_u32_s32(s0))); + s2 = vpaddl_u32(s1); + } else { + int r = size; + uint64x2_t s1 = vdupq_n_u64(0); + + do { + int c = size; + int32x4_t s0 = vdupq_n_s32(0); + const int16_t *src_t = src; + + do { + int16x8_t s[8]; + + s[0] = vld1q_s16(src_t + 0 * stride); + s[1] = vld1q_s16(src_t + 1 * stride); + s[2] = vld1q_s16(src_t + 2 * stride); + s[3] = vld1q_s16(src_t + 3 * stride); + s[4] = vld1q_s16(src_t + 4 * stride); + s[5] = vld1q_s16(src_t + 5 * stride); + s[6] = vld1q_s16(src_t + 6 * stride); + s[7] = vld1q_s16(src_t + 7 * stride); + s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); + s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); + s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); + s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); + s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); + s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); + s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); + s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); + s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); + s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); + s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); + s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3])); + s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); + s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); + s0 = 
vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); + s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); + src_t += 8; + c -= 8; + } while (c); + + s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); + s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + src += 8 * stride; + r -= 8; + } while (r); + + s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + } + + return vget_lane_u64(s2, 0); +} diff --git a/libs/libvpx/vpx_dsp/arm/transpose_neon.h b/libs/libvpx/vpx_dsp/arm/transpose_neon.h index d85cbcee464..43340e48d9a 100644 --- a/libs/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/libs/libvpx/vpx_dsp/arm/transpose_neon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_ -#define VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ #include @@ -1313,4 +1313,4 @@ static INLINE void load_and_transpose_s32_8x8( transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7); } -#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/variance_neon.c b/libs/libvpx/vpx_dsp/arm/variance_neon.c index 61c2c16a724..77b1015b742 100644 --- a/libs/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libs/libvpx/vpx_dsp/arm/variance_neon.c @@ -27,8 +27,9 @@ // this limit. // Process a block of width 4 four rows at a time. -static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int h, uint32_t *sse, int *sum) { +static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { int i; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -38,8 +39,8 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, assert(h <= 256); for (i = 0; i < h; i += 4) { - const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride); - const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride); + const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride); const uint16x8_t diff_lo_u16 = vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); const uint16x8_t diff_hi_u16 = @@ -61,8 +62,8 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), vget_high_s16(diff_hi_s16)); - a += 4 * a_stride; - b += 4 * b_stride; + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; } *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); @@ -72,9 +73,9 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, } // Process a block of any size where the width is divisible by 16. 
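
For the variance kernels that follow, the underlying identity is that the block variance equals the sum of squared differences minus the squared sum of differences divided by the pixel count: the NEON code accumulates sse and sum in a single pass, and the wrappers combine them as *sse - sum * sum / (w * h), with the division done as a shift since w * h is a power of two. A scalar sketch (hypothetical reference, not the library's vpx_variance C code):

    static uint32_t variance_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, uint32_t *sse) {
      int64_t sum = 0;
      uint64_t sq = 0;
      int i, j;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
          const int d = src[j] - ref[j];
          sum += d;
          sq += (uint64_t)(d * d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = (uint32_t)sq;
      /* The macros below use >> shift with shift = log2(w * h), and widen
         sum * sum to 64 bits once w * h >= 16 * 16 so the product cannot
         overflow a 32-bit int. */
      return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h));
    }
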
-static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, - int *sum) { +static void variance_neon_w16(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { int i, j; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -86,8 +87,8 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 16) { - const uint8x16_t a_u8 = vld1q_u8(a + j); - const uint8x16_t b_u8 = vld1q_u8(b + j); + const uint8x16_t a_u8 = vld1q_u8(src_ptr + j); + const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j); const uint16x8_t diff_lo_u16 = vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8)); @@ -110,8 +111,8 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16), vget_high_s16(diff_hi_s16)); } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); @@ -121,8 +122,9 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, } // Process a block of width 8 two rows at a time. -static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int h, uint32_t *sse, int *sum) { +static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int h, + uint32_t *sse, int *sum) { int i = 0; int16x8_t sum_s16 = vdupq_n_s16(0); int32x4_t sse_lo_s32 = vdupq_n_s32(0); @@ -132,10 +134,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, assert(h <= 128); do { - const uint8x8_t a_0_u8 = vld1_u8(a); - const uint8x8_t a_1_u8 = vld1_u8(a + a_stride); - const uint8x8_t b_0_u8 = vld1_u8(b); - const uint8x8_t b_1_u8 = vld1_u8(b + b_stride); + const uint8x8_t a_0_u8 = vld1_u8(src_ptr); + const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride); + const uint8x8_t b_0_u8 = vld1_u8(ref_ptr); + const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride); const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8); const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8); const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16); @@ -150,8 +152,8 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, vget_high_s16(diff_0_s16)); sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16), vget_high_s16(diff_1_s16)); - a += a_stride + a_stride; - b += b_stride + b_stride; + src_ptr += src_stride + src_stride; + ref_ptr += ref_stride + ref_stride; i += 2; } while (i < h); @@ -161,31 +163,36 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, 0); } -void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w8x2(a, a_stride, b, b_stride, 8, sse, sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum); } -void vpx_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, unsigned int *sse, int *sum) { - variance_neon_w16(a, a_stride, b, b_stride, 16, 16, sse, sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int 
*sum) { + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum); } -#define varianceNxM(n, m, shift) \ - unsigned int vpx_variance##n##x##m##_neon(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - unsigned int *sse) { \ - int sum; \ - if (n == 4) \ - variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \ - else if (n == 8) \ - variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \ - else \ - variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \ - if (n * m < 16 * 16) \ - return *sse - ((sum * sum) >> shift); \ - else \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ +#define varianceNxM(n, m, shift) \ + unsigned int vpx_variance##n##x##m##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, unsigned int *sse) { \ + int sum; \ + if (n == 4) \ + variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ + &sum); \ + else if (n == 8) \ + variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \ + &sum); \ + else \ + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \ + &sum); \ + if (n * m < 16 * 16) \ + return *sse - ((sum * sum) >> shift); \ + else \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ } varianceNxM(4, 4, 4); @@ -199,58 +206,66 @@ varianceNxM(16, 32, 9); varianceNxM(32, 16, 9); varianceNxM(32, 32, 10); -unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1); - variance_neon_w16(a + (32 * a_stride), a_stride, b + (32 * b_stride), - b_stride, 32, 32, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1, + &sum1); + variance_neon_w16(src_ptr + (32 * src_stride), src_stride, + ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } -unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, + &sum1); + variance_neon_w16(src_ptr + (16 * src_stride), src_stride, + ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11); } -unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum1, sum2; uint32_t sse1, sse2; - variance_neon_w16(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1); - variance_neon_w16(a + (16 * a_stride), a_stride, b + (16 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1, + &sum1); + variance_neon_w16(src_ptr + (16 * src_stride), 
src_stride, + ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w16(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride, + ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); sse1 += sse2; sum1 += sum2; - variance_neon_w16(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride), - b_stride, 64, 16, &sse2, &sum2); + variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride, + ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2, + &sum2); *sse = sse1 + sse2; sum1 += sum2; return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12); } -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, +unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; @@ -267,13 +282,13 @@ unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, for (i = 0; i < 8; i++) { // mse16x16_neon_loop q0u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; q1u8 = vld1q_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; q2u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q3u8 = vld1q_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8)); q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8)); @@ -312,10 +327,9 @@ unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int source_stride, return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); } -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, - int source_stride, +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, - int recon_stride) { + int ref_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; @@ -324,21 +338,21 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int64x2_t q1s64; d0u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d4u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d1u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d5u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d2u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d6u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; d3u8 = vld1_u8(src_ptr); - src_ptr += source_stride; + src_ptr += src_stride; d7u8 = vld1_u8(ref_ptr); - ref_ptr += recon_stride; + ref_ptr += ref_stride; q11u16 = vsubl_u8(d0u8, d4u8); q12u16 = vsubl_u8(d1u8, d5u8); diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm new file mode 100644 index 00000000000..d8e4bcc3a7f --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm @@ -0,0 +1,438 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers***************************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + 
vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + 
vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! 
;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm new file mode 100644 index 00000000000..7a77747fec6 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm @@ -0,0 +1,439 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
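
The filter_type1 kernel above and the filter_type2 kernel below are near-twins. Both take coefficient magnitudes via vabs.s8 and bake the signs into the choice of vmlal.u8 (accumulate) versus vmlsl.u8 (subtract), since the u8 widening multiplies are unsigned; judging by the mlal/mlsl patterns, the two variants simply cover the two sign layouts the 8-tap filters come in. The other recurring trick is the 0xc000/0x4000 pair: each s16 accumulator lane starts at -0x4000, vhadd.s16 against +0x4000 cancels the bias while halving, and vqrshrun.s16 #6 finishes the narrow, which works out to clip((sum + 64) >> 7), the usual FILTER_BITS = 7 rounding, with the bias buying headroom for the 16-bit intermediates. A scalar model of one averaged output pixel (my reading of the asm, accurate up to a one-LSB detail in the truncating halve):

    static uint8_t convolve8_avg_px(const uint8_t *src /* at output x */,
                                    const int16_t *filter /* 8 taps */,
                                    uint8_t old_dst) {
      int k, sum = 0;
      for (k = 0; k < 8; ++k) sum += src[k - 3] * filter[k]; /* pu1_src - 3 */
      sum = (sum + 64) >> 7;  /* bias, vhadd, vqrshrun.s16 #6 */
      if (sum < 0) sum = 0;   /* unsigned saturating narrow */
      if (sum > 255) sum = 255;
      return (uint8_t)((sum + old_dst + 1) >> 1); /* vrhadd.u8 with old dst */
    }

Widths of 24 and 12 are handled as a 16- or 8-wide main pass plus the outer_loop8_residual / outer_loop4_residual tails for the leftover columns.
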
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_avg_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = 
vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u8 {d6}, [r1] + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u8 {d7}, [r6] + vrhadd.u8 d20, d20, d6 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vrhadd.u8 d8, d8, d7 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + add r7, r1, #8 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vld1.u8 {d0}, [r1] + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u8 {d2}, [r7] + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vrhadd.u8 d8, d8, 
d0 + vrhadd.u8 d9, d9, d2 + vmlsl.u8 q11, d1, d24 + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + vmlal.u8 q11, d13, d28 + vst1.8 {q4}, [r1]! ;store the result pu1_dst + subeq r14, r14, #2 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + addeq r1, r1, r8 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vdup.16 q10, r7 + vld1.u32 {q3}, [r12], r11 + add r7, r6, #8 + moveq r5, r10 + vld1.u8 {d0}, [r6] + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u8 {d2}, [r7] + vqrshrun.s16 d11, q11, #6 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q6}, [r12], r11 + vrhadd.u8 d10, d10, d0 + vld1.u32 {q7}, [r12], r11 + vrhadd.u8 d11, d11, d2 + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + mov r7, #0xc000 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + add r7, r6, #8 + vld1.u8 {d20}, [r6] + vld1.u8 {d21}, [r7] + vrhadd.u8 d10, d10, d20 + vrhadd.u8 d11, d11, d21 + vst1.8 {q5}, [r6]! 
;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vld1.u32 {d10[0]}, [r1] + vld1.u32 {d10[1]}, [r6] + vrhadd.u8 d8, d8, d10 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! ;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm deleted file mode 100644 index 1c2ee506306..00000000000 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
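
The deleted vpx_convolve8_avg_neon_asm.asm below covered the same avg convolutions with a different numeric strategy: it kept the signed 16-bit filter in q0 and widened everything to 32 bits before a single rounded narrowing shift, instead of the biased 16-bit arithmetic of its replacements. Its MULTIPLY_BY_Q0 macro amounts to, per 32-bit output lane (a sketch of the macro body, not library C code):

    /* Eight signed multiply-accumulates against the filter held in q0;
       vqrshrun.s32 #7 then applies (acc + 64) >> 7 with an unsigned
       saturating narrow, before vrhadd.u8 averages with the old dst. */
    static int32_t multiply_by_q0_lane(const int16_t s[8], const int16_t f[8]) {
      int32_t acc = 0;
      int k;
      for (k = 0; k < 8; ++k) acc += (int32_t)s[k] * f[k];
      return acc;
    }
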
-; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; VP9_FILTER_WEIGHT == 128 - ; VP9_FILTER_SHIFT == 7 - - EXPORT |vpx_convolve8_avg_horiz_neon| - EXPORT |vpx_convolve8_avg_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter -; sp[]int x0_q4 -; sp[]int x_step_q4 ; unused -; sp[]int y0_q4 -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|vpx_convolve8_avg_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldrd r4, r5, [sp, #32] ; filter, x0_q4 - add r4, r5, lsl #4 - ldrd r6, r7, [sp, #52] ; w, h - - vld1.s16 {q0}, [r4] ; filter - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -vpx_convolve8_avg_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -vpx_convolve8_avg_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; slightly out of order load to match the existing data - vld1.u32 {d6[0]}, [r2], r3 - vld1.u32 {d7[0]}, [r2], r3 - vld1.u32 {d6[1]}, [r2], r3 - vld1.u32 {d7[1]}, [r2], r3 - - sub r2, r2, r3, lsl #2 ; reset for store - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_avg_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt vpx_convolve8_avg_loop_horiz_v - - pop {r4-r10, pc} - - 
ENDP - -|vpx_convolve8_avg_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #24] ; filter - ldr r5, [sp, #36] ; y0_q4 - add r4, r5, lsl #4 - ldr r6, [sp, #44] ; w - ldr lr, [sp, #48] ; h - - vld1.s16 {q0}, [r4] ; filter - - lsl r1, r1, #1 - lsl r3, r3, #1 - -vpx_convolve8_avg_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -vpx_convolve8_avg_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - vld1.u32 {d6[0]}, [r5@32], r3 - vld1.u32 {d6[1]}, [r8@32], r3 - vld1.u32 {d7[0]}, [r5@32], r3 - vld1.u32 {d7[1]}, [r8@32], r3 - - pld [r7] - pld [r4] - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r5] - pld [r8] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; average the new value and the dst value - vrhadd.u8 q1, q1, q3 - - sub r5, r5, r3, lsl #1 ; reset for store - sub r8, r8, r3, lsl #1 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt vpx_convolve8_avg_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_avg_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm new file mode 100644 index 00000000000..d310a83dad8 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm @@ -0,0 +1,486 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
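The file added here implements an 8-tap vertical convolution whose rounded output is averaged with the pixels already in dst (the vrhadd.u8 steps below). As a reading aid, this is a minimal scalar sketch of that behavior, inferred from the register map and instruction sequence; the helper names are illustrative and do not appear in this diff:

    #include <stdint.h>

    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* 8-tap vertical FIR, round-to-nearest >> 7 (filter weight 128),
     * then average with the existing dst pixel (the vrhadd.u8 step). */
    static void convolve8_avg_vert_ref(const uint8_t *src, int src_stride,
                                       uint8_t *dst, int dst_stride,
                                       const int16_t *filter, int w, int h) {
      src -= 3 * src_stride;  /* back up 3 rows, as the asm does up front */
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int sum = 0;
          for (int k = 0; k < 8; ++k)
            sum += src[(y + k) * src_stride + x] * filter[k];
          const uint8_t t = clip_u8((sum + 64) >> 7);
          dst[y * dst_stride + x] =
              (uint8_t)((dst[y * dst_stride + x] + t + 1) >> 1);
        }
      }
    }

The NEON code reaches the same result through a very different schedule: 8-wide (or 4-wide) column batches, software-pipelined loads, and a biased 16-bit accumulation (see the note before the horizontal kernels below).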
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = 
vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + add r14, r1, r6 + vmlsl.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, 
d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm new file mode 100644 index 00000000000..c5695fbda84 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm @@ -0,0 +1,487 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
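This _filter_type2 variant differs from the _filter_type1 file above chiefly in which taps are accumulated with vmlal.u8 (add) versus vmlsl.u8 (subtract): both kernels take |coeff| up front (vabs.s8 d0, d0) and hard-code the per-tap signs, so each is valid only for filters whose sign pattern matches. Reading the two prologs gives the patterns below; the scalar model is illustrative, and the actual type1/type2 dispatch lives in C wrappers that this excerpt does not show:

    #include <stdint.h>
    #include <stdlib.h>

    /* Per-tap signs hard-coded by the two kernel families, read off the
     * vmlal/vmlsl sequences in their prologs (taps 0..7). */
    static const int kType1Sign[8] = { -1, -1, +1, +1, +1, +1, -1, -1 };
    static const int kType2Sign[8] = { -1, +1, -1, +1, +1, -1, +1, -1 };

    /* One filtered (pre-rounding) sample for a fixed sign pattern. */
    static int convolve8_fixed_signs(const uint8_t px[8], const int16_t *filter,
                                     const int sign[8]) {
      int sum = 0;
      for (int k = 0; k < 8; ++k)
        sum += sign[k] * abs(filter[k]) * px[k];
      return sum;  /* rounded afterwards: (sum + 64) >> 7 */
    }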
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_avg_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_avg_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + pld [r3, r2, lsl #1] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r3, r3, r2 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + add r14, r1, r6 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d6, d25 + vrhadd.u8 d10, d10, d20 + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d20}, [r14] + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + addle r0, r0, r8 + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vrhadd.u8 d12, d12, d20 + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + addle r1, r1, r9 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlal.u8 q6, d16, d28 + add r10, r10, r2 ;12*strd + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vld1.u8 {d20}, [r14] + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vrhadd.u8 d12, d12, d20 + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vst1.8 {d12}, [r14], r6 + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vld1.u8 {d20}, [r14] + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vrhadd.u8 d14, d14, d20 + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = 
vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vst1.8 {d14}, [r14], r6 + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vld1.u8 {d20}, [r1] + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vrhadd.u8 d8, d8, d20 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + add r14, r1, r6 + vmlal.u8 q6, d16, d28 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vmlsl.u8 q6, d17, d29 + vld1.u8 {d20}, [r14] + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vrhadd.u8 d10, d10, d20 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vld1.u8 {d20}, [r14] + vrhadd.u8 d12, d12, d20 + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d20}, [r14] + vrhadd.u8 d14, d14, d20 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, 
d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vld1.u32 {d20[0]}, [r1] + vld1.u32 {d20[1]}, [r3] + vrhadd.u8 d0, d0, d20 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + mov r4, r3 + vld1.u32 {d20[0]}, [r4], r6 + vld1.u32 {d20[1]}, [r4] + vrhadd.u8 d8, d8, d20 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm new file mode 100644 index 00000000000..fa1b7324660 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
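A fixed-point idiom recurs in every kernel of this patch and is worth decoding once: the accumulator is seeded with 0xc000 (-16384 as s16), vhadd.s16 later adds 0x4000 (+16384) and halves, and vqrshrun.s16 #6 narrows. The bias keeps the widening u8 multiply-accumulates inside signed 16-bit range, the halving add cancels it, and the final rounded shift restores the reference (sum + 64) >> 7 (exactly, up to the truncation of the halving on odd sums). A scalar sketch of that identity, assuming this reading of the instruction sequence:

    #include <stdint.h>

    /* Models: vdup.16 q, #0xc000 seed -> MACs -> vhadd.s16 with #0x4000
     * -> vqrshrun.s16 #6. The hardware vhadd halves without intermediate
     * overflow; plain int arithmetic stands in for that here. */
    static uint8_t narrow_with_bias(int sum) {     /* true filter sum */
      const int16_t acc = (int16_t)(sum - 16384);  /* biased accumulator */
      const int half = (acc + 16384) >> 1;         /* vhadd.s16: bias cancels */
      int v = (half + 32) >> 6;                    /* vqrshrun.s16 #6 */
      if (v < 0) v = 0;
      if (v > 255) v = 255;                        /* unsigned saturation */
      return (uint8_t)v;
    }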
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlsl.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlal.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlal.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlsl.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlal.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q5, d17, d29 ;mul_res = 
vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlsl.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlsl.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlal.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlal.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlsl.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlal.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlal.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlsl.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! 
;store the result pu1_dst + vmlsl.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlal.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlal.u8 q11, d15, d29 + vmlsl.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlal.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlsl.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlsl.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlal.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlal.u8 q4, d5, d29 + vmlsl.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! 
;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm new file mode 100644 index 00000000000..90b2c8fef7a --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm @@ -0,0 +1,415 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r3 => dst_stride +; r4 => filter_x0 +; r8 => ht +; r10 => wd + + EXPORT |vpx_convolve8_horiz_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_horiz_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + +start_loop_count + ldr r4, [sp, #104] ;loads pi1_coeff + ldr r8, [sp, #108] ;loads x0_q4 + add r4, r4, r8, lsl #4 ;r4 = filter[x0_q4] + ldr r8, [sp, #128] ;loads ht + ldr r10, [sp, #124] ;loads wd + vld2.8 {d0, d1}, [r4] ;coeff = vld1_s8(pi1_coeff) + mov r11, #1 + subs r14, r8, #0 ;checks for ht == 0 + vabs.s8 d2, d0 ;vabs_s8(coeff) + vdup.8 d24, d2[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0) + sub r12, r0, #3 ;pu1_src - 3 + vdup.8 d25, d2[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1) + add r4, r12, r2 ;pu1_src_tmp2_8 = pu1_src + src_strd + vdup.8 d26, d2[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2) + rsb r9, r10, r2, lsl #1 ;2*src_strd - wd + vdup.8 d27, d2[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3) + rsb r8, r10, r3, lsl #1 ;2*dst_strd - wd + vdup.8 d28, d2[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4) + vdup.8 d29, d2[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5) + vdup.8 d30, d2[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6) + vdup.8 d31, d2[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7) + mov r7, r1 + cmp r10, #4 + ble outer_loop_4 + + cmp r10, #24 + moveq r10, #16 + addeq r8, #8 + addeq r9, #8 + cmp r10, #16 + bge outer_loop_16 + + cmp r10, #12 + addeq r8, #4 + addeq r9, #4 + b outer_loop_8 + +outer_loop8_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + mov r14, #32 + add r1, #16 + add r12, #16 + mov r10, #8 + add r8, #8 + add r9, #8 + +outer_loop_8 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_8 + +inner_loop_8 + mov r7, #0xc000 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {d1}, [r12], r11 + vdup.16 q5, r7 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + mov r7, #0x4000 + vld1.u32 {d4}, [r12], r11 + vmlal.u8 q4, d1, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {d5}, [r12], r11 + vmlal.u8 
q4, d3, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d6}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {d7}, [r12], r11 + vmlsl.u8 q4, d2, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vmlal.u8 q4, d4, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d13}, [r4], r11 + vmlsl.u8 q4, d5, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vld1.u32 {d14}, [r4], r11 + vmlal.u8 q4, d6, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vld1.u32 {d15}, [r4], r11 + vmlsl.u8 q4, d7, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vld1.u32 {d16}, [r4], r11 ;vector load pu1_src + src_strd + vdup.16 q11, r7 + vmlal.u8 q5, d15, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {d17}, [r4], r11 + vmlsl.u8 q5, d14, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vhadd.s16 q4, q4, q11 + vld1.u32 {d18}, [r4], r11 + vmlal.u8 q5, d16, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vld1.u32 {d19}, [r4], r11 ;vector load pu1_src + src_strd + vmlsl.u8 q5, d17, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vmlal.u8 q5, d18, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d19, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + vqrshrun.s16 d20, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlsl.u8 q5, d12, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d13, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vst1.8 {d20}, [r1]! ;store the result pu1_dst + vhadd.s16 q5, q5, q11 + subs r5, r5, #8 ;decrement the wd loop + vqrshrun.s16 d8, q5, #6 ;right shift and saturating narrow + ; result 2 + vst1.8 {d8}, [r6]! ;store the result pu1_dst + cmp r5, #4 + bgt inner_loop_8 + +end_inner_loop_8 + subs r14, r14, #2 ;decrement the ht loop + add r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + add r1, r1, r8 ;increment the dst pointer by + ; 2*dst_strd-wd + bgt outer_loop_8 + + ldr r10, [sp, #120] ;loads wd + cmp r10, #12 + beq outer_loop4_residual + +end_loops + b end_func + +outer_loop_16 + str r0, [sp, #-4]! + str r7, [sp, #-4]! 
+ add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + and r0, r12, #31 + mov r7, #0xc000 + sub r5, r10, #0 ;checks wd + pld [r4, r2, lsl #1] + pld [r12, r2, lsl #1] + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + vdup.16 q4, r7 + vld1.u32 {q1}, [r12], r11 + vld1.u32 {q2}, [r12], r11 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q7}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q8}, [r12], r11 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + vld1.u32 {q9}, [r12], r11 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vdup.16 q10, r7 + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + +inner_loop_16 + vmlsl.u8 q10, d1, d24 + vdup.16 q5, r7 + vmlal.u8 q10, d3, d25 + mov r7, #0x4000 + vdup.16 q11, r7 + vmlsl.u8 q10, d5, d26 + vld1.u32 {q0}, [r4], r11 ;vector load pu1_src + vhadd.s16 q4, q4, q11 + vld1.u32 {q1}, [r4], r11 + vmlal.u8 q10, d7, d27 + add r12, #8 + subs r5, r5, #16 + vmlal.u8 q10, d13, d28 + vld1.u32 {q2}, [r4], r11 + vmlsl.u8 q10, d15, d29 + vld1.u32 {q3}, [r4], r11 + vqrshrun.s16 d8, q4, #6 ;right shift and saturating narrow + ; result 1 + vmlal.u8 q10, d17, d30 + vld1.u32 {q6}, [r4], r11 + vmlsl.u8 q10, d19, d31 + vld1.u32 {q7}, [r4], r11 + vmlsl.u8 q5, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vmlal.u8 q5, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r4], r11 + vhadd.s16 q10, q10, q11 + vld1.u32 {q9}, [r4], r11 + vmlsl.u8 q5, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vmlal.u8 q5, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + add r4, #8 + mov r7, #0xc000 + vmlal.u8 q5, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vmlsl.u8 q5, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vqrshrun.s16 d9, q10, #6 + vdup.16 q11, r7 + vmlal.u8 q5, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q5, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + mov r7, #0x4000 + vmlsl.u8 q11, d1, d24 + vst1.8 {q4}, [r1]! 
;store the result pu1_dst + vmlal.u8 q11, d3, d25 + vdup.16 q10, r7 + vmlsl.u8 q11, d5, d26 + pld [r12, r2, lsl #2] + pld [r4, r2, lsl #2] + addeq r12, r12, r9 ;increment the src pointer by + ; 2*src_strd-wd + addeq r4, r12, r2 ;pu1_src + src_strd + vmlal.u8 q11, d7, d27 + addeq r1, r1, r8 + subeq r14, r14, #2 + vmlal.u8 q11, d13, d28 + vhadd.s16 q5, q5, q10 + vmlsl.u8 q11, d15, d29 + vmlal.u8 q11, d17, d30 + cmp r14, #0 + vmlsl.u8 q11, d19, d31 + vqrshrun.s16 d10, q5, #6 ;right shift and saturating narrow + ; result 2 + beq epilog_16 + + vld1.u32 {q0}, [r12], r11 ;vector load pu1_src + mov r7, #0xc000 + cmp r5, #0 + vld1.u32 {q1}, [r12], r11 + vhadd.s16 q11, q11, q10 + vld1.u32 {q2}, [r12], r11 + vdup.16 q4, r7 + vld1.u32 {q3}, [r12], r11 + vmlsl.u8 q4, d0, d24 ;mul_res = vmlsl_u8(src[0_0], + ; coeffabs_0); + vld1.u32 {q6}, [r12], r11 + vld1.u32 {q7}, [r12], r11 + vmlal.u8 q4, d2, d25 ;mul_res = vmlal_u8(src[0_1], + ; coeffabs_1); + vld1.u32 {q8}, [r12], r11 + vmlsl.u8 q4, d4, d26 ;mul_res = vmlsl_u8(src[0_2], + ; coeffabs_2); + vld1.u32 {q9}, [r12], r11 + vqrshrun.s16 d11, q11, #6 + vmlal.u8 q4, d6, d27 ;mul_res = vmull_u8(src[0_3], + ; coeffabs_3); + moveq r5, r10 + vmlal.u8 q4, d12, d28 ;mul_res = vmlal_u8(src[0_4], + ; coeffabs_4); + vdup.16 q10, r7 + vmlsl.u8 q4, d14, d29 ;mul_res = vmlsl_u8(src[0_5], + ; coeffabs_5); + vst1.8 {q5}, [r6]! ;store the result pu1_dst + vmlal.u8 q4, d16, d30 ;mul_res = vmlal_u8(src[0_6], + ; coeffabs_6); + vmlsl.u8 q4, d18, d31 ;mul_res = vmlsl_u8(src[0_7], + ; coeffabs_7); + addeq r6, r1, r3 ;pu1_dst + dst_strd + b inner_loop_16 + +epilog_16 + mov r7, #0x4000 + ldr r0, [sp], #4 + ldr r10, [sp, #120] + vdup.16 q10, r7 + vhadd.s16 q11, q11, q10 + vqrshrun.s16 d11, q11, #6 + vst1.8 {q5}, [r6]! ;store the result pu1_dst + ldr r7, [sp], #4 + cmp r10, #24 + beq outer_loop8_residual + +end_loops1 + b end_func + +outer_loop4_residual + sub r12, r0, #3 ;pu1_src - 3 + mov r1, r7 + add r1, #8 + mov r10, #4 + add r12, #8 + mov r14, #16 + add r8, #4 + add r9, #4 + +outer_loop_4 + add r6, r1, r3 ;pu1_dst + dst_strd + add r4, r12, r2 ;pu1_src + src_strd + subs r5, r10, #0 ;checks wd + ble end_inner_loop_4 + +inner_loop_4 + vld1.u32 {d0}, [r12], r11 ;vector load pu1_src + vld1.u32 {d1}, [r12], r11 + vld1.u32 {d2}, [r12], r11 + vld1.u32 {d3}, [r12], r11 + vld1.u32 {d4}, [r12], r11 + vld1.u32 {d5}, [r12], r11 + vld1.u32 {d6}, [r12], r11 + vld1.u32 {d7}, [r12], r11 + sub r12, r12, #4 + vld1.u32 {d12}, [r4], r11 ;vector load pu1_src + src_strd + vld1.u32 {d13}, [r4], r11 + vzip.32 d0, d12 ;vector zip the i iteration and ii + ; interation in single register + vld1.u32 {d14}, [r4], r11 + vzip.32 d1, d13 + vld1.u32 {d15}, [r4], r11 + vzip.32 d2, d14 + vld1.u32 {d16}, [r4], r11 + vzip.32 d3, d15 + vld1.u32 {d17}, [r4], r11 + vzip.32 d4, d16 + vld1.u32 {d18}, [r4], r11 + vzip.32 d5, d17 + vld1.u32 {d19}, [r4], r11 + mov r7, #0xc000 + vdup.16 q4, r7 + sub r4, r4, #4 + vzip.32 d6, d18 + vzip.32 d7, d19 + vmlal.u8 q4, d1, d25 ;arithmetic operations for ii + ; iteration in the same time + vmlsl.u8 q4, d0, d24 + vmlsl.u8 q4, d2, d26 + vmlal.u8 q4, d3, d27 + vmlal.u8 q4, d4, d28 + vmlsl.u8 q4, d5, d29 + vmlal.u8 q4, d6, d30 + vmlsl.u8 q4, d7, d31 + mov r7, #0x4000 + vdup.16 q10, r7 + vhadd.s16 q4, q4, q10 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r1]! ;store the i iteration result which + ; is in upper part of the register + vst1.32 {d8[1]},[r6]! 
;store the ii iteration result which + ; is in lower part of the register + subs r5, r5, #4 ;decrement the wd by 4 + bgt inner_loop_4 + +end_inner_loop_4 + subs r14, r14, #2 ;decrement the ht by 4 + add r12, r12, r9 ;increment the input pointer + ; 2*src_strd-wd + add r1, r1, r8 ;increment the output pointer + ; 2*dst_strd-wd + bgt outer_loop_4 + +end_func + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h index c1634ed55f6..4f27da9d2f4 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -8,6 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ + #include #include "./vpx_config.h" @@ -131,3 +134,5 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], filters, filter3, filter4); } + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_ diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm deleted file mode 100644 index 5eee15664de..00000000000 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - - ; These functions are only valid when: - ; x_step_q4 == 16 - ; w%4 == 0 - ; h%4 == 0 - ; taps == 8 - ; VP9_FILTER_WEIGHT == 128 - ; VP9_FILTER_SHIFT == 7 - - EXPORT |vpx_convolve8_horiz_neon| - EXPORT |vpx_convolve8_vert_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Multiply and accumulate by q0 - MACRO - MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7 - vmull.s16 $dst, $src0, d0[0] - vmlal.s16 $dst, $src1, d0[1] - vmlal.s16 $dst, $src2, d0[2] - vmlal.s16 $dst, $src3, d0[3] - vmlal.s16 $dst, $src4, d1[0] - vmlal.s16 $dst, $src5, d1[1] - vmlal.s16 $dst, $src6, d1[2] - vmlal.s16 $dst, $src7, d1[3] - MEND - -; r0 const uint8_t *src -; r1 int src_stride -; r2 uint8_t *dst -; r3 int dst_stride -; sp[]const int16_t *filter -; sp[]int x0_q4 -; sp[]int x_step_q4 ; unused -; sp[]int y0_q4 -; sp[]int y_step_q4 ; unused -; sp[]int w -; sp[]int h - -|vpx_convolve8_horiz_neon| PROC - push {r4-r10, lr} - - sub r0, r0, #3 ; adjust for taps - - ldrd r4, r5, [sp, #32] ; filter, x0_q4 - add r4, r5, lsl #4 - ldrd r6, r7, [sp, #52] ; w, h - - vld1.s16 {q0}, [r4] ; filter - - sub r8, r1, r1, lsl #2 ; -src_stride * 3 - add r8, r8, #4 ; -src_stride * 3 + 4 - - sub r4, r3, r3, lsl #2 ; -dst_stride * 3 - add r4, r4, #4 ; -dst_stride * 3 + 4 - - rsb r9, r6, r1, lsl #2 ; reset src for outer loop - sub r9, r9, #7 - rsb r12, r6, r3, lsl #2 ; reset dst for outer loop - - mov r10, r6 ; w loop counter - -vpx_convolve8_loop_horiz_v - vld1.8 {d24}, [r0], r1 - vld1.8 {d25}, [r0], r1 - vld1.8 {d26}, [r0], r1 - vld1.8 {d27}, [r0], r8 - - vtrn.16 q12, q13 - vtrn.8 d24, d25 - vtrn.8 d26, d27 - - pld [r0, r1, lsl #2] - - vmovl.u8 q8, d24 - vmovl.u8 q9, d25 - vmovl.u8 q10, d26 - vmovl.u8 q11, d27 - - ; save a few instructions in the inner loop - vswp d17, d18 - vmov d23, d21 - - add r0, r0, #3 - -vpx_convolve8_loop_horiz - add r5, r0, #64 - - vld1.32 {d28[]}, [r0], r1 - vld1.32 {d29[]}, [r0], r1 - vld1.32 {d31[]}, [r0], r1 - vld1.32 {d30[]}, [r0], r8 - - pld [r5] - - vtrn.16 d28, d31 - vtrn.16 d29, d30 - vtrn.8 d28, d29 - vtrn.8 d31, d30 - - pld [r5, r1] - - ; extract to s16 - vtrn.32 q14, q15 - vmovl.u8 q12, d28 - vmovl.u8 q13, d29 - - pld [r5, r1, lsl #1] - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 - MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 - MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 - MULTIPLY_BY_Q0 q15, d22, d18, d19, d23, d24, d26, d27, d25 - - pld [r5, -r8] - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - ; transpose - vtrn.16 d2, d3 - vtrn.32 d2, d3 - vtrn.8 d2, d3 - - vst1.u32 {d2[0]}, [r2@32], r3 - vst1.u32 {d3[0]}, [r2@32], r3 - vst1.u32 {d2[1]}, [r2@32], r3 - vst1.u32 {d3[1]}, [r2@32], r4 - - vmov q8, q9 - vmov d20, d23 - vmov q11, q12 - vmov q9, q13 - - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_loop_horiz - - ; outer loop - mov r6, r10 ; restore w counter - add r0, r0, r9 ; src += src_stride * 4 - w - add r2, r2, r12 ; dst += dst_stride * 4 - w - subs r7, r7, #4 ; h -= 4 - bgt vpx_convolve8_loop_horiz_v - - pop {r4-r10, pc} - - ENDP - -|vpx_convolve8_vert_neon| PROC - push {r4-r8, lr} - - ; adjust for taps - sub r0, r0, r1 - sub r0, r0, r1, lsl #1 - - ldr r4, [sp, #24] ; filter - ldr r5, [sp, #36] ; y0_q4 - add r4, r5, lsl #4 - ldr r6, [sp, #44] ; w - ldr lr, [sp, #48] ; h - - vld1.s16 {q0}, [r4] ; filter - - lsl r1, r1, #1 - lsl r3, r3, #1 
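Everything this patch deletes here and re-adds as the per-type kernels later computes the same 8-tap sub-pixel filter; only the instruction scheduling differs. For orientation, a scalar C sketch of what one output pixel of the vertical pass works out to, matching the MULTIPLY_BY_Q0 accumulation and the "+= 64 >> 7" / saturate sequence in this listing (a sketch with illustrative names, not libvpx API):

#include <stddef.h>
#include <stdint.h>

/* One output pixel of the 8-tap vertical convolution: src points 3 rows
 * above the output position (the "adjust for taps" subtraction above),
 * the kernel sums to VP9_FILTER_WEIGHT == 128, and the result is rounded
 * by VP9_FILTER_SHIFT == 7 then saturated to 8 bits, mirroring
 * vqrshrun.s32 #7 followed by vqmovn.u16. */
static uint8_t convolve8_vert_pixel(const uint8_t *src, ptrdiff_t stride,
                                    const int16_t filter[8]) {
  int32_t sum = 0;
  int k;
  for (k = 0; k < 8; ++k) sum += (int32_t)src[k * stride] * filter[k];
  sum = (sum + 64) >> 7;
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}

int main(void) {
  static const uint8_t col[16] = { 10, 20,  30,  40,  50,  60,  70,  80,
                                   90, 100, 110, 120, 130, 140, 150, 160 };
  /* all weight on tap 3: the unity kernel returns src[3 * stride] */
  static const int16_t k[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
  return convolve8_vert_pixel(col, 1, k) == 40 ? 0 : 1;
}

The horizontal pass is the same arithmetic with stride 1 and a transposed load pattern.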
- -vpx_convolve8_loop_vert_h - mov r4, r0 - add r7, r0, r1, asr #1 - mov r5, r2 - add r8, r2, r3, asr #1 - mov r12, lr ; h loop counter - - vld1.u32 {d16[0]}, [r4], r1 - vld1.u32 {d16[1]}, [r7], r1 - vld1.u32 {d18[0]}, [r4], r1 - vld1.u32 {d18[1]}, [r7], r1 - vld1.u32 {d20[0]}, [r4], r1 - vld1.u32 {d20[1]}, [r7], r1 - vld1.u32 {d22[0]}, [r4], r1 - - vmovl.u8 q8, d16 - vmovl.u8 q9, d18 - vmovl.u8 q10, d20 - vmovl.u8 q11, d22 - -vpx_convolve8_loop_vert - ; always process a 4x4 block at a time - vld1.u32 {d24[0]}, [r7], r1 - vld1.u32 {d26[0]}, [r4], r1 - vld1.u32 {d26[1]}, [r7], r1 - vld1.u32 {d24[1]}, [r4], r1 - - ; extract to s16 - vmovl.u8 q12, d24 - vmovl.u8 q13, d26 - - pld [r5] - pld [r8] - - ; src[] * filter - MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 - - pld [r5, r3] - pld [r8, r3] - - MULTIPLY_BY_Q0 q2, d17, d18, d19, d20, d21, d22, d24, d26 - - pld [r7] - pld [r4] - - MULTIPLY_BY_Q0 q14, d18, d19, d20, d21, d22, d24, d26, d27 - - pld [r7, r1] - pld [r4, r1] - - MULTIPLY_BY_Q0 q15, d19, d20, d21, d22, d24, d26, d27, d25 - - ; += 64 >> 7 - vqrshrun.s32 d2, q1, #7 - vqrshrun.s32 d3, q2, #7 - vqrshrun.s32 d4, q14, #7 - vqrshrun.s32 d5, q15, #7 - - ; saturate - vqmovn.u16 d2, q1 - vqmovn.u16 d3, q2 - - vst1.u32 {d2[0]}, [r5@32], r3 - vst1.u32 {d2[1]}, [r8@32], r3 - vst1.u32 {d3[0]}, [r5@32], r3 - vst1.u32 {d3[1]}, [r8@32], r3 - - vmov q8, q10 - vmov d18, d22 - vmov d19, d24 - vmov q10, q13 - vmov d22, d25 - - subs r12, r12, #4 ; h -= 4 - bgt vpx_convolve8_loop_vert - - ; outer loop - add r0, r0, #4 - add r2, r2, #4 - subs r6, r6, #4 ; w -= 4 - bgt vpx_convolve8_loop_vert_h - - pop {r4-r8, pc} - - ENDP - END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c new file mode 100644 index 00000000000..4470b28b884 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_filter.h" +#include "vpx_dsp/arm/vpx_convolve8_neon_asm.h" + +/* Type1 and Type2 functions are called depending on the position of the + * negative and positive coefficients in the filter. In type1, the filter kernel + * used is sub_pel_filters_8lp, in which only the first two and the last two + * coefficients are negative. In type2, the negative coefficients are 0, 2, 5 & + * 7. 
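 *
 * As a sketch of what the DEFINE_FILTER macro below expands to (assuming
 * the vp9_filter_kernels[] table from vp9/common/vp9_filter.h, where
 * index 1 holds the smooth kernel, sub_pel_filters_8lp):
 *
 *   if (filter == vp9_filter_kernels[1])
 *     vpx_convolve8_horiz_filter_type1_neon(src, src_stride, dst,
 *         dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
 *   else
 *     vpx_convolve8_horiz_filter_type2_neon(src, src_stride, dst,
 *         dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
 *
 * i.e. the kernel is classified once per call by table identity rather
 * than by re-inspecting coefficient signs at run time.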
+ */ + +#define DEFINE_FILTER(dir) \ + void vpx_convolve8_##dir##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + if (filter == vp9_filter_kernels[1]) { \ + vpx_convolve8_##dir##_filter_type1_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + vpx_convolve8_##dir##_filter_type2_neon( \ + src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } \ + } + +DEFINE_FILTER(horiz); +DEFINE_FILTER(avg_horiz); +DEFINE_FILTER(vert); +DEFINE_FILTER(avg_vert); diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h new file mode 100644 index 00000000000..b123d1cb080 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ +#define VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ + +#define DECLARE_FILTER(dir, type) \ + void vpx_convolve8_##dir##_filter_##type##_neon( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); + +DECLARE_FILTER(horiz, type1); +DECLARE_FILTER(avg_horiz, type1); +DECLARE_FILTER(horiz, type2); +DECLARE_FILTER(avg_horiz, type2); +DECLARE_FILTER(vert, type1); +DECLARE_FILTER(avg_vert, type1); +DECLARE_FILTER(vert, type2); +DECLARE_FILTER(avg_vert, type2); + +#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_ diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm new file mode 100644 index 00000000000..2666d4253e4 --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm @@ -0,0 +1,457 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type1_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type1_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! + bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlsl.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlsl.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlal.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlal.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlsl.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlsl.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlal.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlal.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlsl.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlal.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlal.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlsl.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, 
d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlal.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlal.u8 q6, d7, d27 + vmlsl.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlal.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlal.u8 q7, d16, d27 + vmlsl.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from + ; sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlsl.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlal.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlsl.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlal.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlal.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = 
vdup_lane_u32(src_tmp3, + ; 1); + vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlal.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlsl.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm new file mode 100644 index 00000000000..cb5d6d3fe5c --- /dev/null +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm @@ -0,0 +1,455 @@ +; +; Copyright (c) 2018 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; +;**************Variables Vs Registers*********************************** +; r0 => src +; r1 => dst +; r2 => src_stride +; r6 => dst_stride +; r12 => filter_y0 +; r5 => ht +; r3 => wd + + EXPORT |vpx_convolve8_vert_filter_type2_neon| + ARM + REQUIRE8 + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +|vpx_convolve8_vert_filter_type2_neon| PROC + + stmfd sp!, {r4 - r12, r14} ;stack stores the values of + ; the arguments + vpush {d8 - d15} ; stack offset by 64 + mov r4, r1 + mov r1, r2 + mov r2, r4 + vmov.i16 q15, #0x4000 + mov r11, #0xc000 + ldr r12, [sp, #104] ;load filter + ldr r6, [sp, #116] ;load y0_q4 + add r12, r12, r6, lsl #4 ;r12 = filter[y0_q4] + mov r6, r3 + ldr r5, [sp, #124] ;load wd + vld2.8 {d0, d1}, [r12] ;coeff = vld1_s8(pi1_coeff) + sub r12, r2, r2, lsl #2 ;src_ctrd & pi1_coeff + vabs.s8 d0, d0 ;vabs_s8(coeff) + add r0, r0, r12 ;r0->pu1_src r12->pi1_coeff + ldr r3, [sp, #128] ;load ht + subs r7, r3, #0 ;r3->ht + vdup.u8 d22, d0[0] ;coeffabs_0 = vdup_lane_u8(coeffabs, + ; 0); + cmp r5, #8 + vdup.u8 d23, d0[1] ;coeffabs_1 = vdup_lane_u8(coeffabs, + ; 1); + vdup.u8 d24, d0[2] ;coeffabs_2 = vdup_lane_u8(coeffabs, + ; 2); + vdup.u8 d25, d0[3] ;coeffabs_3 = vdup_lane_u8(coeffabs, + ; 3); + vdup.u8 d26, d0[4] ;coeffabs_4 = vdup_lane_u8(coeffabs, + ; 4); + vdup.u8 d27, d0[5] ;coeffabs_5 = vdup_lane_u8(coeffabs, + ; 5); + vdup.u8 d28, d0[6] ;coeffabs_6 = vdup_lane_u8(coeffabs, + ; 6); + vdup.u8 d29, d0[7] ;coeffabs_7 = vdup_lane_u8(coeffabs, + ; 7); + blt core_loop_wd_4 ;core loop wd 4 jump + + str r0, [sp, #-4]! + str r1, [sp, #-4]! 
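A note on the q15 = #0x4000 / r11 = #0xc000 constants set up above, which every accumulator in these kernels uses: seeding the vmlal/vmlsl accumulator with -16384 keeps the unsigned 8-bit multiply-accumulate inside signed 16-bit range, and the later vhadd.s16 against +16384 cancels the bias while halving. A compilable C model of one lane, ending in the vqrshrun.s16 #6 narrow; under these assumptions (and the usual arithmetic right shift on negatives, which the NEON ops guarantee) the sequence is exactly the canonical (sum + 64) >> 7 rounding:

#include <assert.h>
#include <stdint.h>

/* Model of one 16-bit lane: vdup.16 q, r11 seed, u8 MACs, vhadd.s16
 * against q15, then vqrshrun.s16 #6.  "sum" stands for the true signed
 * 8-tap sum, which can exceed INT16_MAX (roughly 40000 for the sharper
 * kernels on 255-valued input) -- the -16384 seed is what keeps the
 * lane in range until the halving. */
static uint8_t biased_lane(int32_t sum) {
  const int16_t acc = (int16_t)(sum - 16384);        /* seed + MACs, no
                                                        truncation in the
                                                        tested range */
  const int16_t half = (int16_t)(((int32_t)acc + 16384) >> 1); /* vhadd */
  int32_t out = (half + 32) >> 6;                    /* vqrshrun #6 */
  return (uint8_t)(out < 0 ? 0 : (out > 255 ? 255 : out));
}

int main(void) {
  int32_t s;
  for (s = -8000; s <= 40960; ++s) {
    int32_t ref = (s + 64) >> 7; /* plain rounded FILTER_SHIFT == 7 */
    if (ref < 0) ref = 0;
    if (ref > 255) ref = 255;
    assert(biased_lane(s) == (uint8_t)ref); /* bit-exact, all sums */
  }
  return 0;
}

The halve loses no precision: floor(sum/2) + 32 equals floor((sum + 64)/2), so dividing again by 64 lands on floor((sum + 64)/128), the same value the single 7-bit rounded shift produces.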
+ bic r4, r5, #7 ;r5 ->wd + rsb r9, r4, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r4, r2, lsl #2 ;r2->src_strd + mov r3, r5, lsr #3 ;divide by 8 + mul r7, r3 ;multiply height by width + sub r7, #4 ;subtract by one for epilog + +prolog + and r10, r0, #31 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vdup.16 q4, r11 + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + subs r4, r4, #8 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vdup.16 q5, r11 + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + pld [r3] + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + pld [r3, r2] + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r3, r3, r2 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + pld [r3, r2, lsl #1] + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + + vld1.u8 {d1}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vld1.u8 {d0}, [r0]! ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d2, d22 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q6, d4, d24 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d5, d25 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! 
;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + addle r1, r1, r9 + vmlal.u8 q7, d4, d23 + subs r7, r7, #4 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + blt epilog_end ;jumps to epilog_end + + beq epilog ;jumps to epilog + +main_loop_8 + subs r4, r4, #8 + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + ; coeffabs_1); + addle r0, r0, r8 + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + bicle r4, r5, #7 ;r5 ->wd + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + add r3, r0, r2 ;pu1_src_tmp += src_strd; + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vld1.u8 {d0}, [r0]! 
;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vld1.u8 {d1}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vst1.8 {d14}, [r14], r6 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + add r14, r1, #0 + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + add r1, r1, #8 + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + addle r1, r1, r9 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vmlal.u8 q6, d3, d23 + add r10, r3, r2, lsl #3 ; 10*strd - 8+2 + vmlsl.u8 q6, d2, d22 + add r10, r10, r2 ; 11*strd + vmlsl.u8 q6, d4, d24 + vld1.u8 {d2}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vst1.8 {d8}, [r14], r6 ;vst1_u8(pu1_dst,sto_res); + pld [r10] ;11+ 0 + vmlsl.u8 q6, d7, d27 + pld [r10, r2] ;11+ 1*strd + vmlal.u8 q6, d16, d28 + pld [r10, r2, lsl #1] ;11+ 2*strd + vmlsl.u8 q6, d17, d29 + add r10, r10, r2 ;12*strd + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + pld [r10, r2, lsl #1] ;11+ 3*strd + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + subs r7, r7, #4 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vld1.u8 {d3}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vhadd.s16 q6, q6, q15 + vdup.16 q4, r11 + vmlal.u8 q7, d7, d26 + vld1.u8 {d4}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d16, d27 + vld1.u8 {d5}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d17, d28 + vld1.u8 {d6}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlsl.u8 q7, d18, d29 + vld1.u8 {d7}, [r3], r2 ;src_tmp4 = vld1_u8(pu1_src_tmp); + vqrshrun.s16 d12, q6, #6 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + bgt main_loop_8 ;jumps to main_loop_8 + +epilog + vmlal.u8 q4, d1, d23 ;mul_res1 = vmull_u8(src_tmp2, + vmlsl.u8 q4, d0, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp1, coeffabs_0); + vmlsl.u8 q4, d2, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp3, coeffabs_2); + vmlal.u8 q4, d3, d25 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp4, coeffabs_3); + vhadd.s16 q7, q7, q15 + vdup.16 q5, r11 + vmlal.u8 q4, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp1, coeffabs_4); + vmlsl.u8 q4, d5, d27 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp2, coeffabs_5); + vmlal.u8 q4, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; src_tmp3, coeffabs_6); + vmlsl.u8 q4, d7, d29 ;mul_res1 = vmlsl_u8(mul_res1, + ; src_tmp4, coeffabs_7); + vst1.8 {d12}, [r14], r6 + vqrshrun.s16 d14, q7, #6 + vld1.u8 {d16}, [r3], r2 ;src_tmp1 = vld1_u8(pu1_src_tmp); + vmlal.u8 q5, d2, d23 ;mul_res2 = vmull_u8(src_tmp3, + ; coeffabs_1); + vmlsl.u8 q5, d1, d22 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp2, coeffabs_0); + vmlsl.u8 q5, d3, d24 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp4, coeffabs_2); + vmlal.u8 q5, d4, d25 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp1, coeffabs_3); + vhadd.s16 q4, q4, q15 + vdup.16 q6, r11 + vmlal.u8 q5, d5, d26 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp2, coeffabs_4); + vmlsl.u8 q5, d6, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; src_tmp3, coeffabs_5); + vmlal.u8 q5, d7, d28 ;mul_res2 = vmlal_u8(mul_res2, + ; src_tmp4, coeffabs_6); + vmlsl.u8 q5, d16, d29 ;mul_res2 = 
vmlsl_u8(mul_res2, + ; src_tmp1, coeffabs_7); + vst1.8 {d14}, [r14], r6 + vqrshrun.s16 d8, q4, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d17}, [r3], r2 ;src_tmp2 = vld1_u8(pu1_src_tmp); + vmlal.u8 q6, d3, d23 + vmlsl.u8 q6, d2, d22 + vmlsl.u8 q6, d4, d24 + vmlal.u8 q6, d5, d25 + vhadd.s16 q5, q5, q15 + vdup.16 q7, r11 + vmlal.u8 q6, d6, d26 + vmlsl.u8 q6, d7, d27 + vmlal.u8 q6, d16, d28 + vmlsl.u8 q6, d17, d29 + add r14, r1, r6 + vst1.8 {d8}, [r1]! ;vst1_u8(pu1_dst,sto_res); + vqrshrun.s16 d10, q5, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u8 {d18}, [r3], r2 ;src_tmp3 = vld1_u8(pu1_src_tmp); + vmlal.u8 q7, d4, d23 + vmlsl.u8 q7, d3, d22 + vmlsl.u8 q7, d5, d24 + vmlal.u8 q7, d6, d25 + vhadd.s16 q6, q6, q15 + vmlal.u8 q7, d7, d26 + vmlsl.u8 q7, d16, d27 + vmlal.u8 q7, d17, d28 + vmlsl.u8 q7, d18, d29 + vst1.8 {d10}, [r14], r6 ;vst1_u8(pu1_dst_tmp,sto_res); + vqrshrun.s16 d12, q6, #6 + +epilog_end + vst1.8 {d12}, [r14], r6 + vhadd.s16 q7, q7, q15 + vqrshrun.s16 d14, q7, #6 + vst1.8 {d14}, [r14], r6 + +end_loops + tst r5, #7 + ldr r1, [sp], #4 + ldr r0, [sp], #4 + vpopeq {d8 - d15} + ldmfdeq sp!, {r4 - r12, r15} ;reload the registers from sp + mov r5, #4 + add r0, r0, #8 + add r1, r1, #8 + mov r7, #16 + +core_loop_wd_4 + rsb r9, r5, r6, lsl #2 ;r6->dst_strd r5 ->wd + rsb r8, r5, r2, lsl #2 ;r2->src_strd + vmov.i8 d4, #0 + +outer_loop_wd_4 + subs r12, r5, #0 + ble end_inner_loop_wd_4 ;outer loop jump + +inner_loop_wd_4 + add r3, r0, r2 + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + subs r12, r12, #4 + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vld1.u32 {d4[0]},[r0] ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 0); + vdup.16 q0, r11 + vmlal.u8 q0, d5, d23 ;mul_res1 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp2), coeffabs_1); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + add r0, r0, #4 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlsl.u8 q0, d4, d22 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_0); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q0, d6, d24 ;mul_res1 = vmlsl_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_2); + vdup.16 q4, r11 + vmlal.u8 q4, d7, d23 + vdup.u32 d4, d7[1] ;src_tmp1 = vdup_lane_u32(src_tmp4, + ; 1); + vmull.u8 q1, d7, d25 ;mul_res2 = + ; vmull_u8(vreinterpret_u8_u32(src_tmp4), coeffabs_3); + vld1.u32 {d4[1]},[r3], r2 ;src_tmp1 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp1, 1); + vmlsl.u8 q4, d6, d22 + vmlal.u8 q0, d4, d26 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp1), coeffabs_4); + vdup.u32 d5, d4[1] ;src_tmp2 = vdup_lane_u32(src_tmp1, + ; 1); + vmlsl.u8 q4, d4, d24 + vld1.u32 {d5[1]},[r3], r2 ;src_tmp2 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp2, 1); + vmlsl.u8 q1, d5, d27 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp2), coeffabs_5); + vdup.u32 d6, d5[1] ;src_tmp3 = vdup_lane_u32(src_tmp2, + ; 1); + vmlal.u8 q4, d5, d25 + vld1.u32 {d6[1]},[r3], r2 ;src_tmp3 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp3, 1); + vmlal.u8 q0, d6, d28 ;mul_res1 = vmlal_u8(mul_res1, + ; vreinterpret_u8_u32(src_tmp3), coeffabs_6); + vdup.u32 d7, d6[1] ;src_tmp4 = vdup_lane_u32(src_tmp3, + ; 1); 
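    ; each 64-bit d-register above packs the same 4-byte column from two
    ; consecutive source rows (vld1.u32 into lane 1, then vdup.u32 of
    ; that lane before the next load), so every widening MAC in this
    ; loop filters two output rows at once; d0[0]/d0[1] and d8[0]/d8[1]
    ; are stored one dst_strd (r6) apart below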
+ vmlal.u8 q4, d6, d26 + vld1.u32 {d7[1]},[r3], r2 ;src_tmp4 = vld1_lane_u32((uint32_t + ; *)pu1_src_tmp, src_tmp4, 1); + vmlsl.u8 q1, d7, d29 ;mul_res2 = vmlsl_u8(mul_res2, + ; vreinterpret_u8_u32(src_tmp4), coeffabs_7); + vdup.u32 d4, d7[1] + vadd.i16 q0, q0, q1 ;mul_res1 = vaddq_u16(mul_res1, + ; mul_res2); + vmlsl.u8 q4, d7, d27 + vld1.u32 {d4[1]},[r3], r2 + vmlal.u8 q4, d4, d28 + vdup.u32 d5, d4[1] + vhadd.s16 q0, q0, q15 + vqrshrun.s16 d0, q0, #6 ;sto_res = vqmovun_s16(sto_res_tmp); + vld1.u32 {d5[1]},[r3] + add r3, r1, r6 + vst1.32 {d0[0]},[r1] ;vst1_lane_u32((uint32_t *)pu1_dst, + ; vreinterpret_u32_u8(sto_res), 0); + vmlsl.u8 q4, d5, d29 + vst1.32 {d0[1]},[r3], r6 ;vst1_lane_u32((uint32_t + ; *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1); + vhadd.s16 q4, q4, q15 + vqrshrun.s16 d8, q4, #6 + vst1.32 {d8[0]},[r3], r6 + add r1, r1, #4 + vst1.32 {d8[1]},[r3] + bgt inner_loop_wd_4 + +end_inner_loop_wd_4 + subs r7, r7, #4 + add r1, r1, r9 + add r0, r0, r8 + bgt outer_loop_wd_4 + + vpop {d8 - d15} + ldmfd sp!, {r4 - r12, r15} ;reload the registers from sp + + ENDP + + END diff --git a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 2bf2d890be0..830f3176d7f 100644 --- a/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/libs/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -24,7 +24,8 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, uint8_t temp[64 * 72]; // Account for the vertical phase needing 3 lines prior and 4 lines post - const int intermediate_height = h + 7; + // (+ 1 to make it divisible by 4). + const int intermediate_height = h + 8; assert(y_step_q4 == 16); assert(x_step_q4 == 16); @@ -48,7 +49,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { uint8_t temp[64 * 72]; - const int intermediate_height = h + 7; + const int intermediate_height = h + 8; assert(y_step_q4 == 16); assert(x_step_q4 == 16); diff --git a/libs/libvpx/vpx_dsp/avg.c b/libs/libvpx/vpx_dsp/avg.c index a7ac6d95388..1c45e8a73dd 100644 --- a/libs/libvpx/vpx_dsp/avg.c +++ b/libs/libvpx/vpx_dsp/avg.c @@ -32,6 +32,166 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { return (sum + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH +// src_diff: 13 bit, dynamic range [-4095, 4095] +// coeff: 16 bit +static void hadamard_highbd_col8_first_pass(const int16_t *src_diff, + ptrdiff_t src_stride, + int16_t *coeff) { + int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int16_t c0 = b0 + b2; + int16_t c1 = b1 + b3; + int16_t c2 = b0 - b2; + int16_t c3 = b1 - b3; + int16_t c4 = b4 + b6; + int16_t c5 = b5 + b7; + int16_t c6 = b4 - b6; + int16_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// src_diff: 16 bit, dynamic range [-32760, 32760] +// coeff: 19 bit +static void hadamard_highbd_col8_second_pass(const int16_t *src_diff, + ptrdiff_t 
src_stride, + int32_t *coeff) { + int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; + int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; + int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; + int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; + int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; + int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; + int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; + int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; + + int32_t c0 = b0 + b2; + int32_t c1 = b1 + b3; + int32_t c2 = b0 - b2; + int32_t c3 = b1 - b3; + int32_t c4 = b4 + b6; + int32_t c5 = b5 + b7; + int32_t c6 = b4 - b6; + int32_t c7 = b5 - b7; + + coeff[0] = c0 + c4; + coeff[7] = c1 + c5; + coeff[3] = c2 + c6; + coeff[4] = c3 + c7; + coeff[2] = c0 - c4; + coeff[6] = c1 - c5; + coeff[1] = c2 - c6; + coeff[5] = c3 - c7; +} + +// The order of the output coeff of the hadamard is not important. For +// optimization purposes the final transpose may be skipped. +void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + int16_t buffer[64]; + int32_t buffer2[64]; + int16_t *tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // src_diff: 13 bit + // buffer: 16 bit, dynamic range [-32760, 32760] + hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf); + tmp_buf += 8; + ++src_diff; + } + + tmp_buf = &buffer[0]; + for (idx = 0; idx < 8; ++idx) { + // buffer: 16 bit + // buffer2: 19 bit, dynamic range [-262080, 262080] + hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx); + ++tmp_buf; + } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; +} + +// In place 16x16 2D Hadamard transform +void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + + // coeff: 19 bit, dynamic range [-262080, 262080] + for (idx = 0; idx < 64; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; + + tran_low_t b0 = (a0 + a1) >> 1; + tran_low_t b1 = (a0 - a1) >> 1; + tran_low_t b2 = (a2 + a3) >> 1; + tran_low_t b3 = (a2 - a3) >> 1; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[64] = b1 + b3; + coeff[128] = b0 - b2; + coeff[192] = b1 - b3; + + ++coeff; + } +} + +void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 13 bit, dynamic range [-4095, 4095] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 20 bit + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; + tran_low_t b1 = (a0 - a1) >> 2; + tran_low_t b2 = (a2 + a3) >> 2; + tran_low_t b3 = (a2 - a3) >> 2; + + // new coeff dynamic range: 20 bit + coeff[0] = b0 + b2; + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // 
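The bit-depth annotations in the high-bit-depth functions above are the load-bearing part of this addition, so the arithmetic they claim is worth checking: an 8-point Hadamard stage grows the peak magnitude by at most 8x, which is why the first pass can stay in int16_t while the second must widen to int32_t. A small self-check of those bounds (a sketch, not libvpx code):

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t in_max = 4095;             /* 13-bit signed residual */
  const int32_t pass1_max = 8 * in_max;    /* 32760: still fits int16_t */
  const int32_t pass2_max = 8 * pass1_max; /* 262080: 19 bits, int32_t */
  assert(pass1_max <= INT16_MAX);
  assert(pass2_max > INT16_MAX);
  assert(pass2_max < (1 << 19));
  /* 16x16 wrapper: b = (a0 + a1) >> 1 caps at 262080, and the final
   * b0 + b2 caps at 524160 -- the "20 bit" range quoted above. */
  assert(2 * ((2 * pass2_max) >> 1) < (1 << 20));
  return 0;
}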
src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, @@ -123,6 +283,50 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, } } +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); + } + + // coeff: 15 bit, dynamic range [-16320, 16320] + for (idx = 0; idx < 256; ++idx) { + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[256]; + tran_low_t a2 = coeff[512]; + tran_low_t a3 = coeff[768]; + + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 2; + + coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[256] = b1 + b3; + coeff[512] = b0 - b2; + coeff[768] = b1 - b3; + + ++coeff; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +// coeff: dynamic range 20 bit. +// length: value range {16, 64, 256, 1024}. +int vpx_highbd_satd_c(const tran_low_t *coeff, int length) { + int i; + int satd = 0; + for (i = 0; i < length; ++i) satd += abs(coeff[i]); + + // satd: 30 bits + return satd; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. int vpx_satd_c(const tran_low_t *coeff, int length) { diff --git a/libs/libvpx/vpx_dsp/bitreader.h b/libs/libvpx/vpx_dsp/bitreader.h index 6ee2a58632c..a5927ea2ada 100644 --- a/libs/libvpx/vpx_dsp/bitreader.h +++ b/libs/libvpx/vpx_dsp/bitreader.h @@ -8,10 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_BITREADER_H_ -#define VPX_DSP_BITREADER_H_ +#ifndef VPX_VPX_DSP_BITREADER_H_ +#define VPX_VPX_DSP_BITREADER_H_ #include +#include #include #include "./vpx_config.h" @@ -19,6 +20,9 @@ #include "vpx/vp8dx.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { @@ -94,7 +98,7 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { } { - register int shift = vpx_norm[range]; + const unsigned char shift = vpx_norm[(unsigned char)range]; range <<= shift; value <<= shift; count -= shift; @@ -103,6 +107,31 @@ static INLINE int vpx_read(vpx_reader *r, int prob) { r->count = count; r->range = range; +#if CONFIG_BITSTREAM_DEBUG + { + const int queue_r = bitstream_queue_get_read(); + const int frame_idx = bitstream_queue_get_frame_read(); + int ref_result, ref_prob; + bitstream_queue_pop(&ref_result, &ref_prob); + if ((int)bit != ref_result) { + fprintf(stderr, + "\n *** [bit] result error, frame_idx_r %d bit %d ref_result %d " + "queue_r %d\n", + frame_idx, bit, ref_result, queue_r); + + assert(0); + } + if (prob != ref_prob) { + fprintf(stderr, + "\n *** [bit] prob error, frame_idx_r %d prob %d ref_prob %d " + "queue_r %d\n", + frame_idx, prob, ref_prob, queue_r); + + assert(0); + } + } +#endif + return bit; } @@ -131,4 +160,4 @@ static INLINE int vpx_read_tree(vpx_reader *r, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VPX_DSP_BITREADER_H_ +#endif // VPX_VPX_DSP_BITREADER_H_ diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.c b/libs/libvpx/vpx_dsp/bitreader_buffer.c index 3e16bfa38c2..f59f1f7cb9d 100644 --- a/libs/libvpx/vpx_dsp/bitreader_buffer.c +++ b/libs/libvpx/vpx_dsp/bitreader_buffer.c @@ -23,7 +23,7 @@ int vpx_rb_read_bit(struct vpx_read_bit_buffer *rb) { rb->bit_offset = off + 1; return bit; } else { - rb->error_handler(rb->error_handler_data); + if (rb->error_handler != NULL) rb->error_handler(rb->error_handler_data); return 0; } } diff --git a/libs/libvpx/vpx_dsp/bitreader_buffer.h b/libs/libvpx/vpx_dsp/bitreader_buffer.h index 8a48a95ed19..b27703a4dbc 100644 --- a/libs/libvpx/vpx_dsp/bitreader_buffer.h +++ b/libs/libvpx/vpx_dsp/bitreader_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_BITREADER_BUFFER_H_ -#define VPX_DSP_BITREADER_BUFFER_H_ +#ifndef VPX_VPX_DSP_BITREADER_BUFFER_H_ +#define VPX_VPX_DSP_BITREADER_BUFFER_H_ #include @@ -44,4 +44,4 @@ int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb, int bits); } // extern "C" #endif -#endif // VPX_DSP_BITREADER_BUFFER_H_ +#endif // VPX_VPX_DSP_BITREADER_BUFFER_H_ diff --git a/libs/libvpx/vpx_dsp/bitwriter.c b/libs/libvpx/vpx_dsp/bitwriter.c index 81e28b309f5..5b41aa54ddc 100644 --- a/libs/libvpx/vpx_dsp/bitwriter.c +++ b/libs/libvpx/vpx_dsp/bitwriter.c @@ -12,6 +12,10 @@ #include "./bitwriter.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif + void vpx_start_encode(vpx_writer *br, uint8_t *source) { br->lowvalue = 0; br->range = 255; @@ -24,8 +28,15 @@ void vpx_start_encode(vpx_writer *br, uint8_t *source) { void vpx_stop_encode(vpx_writer *br) { int i; +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(1); +#endif for (i = 0; i < 32; i++) vpx_write_bit(br, 0); // Ensure there's no ambigous collision with any index marker bytes if ((br->buffer[br->pos - 1] & 0xe0) == 0xc0) br->buffer[br->pos++] = 0; + +#if CONFIG_BITSTREAM_DEBUG + bitstream_queue_set_skip_write(0); +#endif } diff --git a/libs/libvpx/vpx_dsp/bitwriter.h b/libs/libvpx/vpx_dsp/bitwriter.h index 41040cf9354..f276feefb1d 100644 --- a/libs/libvpx/vpx_dsp/bitwriter.h +++ b/libs/libvpx/vpx_dsp/bitwriter.h @@ -8,12 +8,17 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_BITWRITER_H_ -#define VPX_DSP_BITWRITER_H_ +#ifndef VPX_VPX_DSP_BITWRITER_H_ +#define VPX_VPX_DSP_BITWRITER_H_ + +#include #include "vpx_ports/mem.h" #include "vpx_dsp/prob.h" +#if CONFIG_BITSTREAM_DEBUG +#include "vpx_util/vpx_debug_util.h" +#endif // CONFIG_BITSTREAM_DEBUG #ifdef __cplusplus extern "C" { @@ -27,15 +32,30 @@ typedef struct vpx_writer { uint8_t *buffer; } vpx_writer; -void vpx_start_encode(vpx_writer *bc, uint8_t *buffer); -void vpx_stop_encode(vpx_writer *bc); +void vpx_start_encode(vpx_writer *br, uint8_t *source); +void vpx_stop_encode(vpx_writer *br); static INLINE void vpx_write(vpx_writer *br, int bit, int probability) { unsigned int split; int count = br->count; unsigned int range = br->range; unsigned int lowvalue = br->lowvalue; - register int shift; + int shift; + +#if CONFIG_BITSTREAM_DEBUG + /* + int queue_r = 0; + int frame_idx_r = 0; + int queue_w = bitstream_queue_get_write(); + int frame_idx_w = bitstream_queue_get_frame_write(); + if (frame_idx_w == frame_idx_r && queue_w == queue_r) { + fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n", + frame_idx_w, queue_w); + assert(0); + } + */ + bitstream_queue_push(bit, probability); +#endif split = 1 + (((range - 1) * probability) >> 8); @@ -94,4 +114,4 @@ static INLINE void vpx_write_literal(vpx_writer *w, int data, int bits) { } // extern "C" #endif -#endif // VPX_DSP_BITWRITER_H_ +#endif // VPX_VPX_DSP_BITWRITER_H_ diff --git a/libs/libvpx/vpx_dsp/bitwriter_buffer.h b/libs/libvpx/vpx_dsp/bitwriter_buffer.h index a123a2fe8c9..3662cb64df6 100644 --- a/libs/libvpx/vpx_dsp/bitwriter_buffer.h +++ b/libs/libvpx/vpx_dsp/bitwriter_buffer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
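Taken together with the reader-side check added in bitreader.h above, these hooks form a lock-step cross-check: in CONFIG_BITSTREAM_DEBUG builds the encoder pushes every (bit, probability) pair it writes into a shared queue, the flush/padding writes are excluded via bitstream_queue_set_skip_write(), and the decoder pops and compares, asserting at the first mismatching symbol. The real queue lives in vpx_util/vpx_debug_util.h; a minimal sketch of the idea (fixed-size buffer and names are illustrative only):

#define DBG_QUEUE_MAX (1 << 16)
static int dbg_bit[DBG_QUEUE_MAX], dbg_prob[DBG_QUEUE_MAX];
static int dbg_w, dbg_r, dbg_skip;

/* encoder side: record each coded symbol unless flushing padding */
static void dbg_queue_push(int bit, int prob) {
  if (dbg_skip) return;
  dbg_bit[dbg_w] = bit;
  dbg_prob[dbg_w] = prob;
  dbg_w = (dbg_w + 1) % DBG_QUEUE_MAX;
}

/* decoder side: replay the expected symbol for comparison */
static void dbg_queue_pop(int *bit, int *prob) {
  *bit = dbg_bit[dbg_r];
  *prob = dbg_prob[dbg_r];
  dbg_r = (dbg_r + 1) % DBG_QUEUE_MAX;
}

The scheme assumes encode and decode run in the same process while debugging, so the first failed assert pinpoints exactly which symbol diverged, which is far cheaper than diffing whole reconstructed frames.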
*/
-#ifndef VPX_DSP_BITWRITER_BUFFER_H_
-#define VPX_DSP_BITWRITER_BUFFER_H_
+#ifndef VPX_VPX_DSP_BITWRITER_BUFFER_H_
+#define VPX_VPX_DSP_BITWRITER_BUFFER_H_
 #include "vpx/vpx_integer.h"
@@ -35,4 +35,4 @@ void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb, int data,
 } // extern "C"
 #endif
-#endif // VPX_DSP_BITWRITER_BUFFER_H_
+#endif // VPX_VPX_DSP_BITWRITER_BUFFER_H_
diff --git a/libs/libvpx/vpx_dsp/deblock.c b/libs/libvpx/vpx_dsp/deblock.c
index 94acbb39195..cc437af66cd 100644
--- a/libs/libvpx/vpx_dsp/deblock.c
+++ b/libs/libvpx/vpx_dsp/deblock.c
@@ -39,11 +39,10 @@ const int16_t vpx_rv[] = {
   9, 10, 13,
 };
-void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
-                                            unsigned char *dst_ptr,
-                                            int src_pixels_per_line,
-                                            int dst_pixels_per_line, int cols,
-                                            unsigned char *f, int size) {
+void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src,
+                                            unsigned char *dst, int src_pitch,
+                                            int dst_pitch, int cols,
+                                            unsigned char *flimits, int size) {
   unsigned char *p_src, *p_dst;
   int row;
   int col;
@@ -53,21 +52,28 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
   assert(size >= 8);
   assert(cols >= 8);
+#ifdef __clang_analyzer__
+  /* older clang analyzer doesn't understand asserts; keep this until we upgrade the analyzer */
+  if (cols < 8) return;
+#endif
+
   for (row = 0; row < size; row++) {
     /* post_proc_down for one row */
-    p_src = src_ptr;
-    p_dst = dst_ptr;
+    p_src = src;
+    p_dst = dst;
     for (col = 0; col < cols; col++) {
-      unsigned char p_above2 = p_src[col - 2 * src_pixels_per_line];
-      unsigned char p_above1 = p_src[col - src_pixels_per_line];
-      unsigned char p_below1 = p_src[col + src_pixels_per_line];
-      unsigned char p_below2 = p_src[col + 2 * src_pixels_per_line];
+      unsigned char p_above2 = p_src[col - 2 * src_pitch];
+      unsigned char p_above1 = p_src[col - src_pitch];
+      unsigned char p_below1 = p_src[col + src_pitch];
+      unsigned char p_below2 = p_src[col + 2 * src_pitch];
       v = p_src[col];
-      if ((abs(v - p_above2) < f[col]) && (abs(v - p_above1) < f[col]) &&
-          (abs(v - p_below1) < f[col]) && (abs(v - p_below2) < f[col])) {
+      if ((abs(v - p_above2) < flimits[col]) &&
+          (abs(v - p_above1) < flimits[col]) &&
+          (abs(v - p_below1) < flimits[col]) &&
+          (abs(v - p_below2) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_above2 + p_above1 + 1) >> 1;
         k2 = (p_below2 + p_below1 + 1) >> 1;
@@ -79,8 +85,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
     }
     /* now post_proc_across */
-    p_src = dst_ptr;
-    p_dst = dst_ptr;
+    p_src = dst;
+    p_dst = dst;
     p_src[-2] = p_src[-1] = p_src[0];
     p_src[cols] = p_src[cols + 1] = p_src[cols - 1];
@@ -88,10 +94,10 @@
     for (col = 0; col < cols; col++) {
       v = p_src[col];
-      if ((abs(v - p_src[col - 2]) < f[col]) &&
-          (abs(v - p_src[col - 1]) < f[col]) &&
-          (abs(v - p_src[col + 1]) < f[col]) &&
-          (abs(v - p_src[col + 2]) < f[col])) {
+      if ((abs(v - p_src[col - 2]) < flimits[col]) &&
+          (abs(v - p_src[col - 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 1]) < flimits[col]) &&
+          (abs(v - p_src[col + 2]) < flimits[col])) {
         unsigned char k1, k2, k3;
         k1 = (p_src[col - 2] + p_src[col - 1] + 1) >> 1;
         k2 = (p_src[col + 2] + p_src[col + 1] + 1) >> 1;
@@ -109,8 +115,8 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
       p_dst[col - 1] = d[(col - 1) & 3];
       /* next row */
-    src_ptr += src_pixels_per_line;
-    dst_ptr += dst_pixels_per_line;
+    src += src_pitch;
+    dst += dst_pitch;
   }
 }
diff --git a/libs/libvpx/vpx_dsp/fastssim.c
b/libs/libvpx/vpx_dsp/fastssim.c index 0469071a176..6ab6f557e25 100644 --- a/libs/libvpx/vpx_dsp/fastssim.c +++ b/libs/libvpx/vpx_dsp/fastssim.c @@ -128,10 +128,12 @@ static void fs_downsample_level(fs_ctx *_ctx, int _l) { int i1; i0 = 2 * i; i1 = FS_MINI(i0 + 1, w2); - dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] + - src1[j1offs + i0] + src1[j1offs + i1]; - dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] + - src2[j1offs + i0] + src2[j1offs + i1]; + dst1[j * w + i] = + (uint32_t)((int64_t)src1[j0offs + i0] + src1[j0offs + i1] + + src1[j1offs + i0] + src1[j1offs + i1]); + dst2[j * w + i] = + (uint32_t)((int64_t)src2[j0offs + i0] + src2[j0offs + i1] + + src2[j1offs + i0] + src2[j1offs + i1]); } } } @@ -220,12 +222,12 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { ssim = _ctx->level[_l].ssim; c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l)); for (j = 0; j < h; j++) { - unsigned mux; - unsigned muy; + int64_t mux; + int64_t muy; int i0; int i1; - mux = 5 * col_sums_x[0]; - muy = 5 * col_sums_y[0]; + mux = (int64_t)5 * col_sums_x[0]; + muy = (int64_t)5 * col_sums_y[0]; for (i = 1; i < 4; i++) { i1 = FS_MINI(i, w - 1); mux += col_sums_x[i1]; @@ -237,8 +239,8 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { if (i + 1 < w) { i0 = FS_MAXI(0, i - 4); i1 = FS_MINI(i + 4, w - 1); - mux += col_sums_x[i1] - col_sums_x[i0]; - muy += col_sums_x[i1] - col_sums_x[i0]; + mux += (int)col_sums_x[i1] - (int)col_sums_x[i0]; + muy += (int)col_sums_x[i1] - (int)col_sums_x[i0]; } } if (j + 1 < h) { @@ -246,8 +248,10 @@ static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) { for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i]; for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i]; j1offs = FS_MINI(j + 4, h - 1) * w; - for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i]; - for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i]; + for (i = 0; i < w; i++) + col_sums_x[i] = (uint32_t)((int64_t)col_sums_x[i] + im1[j1offs + i]); + for (i = 0; i < w; i++) + col_sums_y[i] = (uint32_t)((int64_t)col_sums_y[i] + im2[j1offs + i]); } } } @@ -343,18 +347,18 @@ static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) { for (j = 0; j < h + 4; j++) { if (j < h - 1) { for (i = 0; i < w - 1; i++) { - unsigned g1; - unsigned g2; - unsigned gx; - unsigned gy; - g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]); - g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]); + int64_t g1; + int64_t g2; + int64_t gx; + int64_t gy; + g1 = labs((int64_t)im1[(j + 1) * w + i + 1] - (int64_t)im1[j * w + i]); + g2 = labs((int64_t)im1[(j + 1) * w + i] - (int64_t)im1[j * w + i + 1]); gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]); - g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]); - gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2); - gx_buf[(j & 7) * stride + i + 4] = gx; - gy_buf[(j & 7) * stride + i + 4] = gy; + g1 = labs((int64_t)im2[(j + 1) * w + i + 1] - (int64_t)im2[j * w + i]); + g2 = labs((int64_t)im2[(j + 1) * w + i] - (int64_t)im2[j * w + i + 1]); + gy = ((int64_t)4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2)); + gx_buf[(j & 7) * stride + i + 4] = (uint32_t)gx; + gy_buf[(j & 7) * stride + i + 4] = (uint32_t)gy; } } else { memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf)); diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.c b/libs/libvpx/vpx_dsp/fwd_txfm.c index 6dcb3ba6681..ef66de0247a 100644 --- a/libs/libvpx/vpx_dsp/fwd_txfm.c +++ 
b/libs/libvpx/vpx_dsp/fwd_txfm.c @@ -87,11 +87,11 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { output[0] = sum * 2; } -void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; tran_low_t intermediate[64]; int pass; - tran_low_t *output = intermediate; + tran_low_t *out = intermediate; const tran_low_t *in = NULL; // Transform columns @@ -133,10 +133,10 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = (x0 - x1) * cospi_16_64; t2 = x2 * cospi_24_64 + x3 * cospi_8_64; t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; - output[0] = (tran_low_t)fdct_round_shift(t0); - output[2] = (tran_low_t)fdct_round_shift(t2); - output[4] = (tran_low_t)fdct_round_shift(t1); - output[6] = (tran_low_t)fdct_round_shift(t3); + out[0] = (tran_low_t)fdct_round_shift(t0); + out[2] = (tran_low_t)fdct_round_shift(t2); + out[4] = (tran_low_t)fdct_round_shift(t1); + out[6] = (tran_low_t)fdct_round_shift(t3); // Stage 2 t0 = (s6 - s5) * cospi_16_64; @@ -155,19 +155,19 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { t1 = x1 * cospi_12_64 + x2 * cospi_20_64; t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; - output[1] = (tran_low_t)fdct_round_shift(t0); - output[3] = (tran_low_t)fdct_round_shift(t2); - output[5] = (tran_low_t)fdct_round_shift(t1); - output[7] = (tran_low_t)fdct_round_shift(t3); - output += 8; + out[1] = (tran_low_t)fdct_round_shift(t0); + out[3] = (tran_low_t)fdct_round_shift(t2); + out[5] = (tran_low_t)fdct_round_shift(t1); + out[7] = (tran_low_t)fdct_round_shift(t3); + out += 8; } in = intermediate; - output = final_output; + out = output; } // Rows for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2; + for (j = 0; j < 8; ++j) output[j + i * 8] /= 2; } } @@ -705,9 +705,9 @@ void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) { output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -715,16 +715,16 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) - out[j + i * 32] = + output[j + i * 32] = (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2); } } @@ -732,9 +732,9 @@ void vpx_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { // Note that although we use dct_32_round in dct32 computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. 
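(A minimal sketch, not part of the patch: the biased round-then-shift used by the column pass above, (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2, rounds half-magnitudes away from zero symmetrically, which is what keeps the column results inside the 16-bit budget the note refers to. The helper name is hypothetical, and an arithmetic right shift of negative values is assumed, as libvpx assumes elsewhere.)

static int fdct_col_round_shift2_sketch(int v) {
  /* 10 -> 3, -10 -> -3, 9 -> 2, -9 -> -2: symmetric about zero */
  return (v + 1 + (v > 0)) >> 2;
}

The row pass flips the bias with (temp_out[j] < 0), so its ties round toward zero instead (10 -> 2, -10 -> -2).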
-void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { int i, j; - tran_high_t output[32 * 32]; + tran_high_t out[32 * 32]; // Columns for (i = 0; i < 32; ++i) { @@ -745,15 +745,15 @@ void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) { // TODO(cd): see quality impact of only doing // output[j * 32 + i] = (temp_out[j] + 1) >> 2; // PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c - output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; } // Rows for (i = 0; i < 32; ++i) { tran_high_t temp_in[32], temp_out[32]; - for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32]; + for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32]; vpx_fdct32(temp_in, temp_out, 1); - for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j]; + for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j]; } } @@ -772,14 +772,14 @@ void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, vpx_fdct4x4_c(input, output, stride); } -void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_c(input, final_output, stride); + vpx_fdct8x8_c(input, output, stride); } -void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output, +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct8x8_1_c(input, final_output, stride); + vpx_fdct8x8_1_c(input, output, stride); } void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, @@ -792,17 +792,18 @@ void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, vpx_fdct16x16_1_c(input, output, stride); } -void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) { - vpx_fdct32x32_c(input, out, stride); +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_c(input, output, stride); } -void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_rd_c(input, out, stride); + vpx_fdct32x32_rd_c(input, output, stride); } -void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *out, +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) { - vpx_fdct32x32_1_c(input, out, stride); + vpx_fdct32x32_1_c(input, output, stride); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/fwd_txfm.h b/libs/libvpx/vpx_dsp/fwd_txfm.h index 29e139c73ba..a43c8ea7f7f 100644 --- a/libs/libvpx/vpx_dsp/fwd_txfm.h +++ b/libs/libvpx/vpx_dsp/fwd_txfm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_FWD_TXFM_H_ -#define VPX_DSP_FWD_TXFM_H_ +#ifndef VPX_VPX_DSP_FWD_TXFM_H_ +#define VPX_VPX_DSP_FWD_TXFM_H_ #include "vpx_dsp/txfm_common.h" @@ -22,4 +22,4 @@ static INLINE tran_high_t fdct_round_shift(tran_high_t input) { } void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round); -#endif // VPX_DSP_FWD_TXFM_H_ +#endif // VPX_VPX_DSP_FWD_TXFM_H_ diff --git a/libs/libvpx/vpx_dsp/inv_txfm.c b/libs/libvpx/vpx_dsp/inv_txfm.c index 0194aa1e186..69de05e7188 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.c +++ b/libs/libvpx/vpx_dsp/inv_txfm.c @@ -67,11 +67,11 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; a1 = ip[0] >> UNIT_QUANT_SHIFT; @@ -1346,12 +1346,12 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, } } -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; - const tran_low_t *ip = in; + const tran_low_t *ip = input; tran_low_t *op = tmp; (void)bd; diff --git a/libs/libvpx/vpx_dsp/inv_txfm.h b/libs/libvpx/vpx_dsp/inv_txfm.h index 13137659fae..6eedbeac357 100644 --- a/libs/libvpx/vpx_dsp/inv_txfm.h +++ b/libs/libvpx/vpx_dsp/inv_txfm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_INV_TXFM_H_ -#define VPX_DSP_INV_TXFM_H_ +#ifndef VPX_VPX_DSP_INV_TXFM_H_ +#define VPX_VPX_DSP_INV_TXFM_H_ #include @@ -76,7 +76,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { // bd of 10 uses trans_low with 18bits, need to remove 14bits // bd of 12 uses trans_low with 20bits, need to remove 12bits // bd of x uses trans_low with 8+x bits, need to remove 24-x bits - #define WRAPLOW(x) ((((int32_t)check_range(x)) << 16) >> 16) #if CONFIG_VP9_HIGHBITDEPTH #define HIGHBD_WRAPLOW(x, bd) \ @@ -123,4 +122,4 @@ static INLINE uint8_t clip_pixel_add(uint8_t dest, tran_high_t trans) { } // extern "C" #endif -#endif // VPX_DSP_INV_TXFM_H_ +#endif // VPX_VPX_DSP_INV_TXFM_H_ diff --git a/libs/libvpx/vpx_dsp/loopfilter.c b/libs/libvpx/vpx_dsp/loopfilter.c index 9866ea37d6d..47f30c96aff 100644 --- a/libs/libvpx/vpx_dsp/loopfilter.c +++ b/libs/libvpx/vpx_dsp/loopfilter.c @@ -109,29 +109,30 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; } -void vpx_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, - const uint8_t *blimit, const uint8_t *limit, - const uint8_t *thresh) { +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); - filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p); + filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch); ++s; } } -void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -178,31 +179,33 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p); + filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, s - 2 * pitch, + s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, s + 3 * pitch); ++s; } } -void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { - vpx_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0); - vpx_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1); + vpx_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0); + vpx_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1); } void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, @@ -283,7 +286,8 @@ static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_horizontal_edge_w(uint8_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -291,34 +295,37 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
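(Illustrative aside: the pervasive rename of p to pitch in these loop-filter hunks spells out that the parameter is the row stride, so a horizontal-edge filter reaches its eight taps by stepping in multiples of pitch from the edge pointer s. The helper below is hypothetical, not a libvpx function.)

#include <stdint.h>

/* taps[0..3] = p3..p0 (rows above the edge), taps[4..7] = q0..q3 (below) */
static void gather_horizontal_taps_sketch(const uint8_t *s, int pitch,
                                          uint8_t taps[8]) {
  int i;
  for (i = 0; i < 8; ++i) taps[i] = s[(i - 4) * pitch];
}

A vertical edge reads its taps at unit offsets instead and advances by one row per iteration, which is why s += pitch is the only use of the stride inside mb_lpf_vertical_edge_w below.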
for (i = 0; i < 8 * count; ++i) { - const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint8_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint8_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3); const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); - const int8_t flat2 = - flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p]); - - filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, - s + 7 * p); + const int8_t flat2 = flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch]); + + filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, s + 2 * pitch, + s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, s + 6 * pitch, + s + 7 * pitch); ++s; } } -void vpx_lpf_horizontal_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1); } -void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); + mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2); } -static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, +static void mb_lpf_vertical_edge_w(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; @@ -335,18 +342,18 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); - s += p; + s += pitch; } } -void vpx_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8); } -void vpx_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { - mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16); + mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 16); } #if CONFIG_VP9_HIGHBITDEPTH @@ -440,7 +447,7 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift); } -void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; @@ -448,27 +455,28 @@ void 
vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd); + highbd_filter4(mask, *thresh, s - 2 * pitch, s - 1 * pitch, s, + s + 1 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_4_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_4_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_4_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -517,33 +525,36 @@ static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat, } } -void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { int i; // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
for (i = 0; i < 8; ++i) { - const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; - const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch], p2 = s[-3 * pitch], p1 = s[-2 * pitch], + p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch], q1 = s[1 * pitch], q2 = s[2 * pitch], + q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, - s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd); + highbd_filter8(mask, *thresh, flat, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, bd); ++s; } } void vpx_highbd_lpf_horizontal_8_dual_c( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd); - vpx_highbd_lpf_horizontal_8_c(s + 8, p, blimit1, limit1, thresh1, bd); + vpx_highbd_lpf_horizontal_8_c(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_c(s + 8, pitch, blimit1, limit1, thresh1, bd); } void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, @@ -639,7 +650,7 @@ static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat, } } -static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -649,44 +660,45 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. 
for (i = 0; i < 8 * count; ++i) { - const uint16_t p3 = s[-4 * p]; - const uint16_t p2 = s[-3 * p]; - const uint16_t p1 = s[-2 * p]; - const uint16_t p0 = s[-p]; - const uint16_t q0 = s[0 * p]; - const uint16_t q1 = s[1 * p]; - const uint16_t q2 = s[2 * p]; - const uint16_t q3 = s[3 * p]; + const uint16_t p3 = s[-4 * pitch]; + const uint16_t p2 = s[-3 * pitch]; + const uint16_t p1 = s[-2 * pitch]; + const uint16_t p0 = s[-pitch]; + const uint16_t q0 = s[0 * pitch]; + const uint16_t q1 = s[1 * pitch]; + const uint16_t q2 = s[2 * pitch]; + const uint16_t q3 = s[3 * pitch]; const int8_t mask = highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd); const int8_t flat = highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd); - const int8_t flat2 = - highbd_flat_mask5(1, s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, - s[4 * p], s[5 * p], s[6 * p], s[7 * p], bd); - - highbd_filter16(mask, *thresh, flat, flat2, s - 8 * p, s - 7 * p, s - 6 * p, - s - 5 * p, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, - s + 1 * p, s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, - s + 6 * p, s + 7 * p, bd); + const int8_t flat2 = highbd_flat_mask5( + 1, s[-8 * pitch], s[-7 * pitch], s[-6 * pitch], s[-5 * pitch], p0, q0, + s[4 * pitch], s[5 * pitch], s[6 * pitch], s[7 * pitch], bd); + + highbd_filter16(mask, *thresh, flat, flat2, s - 8 * pitch, s - 7 * pitch, + s - 6 * pitch, s - 5 * pitch, s - 4 * pitch, s - 3 * pitch, + s - 2 * pitch, s - 1 * pitch, s, s + 1 * pitch, + s + 2 * pitch, s + 3 * pitch, s + 4 * pitch, s + 5 * pitch, + s + 6 * pitch, s + 7 * pitch, bd); ++s; } } -void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd); } -void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); + highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 2, bd); } -static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, +static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count, @@ -712,20 +724,20 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, highbd_filter16(mask, *thresh, flat, flat2, s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7, bd); - s += p; + s += pitch; } } -void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 8, bd); } -void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { - highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); + highbd_mb_lpf_vertical_edge_w(s, pitch, blimit, limit, thresh, 
16, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c index 43d2c1146ec..97541411e41 100644 --- a/libs/libvpx/vpx_dsp/mips/add_noise_msa.c +++ b/libs/libvpx/vpx_dsp/mips/add_noise_msa.c @@ -9,7 +9,9 @@ */ #include -#include "./macros_msa.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" void vpx_plane_add_noise_msa(uint8_t *start_ptr, const int8_t *noise, int blackclamp, int whiteclamp, int width, diff --git a/libs/libvpx/vpx_dsp/mips/avg_msa.c b/libs/libvpx/vpx_dsp/mips/avg_msa.c index d0ac7b8e296..3fd18dec568 100644 --- a/libs/libvpx/vpx_dsp/mips/avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/avg_msa.c @@ -9,6 +9,7 @@ */ #include +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -56,6 +57,7 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } +#if !CONFIG_VP9_HIGHBITDEPTH void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, int16_t *dst) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; @@ -391,6 +393,7 @@ int vpx_satd_msa(const int16_t *data, int length) { return satd; } +#endif // !CONFIG_VP9_HIGHBITDEPTH void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height) { diff --git a/libs/libvpx/vpx_dsp/mips/common_dspr2.h b/libs/libvpx/vpx_dsp/mips/common_dspr2.h index 0a42f5cec21..87a5bbab561 100644 --- a/libs/libvpx/vpx_dsp/mips/common_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_COMMON_MIPS_DSPR2_H_ -#define VPX_COMMON_MIPS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ #include #include "./vpx_config.h" @@ -45,4 +45,4 @@ static INLINE void prefetch_store_streamed(unsigned char *dst) { } // extern "C" #endif -#endif // VPX_COMMON_MIPS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_COMMON_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index d9c2bef69ed..cc458c86182 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/mips/convolve_common_dspr2.h" #include "vpx_dsp/vpx_convolve.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 @@ -341,7 +342,7 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index fb68ad88139..7a9aa49d8a1 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/mips/convolve_common_dspr2.h" #include "vpx_dsp/vpx_convolve.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" #if HAVE_DSPR2 @@ -945,7 +946,7 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); - if (((const int32_t 
*)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c index 89f0f41962a..1e7052f6c58 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1322,7 +1322,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, if (filter_x[3] == 0x80) { copy_horiz_transposed(src - src_stride * 3, src_stride, temp, intermediate_height, w, intermediate_height); - } else if (((const int32_t *)filter_x)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp, intermediate_height, filter_x, w, intermediate_height); } else { @@ -1365,7 +1365,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, /* copy the src to dst */ if (filter_y[3] == 0x80) { copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w); - } else if (((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride, filter_y, h, w); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 77e95c84449..09d6f36e561 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -825,7 +825,7 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index c329f71ccf6..fd977b53360 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/libs/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -325,7 +325,7 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 48e440d73c5..14b65bc650c 100644 --- a/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ -#define VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ #include @@ -55,4 +55,4 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // VPX_DSP_MIPS_VPX_COMMON_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/deblock_msa.c b/libs/libvpx/vpx_dsp/mips/deblock_msa.c index aafa272fbdf..4e93ff594d0 100644 --- a/libs/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/libs/libvpx/vpx_dsp/mips/deblock_msa.c @@ -10,42 +10,42 @@ #include -#include "./macros_msa.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/mips/macros_msa.h" extern const int16_t vpx_rv[]; -#define VPX_TRANSPOSE8x16_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, out0, \ - out1, out2, out3, out4, out5, out6, out7, \ - out8, out9, out10, out11, out12, out13, out14, \ - out15) \ - { \ - v8i16 temp0, temp1, temp2, temp3, temp4; \ - v8i16 temp5, temp6, temp7, temp8, temp9; \ - \ - ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ - ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ - temp3); \ - ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out8, out10); \ - ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ - ILVRL_W2_UB(temp5, temp4, out12, out14); \ - out0 = (v16u8)temp6; \ - out2 = (v16u8)temp7; \ - out4 = (v16u8)temp8; \ - out6 = (v16u8)temp9; \ - out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ - out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ - out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ - out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ - out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ - out3 = (v16u8)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ - out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ - out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ +#define VPX_TRANSPOSE8x16_UB_UB( \ + in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, \ + out5, out6, out7, out8, out9, out10, out11, out12, out13, out14, out15) \ + { \ + v8i16 temp0, temp1, temp2, temp3, temp4; \ + v8i16 temp5, temp6, temp7, temp8, temp9; \ + \ + ILVR_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp6, temp7); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_SH(temp5, temp4, temp8, temp9); \ + ILVL_B4_SH(in1, in0, in3, in2, in5, in4, in7, in6, temp0, temp1, temp2, \ + temp3); \ + ILVR_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out8, out10); \ + ILVL_H2_SH(temp1, temp0, temp3, temp2, temp4, temp5); \ + ILVRL_W2_UB(temp5, temp4, out12, out14); \ + out0 = (v16u8)temp6; \ + out2 = (v16u8)temp7; \ + out4 = (v16u8)temp8; \ + out6 = (v16u8)temp9; \ + out9 = (v16u8)__msa_ilvl_d((v2i64)out8, (v2i64)out8); \ + out11 = (v16u8)__msa_ilvl_d((v2i64)out10, (v2i64)out10); \ + out13 = (v16u8)__msa_ilvl_d((v2i64)out12, (v2i64)out12); \ + out15 = (v16u8)__msa_ilvl_d((v2i64)out14, (v2i64)out14); \ + out1 = (v16u8)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \ + out3 = (v16u8)__msa_ilvl_d((v2i64)out2, 
(v2i64)out2); \ + out5 = (v16u8)__msa_ilvl_d((v2i64)out4, (v2i64)out4); \ + out7 = (v16u8)__msa_ilvl_d((v2i64)out6, (v2i64)out6); \ } #define VPX_AVER_IF_RETAIN(above2_in, above1_in, src_in, below1_in, below2_in, \ @@ -509,11 +509,11 @@ void vpx_post_proc_down_and_across_mb_row_msa(uint8_t *src, uint8_t *dst, } } -void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, - int32_t rows, int32_t cols, int32_t flimit) { +void vpx_mbpost_proc_across_ip_msa(uint8_t *src, int32_t pitch, int32_t rows, + int32_t cols, int32_t flimit) { int32_t row, col, cnt; - uint8_t *src_dup = src_ptr; - v16u8 src0, src, tmp_orig; + uint8_t *src_dup = src; + v16u8 src0, src1, tmp_orig; v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; @@ -532,13 +532,13 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src_dup[cols + 16] = src_dup[cols - 1]; tmp_orig = (v16u8)__msa_ldi_b(0); tmp_orig[15] = tmp[15]; - src = LD_UB(src_dup - 8); - src[15] = 0; - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup - 8); + src1[15] = 0; + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); sum_sq = HADD_SW_S32(src_r_w) + 16; - sum_h = __msa_hadd_u_h(src, src); + sum_h = __msa_hadd_u_h(src1, src1); sum = HADD_UH_U32(sum_h); { v16u8 src7, src8, src_r, src_l; @@ -567,8 +567,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, sum_l[cnt + 1] = sum_l[cnt] + sub_l[cnt + 1]; } sum = sum_l[7]; - src = LD_UB(src_dup + 16 * col); - ILVRL_B2_UH(zero, src, src_r_h, src_l_h); + src1 = LD_UB(src_dup + 16 * col); + ILVRL_B2_UH(zero, src1, src_r_h, src_l_h); src7 = (v16u8)((const8 + sum_r + (v8i16)src_r_h) >> 4); src8 = (v16u8)((const8 + sum_l + (v8i16)src_l_h) >> 4); tmp = (v16u8)__msa_pckev_b((v16i8)src8, (v16i8)src7); @@ -614,7 +614,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, total3 = (total3 < flimit_vec); PCKEV_H2_SH(total1, total0, total3, total2, mask0, mask1); mask = __msa_pckev_b((v16i8)mask1, (v16i8)mask0); - tmp = __msa_bmz_v(tmp, src, (v16u8)mask); + tmp = __msa_bmz_v(tmp, src1, (v16u8)mask); if (col == 0) { uint64_t src_d; diff --git a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c index 06fdc951e79..36583e2d240 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c +++ b/libs/libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" static void fdct8x32_1d_column_load_butterfly(const int16_t *input, diff --git a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h index fd589224d32..c0be56b8197 100644 --- a/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/fwd_txfm_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_MIPS_FWD_TXFM_MSA_H_ -#define VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ #include "vpx_dsp/mips/txfm_macros_msa.h" #include "vpx_dsp/txfm_common.h" @@ -361,4 +361,4 @@ void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride); void fdct16x8_1d_row(int16_t *input, int16_t *output); -#endif // VPX_DSP_MIPS_FWD_TXFM_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_FWD_TXFM_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c b/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c index 2a211c5677e..7ca61a28ec8 100644 --- a/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct16x16_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output) { diff --git a/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c b/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c index 2ea6136f9bc..053948183a0 100644 --- a/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct32x32_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" static void idct32x8_row_transpose_store(const int16_t *input, diff --git a/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c b/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c index 0a85742f106..56ffec3cba9 100644 --- a/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct4x4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, diff --git a/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c b/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c index 7f77d201918..a383ff20667 100644 --- a/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/idct8x8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/inv_txfm_msa.h" void vpx_idct8x8_64_add_msa(const int16_t *input, uint8_t *dst, diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index 27881f0db6c..cbea22f20f7 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ -#define VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ #include @@ -25,7 +25,6 @@ extern "C" { #if HAVE_DSPR2 #define DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input) \ ({ \ - \ int32_t tmp, out; \ int dct_cost_rounding = DCT_CONST_ROUNDING; \ int in = input; \ @@ -73,4 +72,4 @@ void iadst16_dspr2(const int16_t *input, int16_t *output); } // extern "C" #endif -#endif // VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h index 1fe9b28e8ad..3b66249ef2e 100644 --- a/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h +++ b/libs/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_MIPS_INV_TXFM_MSA_H_ -#define VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#define VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h" @@ -408,4 +408,4 @@ void vpx_idct16_1d_rows_msa(const int16_t *input, int16_t *output); void vpx_iadst16_1d_columns_addblk_msa(int16_t *input, uint8_t *dst, int32_t dst_stride); void vpx_iadst16_1d_rows_msa(const int16_t *input, int16_t *output); -#endif // VPX_DSP_MIPS_INV_TXFM_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_INV_TXFM_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h index 5b0c73345b7..ec339be8685 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ #include @@ -731,4 +731,4 @@ static INLINE void wide_mbfilter_dspr2( } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h index 38ed0b2a631..9af0b42360f 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_macros_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ #include @@ -432,4 +432,4 @@ extern "C" { } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h b/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h index ee111422668..24c492bea0d 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_masks_dspr2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ -#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ #include @@ -352,4 +352,4 @@ static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1, } // extern "C" #endif -#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_ diff --git a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h index 49fd74c25a4..1ea05e0b0b3 100644 --- a/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h +++ b/libs/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_LOOPFILTER_MSA_H_ -#define VPX_DSP_LOOPFILTER_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ +#define VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" @@ -174,4 +174,4 @@ mask_out = limit_in < (v16u8)mask_out; \ mask_out = __msa_xori_b(mask_out, 0xff); \ } -#endif /* VPX_DSP_LOOPFILTER_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_LOOPFILTER_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/macros_msa.h b/libs/libvpx/vpx_dsp/mips/macros_msa.h index f9a446e7bc1..a3a5a4dfeeb 100644 --- a/libs/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_MACROS_MSA_H_ -#define VPX_DSP_MIPS_MACROS_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_MACROS_MSA_H_ #include @@ -1966,4 +1966,4 @@ \ tmp1_m; \ }) -#endif /* VPX_DSP_MIPS_MACROS_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_MACROS_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/sad_mmi.c b/libs/libvpx/vpx_dsp/mips/sad_mmi.c index 33bd3fe7f92..4368db5fdb7 100644 --- a/libs/libvpx/vpx_dsp/mips/sad_mmi.c +++ b/libs/libvpx/vpx_dsp/mips/sad_mmi.c @@ -341,7 +341,7 @@ const uint8_t *ref_array, int ref_stride, \ uint32_t *sad_array) { \ int i; \ - for (i = 0; i < k; ++i) \ + for (i = 0; i < (k); ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ } diff --git a/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c b/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c index 313e06f92dd..572fcabfc04 100644 --- a/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c +++ b/libs/libvpx/vpx_dsp/mips/sub_pixel_variance_msa.c @@ -27,13 +27,14 @@ static const uint8_t bilinear_filters_msa[8][2] = { HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \ DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \ \ - sub += res_l0_m + res_l1_m; \ + (sub) += res_l0_m + res_l1_m; \ } -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, @@ -1619,16 +1620,16 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( #define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa( \ - const uint8_t *src, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \ + const uint8_t *src, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref, int32_t ref_stride, \ uint32_t *sse) { \ int32_t diff; \ uint32_t var; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \ src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \ } else { \ @@ -1638,7 +1639,7 @@ static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa( \ var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_sse_diff_##wd##width_h_msa( \ src, src_stride, ref, ref_stride, h_filter, ht, &diff); 
\ \ @@ -1672,15 +1673,15 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); #define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \ uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1690,7 +1691,7 @@ VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64); &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ @@ -1719,16 +1720,16 @@ VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32); uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, int32_t src_stride, - int32_t xoffset, int32_t yoffset, + int32_t x_offset, int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, uint32_t *sse, const uint8_t *sec_pred) { int32_t diff; - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; - if (yoffset) { - if (xoffset) { + if (y_offset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_hv_msa( src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, v_filter, 64, &diff); @@ -1738,7 +1739,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, v_filter, 64, &diff); } } else { - if (xoffset) { + if (x_offset) { *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, 64, &diff); @@ -1753,15 +1754,15 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, #define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \ uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa( \ - const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \ - int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \ + const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset, \ + int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride, \ uint32_t *sse, const uint8_t *sec_pred) { \ int32_t diff; \ - const uint8_t *h_filter = bilinear_filters_msa[xoffset]; \ - const uint8_t *v_filter = bilinear_filters_msa[yoffset]; \ + const uint8_t *h_filter = bilinear_filters_msa[x_offset]; \ + const uint8_t *v_filter = bilinear_filters_msa[y_offset]; \ \ - if (yoffset) { \ - if (xoffset) { \ + if (y_offset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \ v_filter, ht, &diff); \ @@ -1771,7 +1772,7 @@ uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr, &diff); \ } \ } else { \ - if (xoffset) { \ + if (x_offset) { \ *sse = sub_pixel_avg_sse_diff_64width_h_msa( \ src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \ &diff); \ diff --git 
a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h index f077fa4814a..f27504a2078 100644 --- a/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/libs/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ -#define VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ +#define VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" @@ -98,4 +98,4 @@ SRARI_W4_SW(m4_m, m5_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ PCKEV_H2_SH(m5_m, m4_m, tmp3_m, tmp2_m, out2, out3); \ } -#endif // VPX_DSP_MIPS_TXFM_MACROS_MIPS_MSA_H_ +#endif // VPX_VPX_DSP_MIPS_TXFM_MACROS_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/mips/variance_mmi.c b/libs/libvpx/vpx_dsp/mips/variance_mmi.c index 4af60d3634b..c1780c33afa 100644 --- a/libs/libvpx/vpx_dsp/mips/variance_mmi.c +++ b/libs/libvpx/vpx_dsp/mips/variance_mmi.c @@ -87,10 +87,10 @@ static const uint8_t bilinear_filters[8][2] = { "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" #define VARIANCE_SSE_8 \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" \ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ @@ -101,10 +101,10 @@ static const uint8_t bilinear_filters[8][2] = { #define VARIANCE_SSE_16 \ VARIANCE_SSE_8 \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" \ "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ @@ -115,11 +115,11 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ @@ -129,11 +129,11 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp4], %[ftmp4], 
%[filter_x0] \n\t" \ "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ @@ -169,12 +169,12 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ @@ -190,12 +190,12 @@ static const uint8_t bilinear_filters[8][2] = { #define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp8], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ @@ -258,12 +258,12 @@ static const uint8_t bilinear_filters[8][2] = { VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ \ /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ @@ -282,12 +282,12 @@ static const uint8_t bilinear_filters[8][2] = { VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ \ /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ - "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ - "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[src_ptr]) \n\t" \ "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ @@ -357,24 +357,23 @@ static const uint8_t bilinear_filters[8][2] = { // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. 
-static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -387,28 +386,27 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance64x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[12]; @@ -424,57 +422,57 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 
0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x27(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x2f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x37(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x3f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "mfc1 %[tmp1], %[ftmp9] \n\t" @@ -491,9 +489,10 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), [tmp2]"=&r"(tmp[2]), - [a]"+&r"(a), [b]"+&r"(b), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), [sum]"=&r"(sum) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -501,18 +500,19 @@ static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (64 * high)); } -#define VPX_VARIANCE64XN(n) \ - uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE64XN(64) VPX_VARIANCE64XN(32) -uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, uint32_t *sse) { +uint32_t vpx_variance32x64_mmi(const uint8_t *src_ptr, int src_stride, + const uint8_t 
*ref_ptr, int ref_stride, + uint32_t *sse) { int sum; double ftmp[12]; uint32_t tmp[3]; @@ -527,33 +527,33 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8_FOR_W64 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "mfc1 %[tmp1], %[ftmp9] \n\t" @@ -570,9 +570,10 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), [tmp2]"=&r"(tmp[2]), - [a]"+&r"(a), [b]"+&r"(b), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr), [sum]"=&r"(sum) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [sse]"r"(sse) : "memory" ); @@ -580,8 +581,8 @@ uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, return *sse - (((int64_t)sum * sum) / 2048); } -static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance32x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -598,30 +599,30 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 
%[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x17(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x1f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -646,8 +647,9 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -655,18 +657,18 @@ static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (32 * high)); } -#define VPX_VARIANCE32XN(n) \ - uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE32XN(32) VPX_VARIANCE32XN(16) -static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -683,20 +685,20 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 - "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu 
%[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -721,8 +723,9 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -730,19 +733,19 @@ static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (16 * high)); } -#define VPX_VARIANCE16XN(n) \ - uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE16XN(32) VPX_VARIANCE16XN(16) VPX_VARIANCE16XN(8) -static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[13]; @@ -759,15 +762,15 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -792,8 +795,9 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -801,19 +805,19 @@ static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (8 * high)); } -#define VPX_VARIANCE8XN(n) \ - uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *src_ptr, int 
src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE8XN(16) VPX_VARIANCE8XN(8) VPX_VARIANCE8XN(4) -static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +static inline uint32_t vpx_variance4x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, int high) { int sum; double ftmp[12]; @@ -830,15 +834,15 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[ref_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ref_ptr]) \n\t" VARIANCE_SSE_SUM_4 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" @@ -862,8 +866,9 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) : "memory" ); @@ -871,19 +876,19 @@ static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, return *sse - (((int64_t)sum * sum) / (4 * high)); } -#define VPX_VARIANCE4XN(n) \ - uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } VPX_VARIANCE4XN(8) VPX_VARIANCE4XN(4) -static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, uint32_t *sse, - uint64_t high) { +static inline uint32_t vpx_mse16x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { double ftmp[12]; uint32_t tmp[1]; @@ -900,8 +905,8 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, VARIANCE_SSE_16 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -914,8 +919,9 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + 
[ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -923,19 +929,19 @@ static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, return *sse; } -#define vpx_mse16xN(n) \ - uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } vpx_mse16xN(16); vpx_mse16xN(8); -static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, uint32_t *sse, - uint64_t high) { +static inline uint32_t vpx_mse8x(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sse, uint64_t high) { double ftmp[12]; uint32_t tmp[1]; @@ -952,8 +958,8 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, VARIANCE_SSE_8 "addiu %[tmp0], %[tmp0], -0x01 \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) - MMI_ADDU(%[b], %[b], %[b_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) + MMI_ADDU(%[ref_ptr], %[ref_ptr], %[ref_stride]) "bnez %[tmp0], 1b \n\t" "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" @@ -966,8 +972,9 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), - [a]"+&r"(a), [b]"+&r"(b) - : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [src_ptr]"+&r"(src_ptr), [ref_ptr]"+&r"(ref_ptr) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride), [high]"r"(&high), [sse]"r"(sse) : "memory" ); @@ -975,28 +982,29 @@ static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, return *sse; } -#define vpx_mse8xN(n) \ - uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + return vpx_mse8x(src_ptr, src_stride, ref_ptr, ref_stride, sse, n); \ } vpx_mse8xN(16); vpx_mse8xN(8); -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR(64, 64) @@ -1006,9 
+1014,10 @@ SUBPIX_VAR(32, 32) SUBPIX_VAR(32, 16) SUBPIX_VAR(16, 32) -static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_16x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; @@ -1016,8 +1025,8 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1031,26 +1040,26 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[15] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B // temp2: temp2[0] ~ temp2[15] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A // temp2+16*1: temp2[0] ~ temp2[15] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B @@ -1062,43 +1071,44 @@ static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), - [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR16XN(H) \ - uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[16 * H]; \ - var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + 
uint8_t temp2[16 * (H)]; \ + var_filter_block2d_bil_16x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR16XN(16) SUBPIX_VAR16XN(8) -static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_8x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[15]; mips_reg tmp[2]; DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1112,26 +1122,26 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[7] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B // temp2: temp2[0] ~ temp2[7] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A // temp2+8*1: temp2[0] ~ temp2[7] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B @@ -1143,44 +1153,45 @@ static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), - [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR8XN(H) \ - uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[8 * H]; \ - var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[8 * (H)]; \ + var_filter_block2d_bil_8x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR8XN(16) SUBPIX_VAR8XN(8) SUBPIX_VAR8XN(4) -static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - uint8_t *temp2, int counter) { +static inline void var_filter_block2d_bil_4x(const uint8_t *src_ptr, + int src_stride, int x_offset, + int y_offset, uint8_t *temp2, + int counter) { uint8_t *temp2_ptr = temp2; mips_reg l_counter = counter; double ftmp[7]; mips_reg tmp[2]; DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; - const uint8_t *filter_x = bilinear_filters[xoffset]; - const uint8_t *filter_y = bilinear_filters[yoffset]; + const uint8_t *filter_x = bilinear_filters[x_offset]; + const uint8_t *filter_y = bilinear_filters[y_offset]; __asm__ volatile ( "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" @@ -1193,26 +1204,26 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, // fdata3: fdata3[0] ~ fdata3[3] VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A - // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B // temp2: temp2[0] ~ temp2[7] VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A - // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] - MMI_ADDU(%[a], %[a], %[a_stride]) + // fdata3 +src_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A // temp2+4*1: temp2[0] ~ temp2[7] MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B "1: \n\t" - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A - MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_stride]) VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B @@ -1220,49 +1231,49 @@ static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, "bnez %[counter], 1b \n\t" : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), - [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [src_ptr] "+&r"(src_ptr), [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) : [filter_x0] "f"((uint64_t)filter_x[0]), [filter_x1] "f"((uint64_t)filter_x[1]), [filter_y0] "f"((uint64_t)filter_y[0]), [filter_y1] "f"((uint64_t)filter_y[1]), - [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [src_stride] "r"((mips_reg)src_stride), [ff_ph_40] "f"(ff_ph_40), [mask] "f"(mask) : "memory" ); } -#define SUBPIX_VAR4XN(H) \ - uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint8_t temp2[4 * H]; \ - var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ - (H - 2) / 2); \ - \ - return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ +#define SUBPIX_VAR4XN(H) \ + uint32_t 
vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint8_t temp2[4 * (H)]; \ + var_filter_block2d_bil_4x(src_ptr, src_stride, x_offset, y_offset, temp2, \ + ((H)-2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, ref_ptr, ref_stride, sse); \ } SUBPIX_VAR4XN(8) SUBPIX_VAR4XN(4) -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[((H) + 1) * (W)]; \ + uint8_t temp2[(H) * (W)]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[(H) * (W)]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, (H) + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, ref_ptr, ref_stride, sse); \ } SUBPIX_AVG_VAR(64, 64) diff --git a/libs/libvpx/vpx_dsp/mips/variance_msa.c b/libs/libvpx/vpx_dsp/mips/variance_msa.c index 49b2f99230f..444b086a6e8 100644 --- a/libs/libvpx/vpx_dsp/mips/variance_msa.c +++ b/libs/libvpx/vpx_dsp/mips/variance_msa.c @@ -33,10 +33,11 @@ sub += res_l0_m + res_l1_m; \ } -#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift) +#define VARIANCE_WxH(sse, diff, shift) \ + (sse) - (((uint32_t)(diff) * (diff)) >> (shift)) #define VARIANCE_LARGE_WxH(sse, diff, shift) \ - sse - (((int64_t)diff * diff) >> shift) + (sse) - (((int64_t)(diff) * (diff)) >> (shift)) static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index 187a013421a..5b5a1cbc3a5 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -658,7 +658,7 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor[cnt] = filter_x[cnt]; } - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { switch (w) { case 4: common_hz_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 5187cea21c9..ba816192a1f 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -538,8 +538,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const 
int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, @@ -571,8 +571,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, x_step_q4, y0_q4, y_step_q4, w, h); break; } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index ef8c9011406..e6a790dfc6d 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -625,7 +625,7 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 152dc26104e..792c0f709c4 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -634,7 +634,7 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor[cnt] = filter_x[cnt]; } - if (((const int32_t *)filter_x)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2) { switch (w) { case 4: common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c new file mode 100644 index 00000000000..ba9ceb86658 --- /dev/null +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx_ports/mem.h" + +#define GET_DATA_H_MMI \ "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \ "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \ "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \ "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \ "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \ "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \ "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \ "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \ "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \ "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \ "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \ "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t" + +#define GET_DATA_V_MMI \ "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \ "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \ "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \ "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \ "pmaddhw %[srch], %[srch], %[filter10] \n\t" \ "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \ "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \ "paddw %[srch], %[srch], %[ftmp12] \n\t" \ "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \ "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \ "paddw %[srch], %[srch], %[ftmp12] \n\t" \ "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \ "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \ "paddw %[srch], %[srch], %[ftmp12] \n\t" + +/* clang-format off */ +#define ROUND_POWER_OF_TWO_MMI \ /* Add para[0] */ \ "lw %[tmp0], 0x00(%[para]) \n\t" \ MMI_MTC1(%[tmp0], %[ftmp6]) \ "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \ "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \ "paddw %[srch], %[srch], %[ftmp6] \n\t" \ /* Arithmetic right shift para[1] bits */ \ "lw %[tmp0], 0x04(%[para]) \n\t" \ MMI_MTC1(%[tmp0], %[ftmp5]) \ "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \ "psraw %[srch], %[srch], %[ftmp5] \n\t" +/* clang-format on */ + +#define CLIP_PIXEL_MMI \ /* Saturated operation */ \ "packsswh %[srcl], %[srcl], %[srch] \n\t" \ "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t" + +static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[5]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + 
dst_stride -= w; + (void)x_step_q4; + + /* clang-format off */ + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + 
"gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], 
%[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int y0_q4, + int y_step_q4, int32_t w, int32_t h) { + const int16_t *filter_y = filter[y0_q4]; + double ftmp[16]; + uint32_t tmp[1]; + uint32_t para[2]; + ptrdiff_t addr = src_stride; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + src_stride -= w; + dst_stride -= w; + (void)y_step_q4; + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t" + "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t" + "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t" + "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t" + "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t" + "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + /* Get 8 data per column */ + "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t" + MMI_ADDU(%[tmp0], %[src], %[addr]) + "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t" + MMI_ADDU(%[tmp0], %[tmp0], %[addr]) + "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] 
\n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_V_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + MMI_SUBU(%[width], %[addr], %[src_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]), + [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]), + [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]), + [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]), + [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]), + [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]), + [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h), + [tmp0]"=&r"(tmp[0]) + : [filter]"r"(filter_y), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride), + [addr]"r"((mips_reg)addr) + : "memory" + ); +} + +void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + int x, y; + + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), 
[tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } +} + +static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (x = 0; x < w; ++x) { + int y_q4 = y0_q4; + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) + sum += src_y[k * src_stride] * y_filter[k]; + dst[y * dst_stride] = ROUND_POWER_OF_TWO( + dst[y * dst_stride] + + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), + 1); + y_q4 += y_step_q4; + } + ++src; + ++dst; + } +} + +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + +void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, + int32_t y_step_q4, int32_t w, int32_t h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. 
+  // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round up because the block may be located at a sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When called by the frame scaling function, the smallest scaling factor is + // x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is + // still big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w & 0x03) { + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, + 64, filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } else { + convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, + temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, + filter, y0_q4, y_step_q4, w, h); + } +} + +void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); + else + convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); + else + convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + +void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)x0_q4; + (void)x_step_q4; + if (w & 0x03) + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); + else + convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4, + y_step_q4, w, h); +} + +void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, + int32_t w, int32_t h) { + // Fixed size
intermediate buffer places limits on parameters. + DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); + assert(w <= 64); + assert(h <= 64); + + vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); +} diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index d35a5a7a639..c942167587b 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -558,8 +558,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_x)[0] == 0 && - ((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_x) == 2 && + vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_hv_2ht_2vt_4w_msa(src, (int32_t)src_stride, dst, @@ -591,8 +591,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, x_step_q4, y0_q4, y_step_q4, w, h); break; } - } else if (((const int32_t *)filter_x)[0] == 0 || - ((const int32_t *)filter_y)[0] == 0) { + } else if (vpx_get_filter_taps(filter_x) == 2 || + vpx_get_filter_taps(filter_y) == 2) { vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 13fce0077c9..195228689e0 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -641,7 +641,7 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver[cnt] = filter_y[cnt]; } - if (((const int32_t *)filter_y)[0] == 0) { + if (vpx_get_filter_taps(filter_y) == 2) { switch (w) { case 4: common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride, diff --git a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index d53244596be..a0280c5434b 100644 --- a/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/libs/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ -#define VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#ifndef VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ +#define VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ #include "vpx_dsp/mips/macros_msa.h" #include "vpx_dsp/vpx_filter.h" @@ -119,4 +119,4 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } -#endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ +#endif // VPX_VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ diff --git a/libs/libvpx/vpx_dsp/postproc.h b/libs/libvpx/vpx_dsp/postproc.h index 43cb5c8e8de..37f993f8147 100644 --- a/libs/libvpx/vpx_dsp/postproc.h +++ b/libs/libvpx/vpx_dsp/postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_POSTPROC_H_ -#define VPX_DSP_POSTPROC_H_ +#ifndef VPX_VPX_DSP_POSTPROC_H_ +#define VPX_VPX_DSP_POSTPROC_H_ #ifdef __cplusplus extern "C" { @@ -22,4 +22,4 @@ int vpx_setup_noise(double sigma, int8_t *noise, int size); } #endif -#endif // VPX_DSP_POSTPROC_H_ +#endif // VPX_VPX_DSP_POSTPROC_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h index 2c5d9a4f6a5..7ac873f9fc0 100644 --- a/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h +++ b/libs/libvpx/vpx_dsp/ppc/bitdepth_conversion_vsx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ -#define VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#ifndef VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#define VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -44,4 +44,4 @@ static INLINE void store_tran_low(int16x8_t v, int32_t c, tran_low_t *s) { #endif } -#endif // VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ +#endif // VPX_VPX_DSP_PPC_BITDEPTH_CONVERSION_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/deblock_vsx.c b/libs/libvpx/vpx_dsp/ppc/deblock_vsx.c new file mode 100644 index 00000000000..21299116966 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/deblock_vsx.c @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +extern const int16_t vpx_rv[]; + +static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, + 0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B, + 0x1C, 0x1D, 0x1E, 0x1F }; + +static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v, + uint8x16_t filter) { + const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]); + const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]); + const uint8x16_t k3 = vec_avg(k1, k2); + const uint8x16_t f_a = vec_max(vec_absd(v, ctx[0]), vec_absd(v, ctx[1])); + const uint8x16_t f_b = vec_max(vec_absd(v, ctx[2]), vec_absd(v, ctx[3])); + const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter); + return vec_sel(v, vec_avg(k3, v), mask); +} + +static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src, + int stride) { + ctx[0] = vec_vsx_ld(col - 2 * stride, src); + ctx[1] = vec_vsx_ld(col - stride, src); + ctx[2] = vec_vsx_ld(col + stride, src); + ctx[3] = vec_vsx_ld(col + 2 * stride, src); +} + +static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx, + uint8x16_t v, uint8x16_t right_ctx) { + static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x1A, 0x1B, 0x1C, 0x1D }; + + static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, + 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, + 0x1B, 0x1C, 0x1D, 0x1E }; + + static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, + 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, + 0x0D, 0x0E, 0x0F, 0x10 }; + + static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, + 0x0E, 0x0F, 0x10, 0x11 }; + ctx[0] = vec_perm(left_ctx, v, l2_perm); + ctx[1] = vec_perm(left_ctx, v, l1_perm); + ctx[2] = vec_perm(v, right_ctx, r1_perm); + ctx[3] = vec_perm(v, right_ctx, r2_perm); +} +void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr, + unsigned char *dst_ptr, + int src_pixels_per_line, + int dst_pixels_per_line, int cols, + unsigned char *f, int size) { + int row, col; + uint8x16_t ctx[4], out, v, left_ctx; + + for (row = 0; row < size; row++) { + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + v = vec_vsx_ld(col, src_ptr); + vert_ctx(ctx, col, src_ptr, src_pixels_per_line); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + /* now post_proc_across */ + left_ctx = vec_splats(dst_ptr[0]); + v = vec_vsx_ld(0, dst_ptr); + for (col = 0; col < cols - 8; col += 16) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = (col + 16 == cols) + ? vec_splats(dst_ptr[cols - 1]) + : vec_vsx_ld(col, dst_ptr + 16); + horz_ctx(ctx, left_ctx, v, right_ctx); + vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr); + left_ctx = v; + v = right_ctx; + } + + if (col != cols) { + const uint8x16_t filter = vec_vsx_ld(col, f); + const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]); + horz_ctx(ctx, left_ctx, v, right_ctx); + out = apply_filter(ctx, v, filter); + vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr); + } + + src_ptr += src_pixels_per_line; + dst_ptr += dst_pixels_per_line; + } +} + +// C: s[c + 7] +static INLINE int16x8_t next7l_s16(uint8x16_t c) { + static const uint8x16_t next7_perm = { + 0x07, 0x10, 0x08, 0x11, 0x09, 0x12, 0x0A, 0x13, + 0x0B, 0x14, 0x0C, 0x15, 0x0D, 0x16, 0x0E, 0x17, + }; + return (int16x8_t)vec_perm(c, vec_zeros_u8, next7_perm); +} + +// Slide across window and add. +static INLINE int16x8_t slide_sum_s16(int16x8_t x) { + // x = A B C D E F G H + // + // 0 A B C D E F G + const int16x8_t sum1 = vec_add(x, vec_slo(x, vec_splats((int8_t)(2 << 3)))); + // 0 0 A B C D E F + const int16x8_t sum2 = vec_add(vec_slo(x, vec_splats((int8_t)(4 << 3))), + // 0 0 0 A B C D E + vec_slo(x, vec_splats((int8_t)(6 << 3)))); + // 0 0 0 0 A B C D + const int16x8_t sum3 = vec_add(vec_slo(x, vec_splats((int8_t)(8 << 3))), + // 0 0 0 0 0 A B C + vec_slo(x, vec_splats((int8_t)(10 << 3)))); + // 0 0 0 0 0 0 A B + const int16x8_t sum4 = vec_add(vec_slo(x, vec_splats((int8_t)(12 << 3))), + // 0 0 0 0 0 0 0 A + vec_slo(x, vec_splats((int8_t)(14 << 3)))); + return vec_add(vec_add(sum1, sum2), vec_add(sum3, sum4)); +} + +// Slide across window and add. 
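+// The squared terms arrive in two vectors (vec_mule keeps the even 16-bit +// lanes, vec_mulo the odd ones), so both are shifted in lockstep and +// accumulated into the running 32-bit totals.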
+static INLINE int32x4_t slide_sumsq_s32(int32x4_t xsq_even, int32x4_t xsq_odd) { + // 0 A C E + // + 0 B D F + int32x4_t sumsq_1 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(4 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(4 << 3)))); + // 0 0 A C + // + 0 0 B D + int32x4_t sumsq_2 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(8 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(8 << 3)))); + // 0 0 0 A + // + 0 0 0 B + int32x4_t sumsq_3 = vec_add(vec_slo(xsq_even, vec_splats((int8_t)(12 << 3))), + vec_slo(xsq_odd, vec_splats((int8_t)(12 << 3)))); + sumsq_1 = vec_add(sumsq_1, xsq_even); + sumsq_2 = vec_add(sumsq_2, sumsq_3); + return vec_add(sumsq_1, sumsq_2); +} + +// C: (b + sum + val) >> 4 +static INLINE int16x8_t filter_s16(int16x8_t b, int16x8_t sum, int16x8_t val) { + return vec_sra(vec_add(vec_add(b, sum), val), vec_splats((uint16_t)4)); +} + +// C: sumsq * 15 - sum * sum +static INLINE bool16x8_t mask_s16(int32x4_t sumsq_even, int32x4_t sumsq_odd, + int16x8_t sum, int32x4_t lim) { + static const uint8x16_t mask_merge = { 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, + 0x14, 0x15, 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + const int32x4_t sumsq_odd_scaled = + vec_mul(sumsq_odd, vec_splats((int32_t)15)); + const int32x4_t sumsq_even_scaled = + vec_mul(sumsq_even, vec_splats((int32_t)15)); + const int32x4_t thres_odd = vec_sub(sumsq_odd_scaled, vec_mulo(sum, sum)); + const int32x4_t thres_even = vec_sub(sumsq_even_scaled, vec_mule(sum, sum)); + + const bool32x4_t mask_odd = vec_cmplt(thres_odd, lim); + const bool32x4_t mask_even = vec_cmplt(thres_even, lim); + return vec_perm((bool16x8_t)mask_even, (bool16x8_t)mask_odd, mask_merge); +} + +void vpx_mbpost_proc_across_ip_vsx(unsigned char *src, int pitch, int rows, + int cols, int flimit) { + int row, col; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + + for (row = 0; row < rows; row++) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + // Fill left context with first col. + int16x8_t left_ctx = vec_splats((int16_t)src[0]); + int16_t s = src[0] * 9; + int32_t ssq = src[0] * src[0] * 9 + 16; + + // Fill the next 6 columns of the sliding window with cols 2 to 7. + for (col = 1; col <= 6; ++col) { + s += src[col]; + ssq += src[col] * src[col]; + } + // Set this sum to every element in the window. + sum = vec_splats(s); + sumsq_even = vec_splats(ssq); + sumsq_odd = vec_splats(ssq); + + for (col = 0; col < cols; col += 8) { + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const uint8x16_t val = vec_vsx_ld(0, src + col); + const int16x8_t val_high = unpack_to_s16_h(val); + + // C: s[c + 7] + const int16x8_t right_ctx = (col + 8 == cols) + ? 
vec_splats((int16_t)src[col + 7]) + : next7l_s16(val); + + // C: x = s[c + 7] - s[c - 8]; + const int16x8_t x = vec_sub(right_ctx, left_ctx); + const int32x4_t xsq_even = + vec_sub(vec_mule(right_ctx, right_ctx), vec_mule(left_ctx, left_ctx)); + const int32x4_t xsq_odd = + vec_sub(vec_mulo(right_ctx, right_ctx), vec_mulo(left_ctx, left_ctx)); + + const int32x4_t sumsq_tmp = slide_sumsq_s32(xsq_even, xsq_odd); + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_even = vec_add(sumsq_even, sumsq_tmp); + // B D F G + // A C E G + // 0 B D F + // 0 A C E + // 0 0 B D + // 0 0 A C + // 0 0 0 B + // 0 0 0 A + sumsq_odd = vec_add(sumsq_odd, vec_add(sumsq_tmp, xsq_odd)); + + sum = vec_add(sum, slide_sum_s16(x)); + + // C: (8 + sum + s[c]) >> 4 + filtered = filter_s16(vec_splats((int16_t)8), sum, val_high); + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(val_high, filtered, mask); + + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, src + col), load_merge); + vec_vsx_st(out, 0, src + col); + + // Update window sum and square sum + sum = vec_splat(sum, 7); + sumsq_even = vec_splat(sumsq_odd, 3); + sumsq_odd = vec_splat(sumsq_odd, 3); + + // C: s[c - 8] (for next iteration) + left_ctx = val_high; + } + src += pitch; + } +} + +void vpx_mbpost_proc_down_vsx(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int col, row, i; + int16x8_t window[16]; + const int32x4_t lim = vec_splats(flimit); + + // 8 columns are processed at a time. + assert(cols % 8 == 0); + // If rows is less than 8 the bottom border extension fails. + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + // The sum is signed and requires at most 13 bits. + // (8 bits + sign) * 15 (4 bits) + int16x8_t r1, sum; + // The sum of squares requires at most 20 bits. + // (16 bits + sign) * 15 (4 bits) + int32x4_t sumsq_even, sumsq_odd; + + r1 = unpack_to_s16_h(vec_vsx_ld(0, dst)); + // Fill sliding window with first row. + for (i = 0; i <= 8; i++) { + window[i] = r1; + } + // First 9 rows of the sliding window are the same. + // sum = r1 * 9 + sum = vec_mladd(r1, vec_splats((int16_t)9), vec_zeros_s16); + + // sumsq = r1 * r1 * 9 + sumsq_even = vec_mule(sum, r1); + sumsq_odd = vec_mulo(sum, r1); + + // Fill the next 6 rows of the sliding window with rows 2 to 7. + for (i = 1; i <= 6; ++i) { + const int16x8_t next_row = unpack_to_s16_h(vec_vsx_ld(i * pitch, dst)); + window[i + 8] = next_row; + sum = vec_add(sum, next_row); + sumsq_odd = vec_add(sumsq_odd, vec_mulo(next_row, next_row)); + sumsq_even = vec_add(sumsq_even, vec_mule(next_row, next_row)); + } + + for (row = 0; row < rows; row++) { + int32x4_t d15_even, d15_odd, d0_even, d0_odd; + bool16x8_t mask; + int16x8_t filtered, masked; + uint8x16_t out; + + const int16x8_t rv = vec_vsx_ld(0, vpx_rv + (row & 127)); + + // Move the sliding window + if (row + 7 < rows) { + window[15] = unpack_to_s16_h(vec_vsx_ld((row + 7) * pitch, dst)); + } else { + window[15] = window[14]; + } + + // C: sum += s[7 * pitch] - s[-8 * pitch]; + sum = vec_add(sum, vec_sub(window[15], window[0])); + + // C: sumsq += s[7 * pitch] * s[7 * pitch] - s[-8 * pitch] * s[-8 * + // pitch]; + // Optimization Note: Caching a squared-window for odd and even is + // slower than just repeating the multiplies. 
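+      // Square the rows entering (window[15]) and leaving (window[0]) the +      // window; vec_mule/vec_mulo keep the even and odd 16-bit lanes in +      // separate 32-bit vectors.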
+      d15_odd = vec_mulo(window[15], window[15]); + d15_even = vec_mule(window[15], window[15]); + d0_odd = vec_mulo(window[0], window[0]); + d0_even = vec_mule(window[0], window[0]); + sumsq_odd = vec_add(sumsq_odd, vec_sub(d15_odd, d0_odd)); + sumsq_even = vec_add(sumsq_even, vec_sub(d15_even, d0_even)); + + // C: (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4 + filtered = filter_s16(rv, sum, window[8]); + + // C: sumsq * 15 - sum * sum + mask = mask_s16(sumsq_even, sumsq_odd, sum, lim); + masked = vec_sel(window[8], filtered, mask); + + // TODO(ltrudeau) If cols % 16 == 0, we could just process 16 per + // iteration. + out = vec_perm((uint8x16_t)masked, vec_vsx_ld(0, dst + row * pitch), + load_merge); + vec_vsx_st(out, 0, dst + row * pitch); + + // Optimization Note: Turns out that the following loop is faster than + // using pointers to manage the sliding window. + for (i = 1; i < 16; i++) { + window[i - 1] = window[i]; + } + } + dst += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c b/libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c new file mode 100644 index 00000000000..328b0e31301 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/fdct32x32_vsx.c @@ -0,0 +1,553 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/ppc/transpose_vsx.h" +#include "vpx_dsp/ppc/txfm_common_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Returns ((a +/- b) * cospi16 + (1 << 13)) >> 14. +static INLINE void single_butterfly(int16x8_t a, int16x8_t b, int16x8_t *add, + int16x8_t *sub) { + // Since a + b can overflow 16 bits, the multiplication is distributed + // (a * c +/- b * c). + const int32x4_t ac_e = vec_mule(a, cospi16_v); + const int32x4_t ac_o = vec_mulo(a, cospi16_v); + const int32x4_t bc_e = vec_mule(b, cospi16_v); + const int32x4_t bc_o = vec_mulo(b, cospi16_v); + + // Reuse the same multiplies for sum and difference. + const int32x4_t sum_e = vec_add(ac_e, bc_e); + const int32x4_t sum_o = vec_add(ac_o, bc_o); + const int32x4_t diff_e = vec_sub(ac_e, bc_e); + const int32x4_t diff_o = vec_sub(ac_o, bc_o); + + // Add rounding offset + const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding); + const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding); + const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding); + const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding); + + const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits); + const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits); + const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits); + const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits); + + // There's no pack operation for even and odd, so we need to permute.
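+  // vec_perm_odd_even_pack re-interleaves the even- and odd-lane results +  // into a single vector of eight 16-bit coefficients.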
+  *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack); + *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack); +} + +// Returns (a * c1 +/- b * c2 + (1 << 13)) >> 14. +static INLINE void double_butterfly(int16x8_t a, int16x8_t c1, int16x8_t b, + int16x8_t c2, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t ac1_o = vec_mulo(a, c1); + const int32x4_t ac1_e = vec_mule(a, c1); + const int32x4_t ac2_o = vec_mulo(a, c2); + const int32x4_t ac2_e = vec_mule(a, c2); + + const int32x4_t bc1_o = vec_mulo(b, c1); + const int32x4_t bc1_e = vec_mule(b, c1); + const int32x4_t bc2_o = vec_mulo(b, c2); + const int32x4_t bc2_e = vec_mule(b, c2); + + const int32x4_t sum_o = vec_add(ac1_o, bc2_o); + const int32x4_t sum_e = vec_add(ac1_e, bc2_e); + const int32x4_t diff_o = vec_sub(ac2_o, bc1_o); + const int32x4_t diff_e = vec_sub(ac2_e, bc1_e); + + // Add rounding offset + const int32x4_t rsum_o = vec_add(sum_o, vec_dct_const_rounding); + const int32x4_t rsum_e = vec_add(sum_e, vec_dct_const_rounding); + const int32x4_t rdiff_o = vec_add(diff_o, vec_dct_const_rounding); + const int32x4_t rdiff_e = vec_add(diff_e, vec_dct_const_rounding); + + const int32x4_t ssum_o = vec_sra(rsum_o, vec_dct_const_bits); + const int32x4_t ssum_e = vec_sra(rsum_e, vec_dct_const_bits); + const int32x4_t sdiff_o = vec_sra(rdiff_o, vec_dct_const_bits); + const int32x4_t sdiff_e = vec_sra(rdiff_e, vec_dct_const_bits); + + // There's no pack operation for even and odd, so we need to permute. + *add = (int16x8_t)vec_perm(ssum_e, ssum_o, vec_perm_odd_even_pack); + *sub = (int16x8_t)vec_perm(sdiff_e, sdiff_o, vec_perm_odd_even_pack); +} + +// While other architectures combine the load and the stage 1 operations, +// Power9 benchmarking shows no benefit in such an approach. +static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + // Tried out different combinations of load and shift instructions; this is + // the fastest one.
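+  // The left shift by vec_dct_scale_log2 folds the forward-DCT input +  // scaling (the input * 4 step of the C reference) into the load.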
+ { + const int16x8_t l0 = vec_vsx_ld(0, a); + const int16x8_t l1 = vec_vsx_ld(0, a + stride); + const int16x8_t l2 = vec_vsx_ld(0, a + 2 * stride); + const int16x8_t l3 = vec_vsx_ld(0, a + 3 * stride); + const int16x8_t l4 = vec_vsx_ld(0, a + 4 * stride); + const int16x8_t l5 = vec_vsx_ld(0, a + 5 * stride); + const int16x8_t l6 = vec_vsx_ld(0, a + 6 * stride); + const int16x8_t l7 = vec_vsx_ld(0, a + 7 * stride); + + const int16x8_t l8 = vec_vsx_ld(0, a + 8 * stride); + const int16x8_t l9 = vec_vsx_ld(0, a + 9 * stride); + const int16x8_t l10 = vec_vsx_ld(0, a + 10 * stride); + const int16x8_t l11 = vec_vsx_ld(0, a + 11 * stride); + const int16x8_t l12 = vec_vsx_ld(0, a + 12 * stride); + const int16x8_t l13 = vec_vsx_ld(0, a + 13 * stride); + const int16x8_t l14 = vec_vsx_ld(0, a + 14 * stride); + const int16x8_t l15 = vec_vsx_ld(0, a + 15 * stride); + + b[0] = vec_sl(l0, vec_dct_scale_log2); + b[1] = vec_sl(l1, vec_dct_scale_log2); + b[2] = vec_sl(l2, vec_dct_scale_log2); + b[3] = vec_sl(l3, vec_dct_scale_log2); + b[4] = vec_sl(l4, vec_dct_scale_log2); + b[5] = vec_sl(l5, vec_dct_scale_log2); + b[6] = vec_sl(l6, vec_dct_scale_log2); + b[7] = vec_sl(l7, vec_dct_scale_log2); + + b[8] = vec_sl(l8, vec_dct_scale_log2); + b[9] = vec_sl(l9, vec_dct_scale_log2); + b[10] = vec_sl(l10, vec_dct_scale_log2); + b[11] = vec_sl(l11, vec_dct_scale_log2); + b[12] = vec_sl(l12, vec_dct_scale_log2); + b[13] = vec_sl(l13, vec_dct_scale_log2); + b[14] = vec_sl(l14, vec_dct_scale_log2); + b[15] = vec_sl(l15, vec_dct_scale_log2); + } + { + const int16x8_t l16 = vec_vsx_ld(0, a + 16 * stride); + const int16x8_t l17 = vec_vsx_ld(0, a + 17 * stride); + const int16x8_t l18 = vec_vsx_ld(0, a + 18 * stride); + const int16x8_t l19 = vec_vsx_ld(0, a + 19 * stride); + const int16x8_t l20 = vec_vsx_ld(0, a + 20 * stride); + const int16x8_t l21 = vec_vsx_ld(0, a + 21 * stride); + const int16x8_t l22 = vec_vsx_ld(0, a + 22 * stride); + const int16x8_t l23 = vec_vsx_ld(0, a + 23 * stride); + + const int16x8_t l24 = vec_vsx_ld(0, a + 24 * stride); + const int16x8_t l25 = vec_vsx_ld(0, a + 25 * stride); + const int16x8_t l26 = vec_vsx_ld(0, a + 26 * stride); + const int16x8_t l27 = vec_vsx_ld(0, a + 27 * stride); + const int16x8_t l28 = vec_vsx_ld(0, a + 28 * stride); + const int16x8_t l29 = vec_vsx_ld(0, a + 29 * stride); + const int16x8_t l30 = vec_vsx_ld(0, a + 30 * stride); + const int16x8_t l31 = vec_vsx_ld(0, a + 31 * stride); + + b[16] = vec_sl(l16, vec_dct_scale_log2); + b[17] = vec_sl(l17, vec_dct_scale_log2); + b[18] = vec_sl(l18, vec_dct_scale_log2); + b[19] = vec_sl(l19, vec_dct_scale_log2); + b[20] = vec_sl(l20, vec_dct_scale_log2); + b[21] = vec_sl(l21, vec_dct_scale_log2); + b[22] = vec_sl(l22, vec_dct_scale_log2); + b[23] = vec_sl(l23, vec_dct_scale_log2); + + b[24] = vec_sl(l24, vec_dct_scale_log2); + b[25] = vec_sl(l25, vec_dct_scale_log2); + b[26] = vec_sl(l26, vec_dct_scale_log2); + b[27] = vec_sl(l27, vec_dct_scale_log2); + b[28] = vec_sl(l28, vec_dct_scale_log2); + b[29] = vec_sl(l29, vec_dct_scale_log2); + b[30] = vec_sl(l30, vec_dct_scale_log2); + b[31] = vec_sl(l31, vec_dct_scale_log2); + } +} + +static INLINE void store(tran_low_t *a, const int16x8_t *b) { + vec_vsx_st(b[0], 0, a); + vec_vsx_st(b[8], 0, a + 8); + vec_vsx_st(b[16], 0, a + 16); + vec_vsx_st(b[24], 0, a + 24); + + vec_vsx_st(b[1], 0, a + 32); + vec_vsx_st(b[9], 0, a + 40); + vec_vsx_st(b[17], 0, a + 48); + vec_vsx_st(b[25], 0, a + 56); + + vec_vsx_st(b[2], 0, a + 64); + vec_vsx_st(b[10], 0, a + 72); + vec_vsx_st(b[18], 0, a + 80); + 
vec_vsx_st(b[26], 0, a + 88); + + vec_vsx_st(b[3], 0, a + 96); + vec_vsx_st(b[11], 0, a + 104); + vec_vsx_st(b[19], 0, a + 112); + vec_vsx_st(b[27], 0, a + 120); + + vec_vsx_st(b[4], 0, a + 128); + vec_vsx_st(b[12], 0, a + 136); + vec_vsx_st(b[20], 0, a + 144); + vec_vsx_st(b[28], 0, a + 152); + + vec_vsx_st(b[5], 0, a + 160); + vec_vsx_st(b[13], 0, a + 168); + vec_vsx_st(b[21], 0, a + 176); + vec_vsx_st(b[29], 0, a + 184); + + vec_vsx_st(b[6], 0, a + 192); + vec_vsx_st(b[14], 0, a + 200); + vec_vsx_st(b[22], 0, a + 208); + vec_vsx_st(b[30], 0, a + 216); + + vec_vsx_st(b[7], 0, a + 224); + vec_vsx_st(b[15], 0, a + 232); + vec_vsx_st(b[23], 0, a + 240); + vec_vsx_st(b[31], 0, a + 248); +} + +// Returns 1 if negative, 0 if positive. +static INLINE int16x8_t vec_sign_s16(int16x8_t a) { + return vec_sr(a, vec_shift_sign_s16); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const int16x8_t sign = vec_sign_s16(a); + return vec_sra(vec_sub(vec_add(a, vec_twos_s16), sign), vec_dct_scale_log2); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t sign = vec_sign_s16(a); + return vec_sra(vec_add(vec_add(a, vec_ones_s16), sign), vec_dct_scale_log2); +} + +static void fdct32_vsx(const int16x8_t *in, int16x8_t *out, int pass) { + int16x8_t temp0[32]; // Hold stages: 1, 4, 7 + int16x8_t temp1[32]; // Hold stages: 2, 5 + int16x8_t temp2[32]; // Hold stages: 3, 6 + int i; + + // Stage 1 + // Unrolling this loop actually slows down Power9 benchmarks + for (i = 0; i < 16; i++) { + temp0[i] = vec_add(in[i], in[31 - i]); + // pass through to stage 3. + temp1[i + 16] = vec_sub(in[15 - i], in[i + 16]); + } + + // Stage 2 + // Unrolling this loop actually slows down Power9 benchmarks + for (i = 0; i < 8; i++) { + temp1[i] = vec_add(temp0[i], temp0[15 - i]); + temp1[i + 8] = vec_sub(temp0[7 - i], temp0[i + 8]); + } + + // Apply butterflies (in place) on pass through to stage 3. + single_butterfly(temp1[27], temp1[20], &temp1[27], &temp1[20]); + single_butterfly(temp1[26], temp1[21], &temp1[26], &temp1[21]); + single_butterfly(temp1[25], temp1[22], &temp1[25], &temp1[22]); + single_butterfly(temp1[24], temp1[23], &temp1[24], &temp1[23]); + + // Reduce the magnitude by a factor of 4 so the intermediate values stay + // within the range of 16 bits.
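+  // Only the second pass (pass == 1) shifts here; the first pass applies +  // sub_round_shift() to its outputs at the end of this function instead.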
+ if (pass) { + temp1[0] = add_round_shift_s16(temp1[0]); + temp1[1] = add_round_shift_s16(temp1[1]); + temp1[2] = add_round_shift_s16(temp1[2]); + temp1[3] = add_round_shift_s16(temp1[3]); + temp1[4] = add_round_shift_s16(temp1[4]); + temp1[5] = add_round_shift_s16(temp1[5]); + temp1[6] = add_round_shift_s16(temp1[6]); + temp1[7] = add_round_shift_s16(temp1[7]); + temp1[8] = add_round_shift_s16(temp1[8]); + temp1[9] = add_round_shift_s16(temp1[9]); + temp1[10] = add_round_shift_s16(temp1[10]); + temp1[11] = add_round_shift_s16(temp1[11]); + temp1[12] = add_round_shift_s16(temp1[12]); + temp1[13] = add_round_shift_s16(temp1[13]); + temp1[14] = add_round_shift_s16(temp1[14]); + temp1[15] = add_round_shift_s16(temp1[15]); + + temp1[16] = add_round_shift_s16(temp1[16]); + temp1[17] = add_round_shift_s16(temp1[17]); + temp1[18] = add_round_shift_s16(temp1[18]); + temp1[19] = add_round_shift_s16(temp1[19]); + temp1[20] = add_round_shift_s16(temp1[20]); + temp1[21] = add_round_shift_s16(temp1[21]); + temp1[22] = add_round_shift_s16(temp1[22]); + temp1[23] = add_round_shift_s16(temp1[23]); + temp1[24] = add_round_shift_s16(temp1[24]); + temp1[25] = add_round_shift_s16(temp1[25]); + temp1[26] = add_round_shift_s16(temp1[26]); + temp1[27] = add_round_shift_s16(temp1[27]); + temp1[28] = add_round_shift_s16(temp1[28]); + temp1[29] = add_round_shift_s16(temp1[29]); + temp1[30] = add_round_shift_s16(temp1[30]); + temp1[31] = add_round_shift_s16(temp1[31]); + } + + // Stage 3 + temp2[0] = vec_add(temp1[0], temp1[7]); + temp2[1] = vec_add(temp1[1], temp1[6]); + temp2[2] = vec_add(temp1[2], temp1[5]); + temp2[3] = vec_add(temp1[3], temp1[4]); + temp2[5] = vec_sub(temp1[2], temp1[5]); + temp2[6] = vec_sub(temp1[1], temp1[6]); + temp2[8] = temp1[8]; + temp2[9] = temp1[9]; + + single_butterfly(temp1[13], temp1[10], &temp2[13], &temp2[10]); + single_butterfly(temp1[12], temp1[11], &temp2[12], &temp2[11]); + temp2[14] = temp1[14]; + temp2[15] = temp1[15]; + + temp2[18] = vec_add(temp1[18], temp1[21]); + temp2[19] = vec_add(temp1[19], temp1[20]); + + temp2[20] = vec_sub(temp1[19], temp1[20]); + temp2[21] = vec_sub(temp1[18], temp1[21]); + + temp2[26] = vec_sub(temp1[29], temp1[26]); + temp2[27] = vec_sub(temp1[28], temp1[27]); + + temp2[28] = vec_add(temp1[28], temp1[27]); + temp2[29] = vec_add(temp1[29], temp1[26]); + + // Pass through Stage 4 + temp0[7] = vec_sub(temp1[0], temp1[7]); + temp0[4] = vec_sub(temp1[3], temp1[4]); + temp0[16] = vec_add(temp1[16], temp1[23]); + temp0[17] = vec_add(temp1[17], temp1[22]); + temp0[22] = vec_sub(temp1[17], temp1[22]); + temp0[23] = vec_sub(temp1[16], temp1[23]); + temp0[24] = vec_sub(temp1[31], temp1[24]); + temp0[25] = vec_sub(temp1[30], temp1[25]); + temp0[30] = vec_add(temp1[30], temp1[25]); + temp0[31] = vec_add(temp1[31], temp1[24]); + + // Stage 4 + temp0[0] = vec_add(temp2[0], temp2[3]); + temp0[1] = vec_add(temp2[1], temp2[2]); + temp0[2] = vec_sub(temp2[1], temp2[2]); + temp0[3] = vec_sub(temp2[0], temp2[3]); + single_butterfly(temp2[6], temp2[5], &temp0[6], &temp0[5]); + + temp0[9] = vec_add(temp2[9], temp2[10]); + temp0[10] = vec_sub(temp2[9], temp2[10]); + temp0[13] = vec_sub(temp2[14], temp2[13]); + temp0[14] = vec_add(temp2[14], temp2[13]); + + double_butterfly(temp2[29], cospi8_v, temp2[18], cospi24_v, &temp0[29], + &temp0[18]); + double_butterfly(temp2[28], cospi8_v, temp2[19], cospi24_v, &temp0[28], + &temp0[19]); + double_butterfly(temp2[27], cospi24_v, temp2[20], cospi8m_v, &temp0[27], + &temp0[20]); + double_butterfly(temp2[26], cospi24_v, temp2[21], 
cospi8m_v, &temp0[26], + &temp0[21]); + + // Pass through Stage 5 + temp1[8] = vec_add(temp2[8], temp2[11]); + temp1[11] = vec_sub(temp2[8], temp2[11]); + temp1[12] = vec_sub(temp2[15], temp2[12]); + temp1[15] = vec_add(temp2[15], temp2[12]); + + // Stage 5 + // 0 and 1 pass through to 0 and 16 at the end + single_butterfly(temp0[0], temp0[1], &out[0], &out[16]); + + // 2 and 3 pass through to 8 and 24 at the end + double_butterfly(temp0[3], cospi8_v, temp0[2], cospi24_v, &out[8], &out[24]); + + temp1[4] = vec_add(temp0[4], temp0[5]); + temp1[5] = vec_sub(temp0[4], temp0[5]); + temp1[6] = vec_sub(temp0[7], temp0[6]); + temp1[7] = vec_add(temp0[7], temp0[6]); + + double_butterfly(temp0[14], cospi8_v, temp0[9], cospi24_v, &temp1[14], + &temp1[9]); + double_butterfly(temp0[13], cospi24_v, temp0[10], cospi8m_v, &temp1[13], + &temp1[10]); + + temp1[17] = vec_add(temp0[17], temp0[18]); + temp1[18] = vec_sub(temp0[17], temp0[18]); + + temp1[21] = vec_sub(temp0[22], temp0[21]); + temp1[22] = vec_add(temp0[22], temp0[21]); + + temp1[25] = vec_add(temp0[25], temp0[26]); + temp1[26] = vec_sub(temp0[25], temp0[26]); + + temp1[29] = vec_sub(temp0[30], temp0[29]); + temp1[30] = vec_add(temp0[30], temp0[29]); + + // Pass through Stage 6 + temp2[16] = vec_add(temp0[16], temp0[19]); + temp2[19] = vec_sub(temp0[16], temp0[19]); + temp2[20] = vec_sub(temp0[23], temp0[20]); + temp2[23] = vec_add(temp0[23], temp0[20]); + temp2[24] = vec_add(temp0[24], temp0[27]); + temp2[27] = vec_sub(temp0[24], temp0[27]); + temp2[28] = vec_sub(temp0[31], temp0[28]); + temp2[31] = vec_add(temp0[31], temp0[28]); + + // Stage 6 + // 4 and 7 pass through to 4 and 28 at the end + double_butterfly(temp1[7], cospi4_v, temp1[4], cospi28_v, &out[4], &out[28]); + // 5 and 6 pass through to 20 and 12 at the end + double_butterfly(temp1[6], cospi20_v, temp1[5], cospi12_v, &out[20], + &out[12]); + temp2[8] = vec_add(temp1[8], temp1[9]); + temp2[9] = vec_sub(temp1[8], temp1[9]); + temp2[10] = vec_sub(temp1[11], temp1[10]); + temp2[11] = vec_add(temp1[11], temp1[10]); + temp2[12] = vec_add(temp1[12], temp1[13]); + temp2[13] = vec_sub(temp1[12], temp1[13]); + temp2[14] = vec_sub(temp1[15], temp1[14]); + temp2[15] = vec_add(temp1[15], temp1[14]); + + double_butterfly(temp1[30], cospi4_v, temp1[17], cospi28_v, &temp2[30], + &temp2[17]); + double_butterfly(temp1[29], cospi28_v, temp1[18], cospi4m_v, &temp2[29], + &temp2[18]); + double_butterfly(temp1[26], cospi20_v, temp1[21], cospi12_v, &temp2[26], + &temp2[21]); + double_butterfly(temp1[25], cospi12_v, temp1[22], cospi20m_v, &temp2[25], + &temp2[22]); + + // Stage 7 + double_butterfly(temp2[15], cospi2_v, temp2[8], cospi30_v, &out[2], &out[30]); + double_butterfly(temp2[14], cospi18_v, temp2[9], cospi14_v, &out[18], + &out[14]); + double_butterfly(temp2[13], cospi10_v, temp2[10], cospi22_v, &out[10], + &out[22]); + double_butterfly(temp2[12], cospi26_v, temp2[11], cospi6_v, &out[26], + &out[6]); + + temp0[16] = vec_add(temp2[16], temp2[17]); + temp0[17] = vec_sub(temp2[16], temp2[17]); + temp0[18] = vec_sub(temp2[19], temp2[18]); + temp0[19] = vec_add(temp2[19], temp2[18]); + temp0[20] = vec_add(temp2[20], temp2[21]); + temp0[21] = vec_sub(temp2[20], temp2[21]); + temp0[22] = vec_sub(temp2[23], temp2[22]); + temp0[23] = vec_add(temp2[23], temp2[22]); + temp0[24] = vec_add(temp2[24], temp2[25]); + temp0[25] = vec_sub(temp2[24], temp2[25]); + temp0[26] = vec_sub(temp2[27], temp2[26]); + temp0[27] = vec_add(temp2[27], temp2[26]); + temp0[28] = vec_add(temp2[28], temp2[29]); + temp0[29] = 
vec_sub(temp2[28], temp2[29]); + temp0[30] = vec_sub(temp2[31], temp2[30]); + temp0[31] = vec_add(temp2[31], temp2[30]); + + // Final stage --- output indices are bit-reversed. + double_butterfly(temp0[31], cospi1_v, temp0[16], cospi31_v, &out[1], + &out[31]); + double_butterfly(temp0[30], cospi17_v, temp0[17], cospi15_v, &out[17], + &out[15]); + double_butterfly(temp0[29], cospi9_v, temp0[18], cospi23_v, &out[9], + &out[23]); + double_butterfly(temp0[28], cospi25_v, temp0[19], cospi7_v, &out[25], + &out[7]); + double_butterfly(temp0[27], cospi5_v, temp0[20], cospi27_v, &out[5], + &out[27]); + double_butterfly(temp0[26], cospi21_v, temp0[21], cospi11_v, &out[21], + &out[11]); + double_butterfly(temp0[25], cospi13_v, temp0[22], cospi19_v, &out[13], + &out[19]); + double_butterfly(temp0[24], cospi29_v, temp0[23], cospi3_v, &out[29], + &out[3]); + + if (pass == 0) { + for (i = 0; i < 32; i++) { + out[i] = sub_round_shift(out[i]); + } + } +} + +void vpx_fdct32x32_rd_vsx(const int16_t *input, tran_low_t *out, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + int16x8_t temp6[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + fdct32_vsx(temp0, temp1, 0); + + load(input + 8, stride, temp0); + fdct32_vsx(temp0, temp2, 0); + + load(input + 16, stride, temp0); + fdct32_vsx(temp0, temp3, 0); + + load(input + 24, stride, temp0); + fdct32_vsx(temp0, temp4, 0); + + // Generate the top row by munging the first set of 8 from each one + // together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out, temp6); + + // Second row of 8x32. + transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 8 * 32, temp6); + + // Third row of 8x32. + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 16 * 32, temp6); + + // Final row of 8x32.
+ transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + fdct32_vsx(temp0, temp5, 1); + + transpose_8x8(&temp5[0], &temp6[0]); + transpose_8x8(&temp5[8], &temp6[8]); + transpose_8x8(&temp5[16], &temp6[16]); + transpose_8x8(&temp5[24], &temp6[24]); + + store(out + 24 * 32, temp6); +} diff --git a/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c index 6273460f190..a4c8322ff2e 100644 --- a/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/intrapred_vsx.c @@ -35,6 +35,8 @@ void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, } } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }; void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, @@ -87,6 +89,7 @@ void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, dst += stride; vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst); } +#endif void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -233,6 +236,8 @@ void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, H_PREDICTOR_32(v15_1); } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0)); @@ -311,6 +316,7 @@ void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, val = vec_sub(vec_add(vec_splat(l, 7), a), tl); vec_vsx_st(vec_packsu(val, tmp), 0, dst); } +#endif static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l, int16x8_t ah, int16x8_t al, int16x8_t tl) { @@ -547,6 +553,8 @@ void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, dc_fill_predictor_32x32(dst, stride, avg32(above)); } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { const uint8x16_t a0 = vec_vsx_ld(0, above); const uint8x16_t l0 = vec_vsx_ld(0, left); @@ -559,6 +567,7 @@ static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) { return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)), 3); } +#endif static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { const uint8x16_t a0 = vec_vsx_ld(0, above); @@ -573,10 +582,13 @@ static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) { 3); } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left)); } +#endif void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -615,6 +627,8 @@ static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b, static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 }; +// TODO(crbug.com/webm/1522): Fix test failures. 
+#if 0 void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t af = vec_vsx_ld(0, above); @@ -633,6 +647,7 @@ void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, row = vec_perm(row, above_right, sl1); } } +#endif void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { @@ -674,6 +689,8 @@ void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride, } } +// TODO(crbug.com/webm/1522): Fix test failures. +#if 0 void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t af = vec_vsx_ld(0, above); @@ -696,6 +713,7 @@ void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride, row1 = vec_perm(row1, above_right, sl1); } } +#endif void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { diff --git a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c index d43a9fd1841..e99412ecab9 100644 --- a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -14,67 +14,129 @@ #include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" #include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, - 16364, 16364, 16364, 16364 }; -static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, - 16305, 16305, 16305, 16305 }; -static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, - 16207, 16207, 16207, 16207 }; -static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, - 16069, 16069, 16069, 16069 }; -static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, - -16069, -16069, -16069, -16069 }; -static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, - 15893, 15893, 15893, 15893 }; -static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, - 15679, 15679, 15679, 15679 }; -static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, - 15426, 15426, 15426, 15426 }; -static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, - 15137, 15137, 15137, 15137 }; -static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, - -15137, -15137, -15137, -15137 }; -static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, - 14811, 14811, 14811, 14811 }; -static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, - 14449, 14449, 14449, 14449 }; -static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, - 14053, 14053, 14053, 14053 }; -static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, - 13623, 13623, 13623, 13623 }; -static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, - 13160, 13160, 13160, 13160 }; -static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, - 12665, 12665, 12665, 12665 }; -static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, - 12140, 12140, 12140, 12140 }; -static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, - 11585, 11585, 11585, 11585 }; -static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, - 11003, 11003, 11003, 11003 }; -static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, - 10394, 10394, 10394, 10394 }; -static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; -static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; -static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, - -9102, -9102, -9102, -9102 }; -static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 
}; -static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; -static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; -static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; -static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, - -6270, -6270, -6270, -6270 }; -static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; -static int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; -static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; -static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; -static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; -static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; -static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi1m_v = { -16364, -16364, -16364, -16364, + -16364, -16364, -16364, -16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi2m_v = { -16305, -16305, -16305, -16305, + -16305, -16305, -16305, -16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi5m_v = { -15893, -15893, -15893, -15893, + -15893, -15893, -15893, -15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi9m_v = { -14811, -14811, -14811, -14811, + -14811, -14811, -14811, -14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi10m_v = { -14449, -14449, -14449, -14449, + -14449, -14449, -14449, -14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi12m_v = { -13623, -13623, -13623, -13623, + -13623, -13623, -13623, -13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi13m_v = { -13160, -13160, -13160, -13160, + -13160, -13160, -13160, -13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi16m_v = { -11585, -11585, -11585, -11585, + -11585, -11585, -11585, -11585 }; +static const int16x8_t 
cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi17m_v = { -11003, -11003, -11003, -11003, + -11003, -11003, -11003, -11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi18m_v = { -10394, -10394, -10394, -10394, + -10394, -10394, -10394, -10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi21m_v = { -8423, -8423, -8423, -8423, + -8423, -8423, -8423, -8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi24m_v = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi25m_v = { -5520, -5520, -5520, -5520, + -5520, -5520, -5520, -5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi26m_v = { -4756, -4756, -4756, -4756, + -4756, -4756, -4756, -4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi28m_v = { -3196, -3196, -3196, -3196, + -3196, -3196, -3196, -3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi29m_v = { -2404, -2404, -2404, -2404, + -2404, -2404, -2404, -2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +static const int16x8_t sinpi_1_9_v = { 5283, 5283, 5283, 5283, + 5283, 5283, 5283, 5283 }; +static const int16x8_t sinpi_2_9_v = { 9929, 9929, 9929, 9929, + 9929, 9929, 9929, 9929 }; +static const int16x8_t sinpi_3_9_v = { 13377, 13377, 13377, 13377, + 13377, 13377, 13377, 13377 }; +static const int16x8_t sinpi_4_9_v = { 15212, 15212, 15212, 15212, + 15212, 15212, 15212, 15212 }; + +static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; + +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ @@ -107,19 +169,18 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); -void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { - int32x4_t temp1, temp2, temp3, temp4; - int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; - uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; - 
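For reference, the splatted tables above hold the standard vpx fixed-point trig constants, cospi_k = round(2^14 * cos(k*pi/64)) (hence cospi16_v = 11585, roughly 2^14/sqrt(2)), and ROUND_SHIFT_INIT materializes the 1 << 13 bias used to round their 32-bit products back down by DCT_CONST_BITS = 14. A minimal scalar sketch of that rounding step (name is illustrative):

#include <stdint.h>

/* Rounding applied after every cospi multiply; per-lane equivalent of
 * DCT_CONST_ROUND_SHIFT with DCT_CONST_BITS == 14. */
static inline int32_t dct_const_round_shift_sketch(int32_t x) {
  return (x + (1 << 13)) >> 14;
}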
int16x8_t v0 = load_tran_low(0, input); - int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); - int16x8_t t0 = vec_mergeh(v0, v1); - int16x8_t t1 = vec_mergel(v0, v1); +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, tr8_mask0), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, tr8_mask0), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride) { + int i, j; uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -129,31 +190,45 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t tmp16_0, tmp16_1; uint8x16_t output_v; uint8_t tmp_dest[16]; - ROUND_SHIFT_INIT PIXEL_ADD_INIT; - v0 = vec_mergeh(t0, t1); - v1 = vec_mergel(t0, t1); - - IDCT4(v0, v1, t_out0, t_out1); - // transpose - t0 = vec_mergeh(t_out0, t_out1); - t1 = vec_mergel(t_out0, t_out1); - v0 = vec_mergeh(t0, t1); - v1 = vec_mergel(t0, t1); - IDCT4(v0, v1, t_out0, t_out1); - - PIXEL_ADD4(v0, t_out0); - PIXEL_ADD4(v1, t_out1); - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); - tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); - output_v = vec_packsu(tmp16_0, tmp16_1); - - vec_vsx_st(output_v, 0, tmp_dest); - for (int i = 0; i < 4; i++) - for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + PIXEL_ADD4(out[0], in[0]); + PIXEL_ADD4(out[1], in[1]); + + PACK_STORE(out[0], out[1]); +} + +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + int16x8_t t0 = vec_mergeh(in[0], in[1]); + int16x8_t t1 = vec_mergel(in[0], in[1]); + ROUND_SHIFT_INIT + + in[0] = vec_mergeh(t0, t1); + in[1] = vec_mergel(t0, t1); + + IDCT4(in[0], in[1], out[0], out[1]); +} + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + // Rows + vpx_idct4_vsx(in, out); + + // Columns + vpx_idct4_vsx(out, in); + + vpx_round_store4x4_vsx(in, out, dest, stride); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -255,28 +330,20 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, #define PIXEL_ADD(in, out, add, shiftx) \ out = vec_add(vec_sra(vec_add(in, add), shiftx), out); -static uint8x16_t tr8_mask0 = { - 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 -}; -static uint8x16_t tr8_mask1 = { - 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, - 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F -}; -void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { - int32x4_t temp10, temp11; +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out) { int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; - int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, - tmp16_2, tmp16_3; - int16x8_t src0 = load_tran_low(0, input); - int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); - 
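The refactor makes the separable structure of the 4x4 transform explicit: vpx_idct4_vsx performs one 1-D pass (rows on the first call, columns on the second), and vpx_round_store4x4_vsx rounds and adds the result into the prediction. A scalar sketch of the same 1-D pass, using cospi values from the tables above and omitting the 16-bit saturation that vec_packs provides (helper names are illustrative):

#include <stdint.h>

#define DCT_ROUND(x) (((x) + (1 << 13)) >> 14)

/* One 1-D IDCT4 pass; the 2-D transform applies it twice, with a transpose
 * between the two passes, just as the vector code does. */
static void idct4_pass_sketch(const int16_t in[4], int16_t out[4]) {
  const int32_t s0 = DCT_ROUND((in[0] + in[2]) * 11585);      /* cospi16 */
  const int32_t s1 = DCT_ROUND((in[0] - in[2]) * 11585);      /* cospi16 */
  const int32_t s2 = DCT_ROUND(in[1] * 6270 - in[3] * 15137); /* cospi24/8 */
  const int32_t s3 = DCT_ROUND(in[1] * 15137 + in[3] * 6270); /* cospi8/24 */
  out[0] = (int16_t)(s0 + s3);
  out[1] = (int16_t)(s1 + s2);
  out[2] = (int16_t)(s1 - s2);
  out[3] = (int16_t)(s0 - s3);
}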
int16x8_t src2 = load_tran_low(16 * sizeof(*input), input); - int16x8_t src3 = load_tran_low(24 * sizeof(*input), input); - int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); - int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); - int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); - int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + int16x8_t tmp16_0, tmp16_1, tmp16_2, tmp16_3; + int32x4_t temp10, temp11; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + IDCT8(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7]); +} + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride) { + uint8x16_t zerov = vec_splat_u8(0); uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -285,7 +352,6 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); - uint8x16_t zerov = vec_splat_u8(0); int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); @@ -297,23 +363,15 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); uint16x8_t shift5 = vec_splat_u16(5); uint8x16_t output0, output1, output2, output3; - ROUND_SHIFT_INIT; - TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, - tmp3, tmp4, tmp5, tmp6, tmp7); - - IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); - TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, - src3, src4, src5, src6, src7); - IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); - PIXEL_ADD(src0, d_u0, add, shift5); - PIXEL_ADD(src1, d_u1, add, shift5); - PIXEL_ADD(src2, d_u2, add, shift5); - PIXEL_ADD(src3, d_u3, add, shift5); - PIXEL_ADD(src4, d_u4, add, shift5); - PIXEL_ADD(src5, d_u5, add, shift5); - PIXEL_ADD(src6, d_u6, add, shift5); - PIXEL_ADD(src7, d_u7, add, shift5); + PIXEL_ADD(in[0], d_u0, add, shift5); + PIXEL_ADD(in[1], d_u1, add, shift5); + PIXEL_ADD(in[2], d_u2, add, shift5); + PIXEL_ADD(in[3], d_u3, add, shift5); + PIXEL_ADD(in[4], d_u4, add, shift5); + PIXEL_ADD(in[5], d_u5, add, shift5); + PIXEL_ADD(in[6], d_u6, add, shift5); + PIXEL_ADD(in[7], d_u7, add, shift5); output0 = vec_packsu(d_u0, d_u1); output1 = vec_packsu(d_u2, d_u3); output2 = vec_packsu(d_u4, d_u5); @@ -329,24 +387,24 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); } -#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ - in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ - in0 = load(offset, source); \ - in1 = load((step) + (offset), source); \ - in2 = load(2 * (step) + (offset), source); \ - in3 = load(3 * (step) + (offset), source); \ - in4 = load(4 * (step) + (offset), source); \ - in5 = load(5 * (step) + (offset), source); \ - in6 = load(6 * (step) + (offset), source); \ - in7 = load(7 * (step) + (offset), source); \ - in8 = load(8 * (step) + (offset), source); \ - in9 = load(9 * (step) + (offset), source); \ - inA = load(10 * (step) + (offset), source); \ - inB = load(11 * (step) + (offset), source); \ - inC = load(12 * (step) + (offset), 
source); \ - inD = load(13 * (step) + (offset), source); \ - inE = load(14 * (step) + (offset), source); \ - inF = load(15 * (step) + (offset), source); +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src[8], tmp[8]; + + src[0] = load_tran_low(0, input); + src[1] = load_tran_low(8 * sizeof(*input), input); + src[2] = load_tran_low(16 * sizeof(*input), input); + src[3] = load_tran_low(24 * sizeof(*input), input); + src[4] = load_tran_low(32 * sizeof(*input), input); + src[5] = load_tran_low(40 * sizeof(*input), input); + src[6] = load_tran_low(48 * sizeof(*input), input); + src[7] = load_tran_low(56 * sizeof(*input), input); + + vpx_idct8_vsx(src, tmp); + vpx_idct8_vsx(tmp, src); + + vpx_round_store8x8_vsx(src, dest, stride); +} #define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ tmp16_0 = vec_mergeh(inpt0, inpt1); \ @@ -446,9 +504,9 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, tmp16_0 = vec_mergeh(outA, outD); \ tmp16_1 = vec_mergel(outA, outD); \ temp10 = \ - vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + vec_sub(vec_mule(tmp16_0, cospi24m_v), vec_mulo(tmp16_0, cospi8_v)); \ temp11 = \ - vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + vec_sub(vec_mule(tmp16_1, cospi24m_v), vec_mulo(tmp16_1, cospi8_v)); \ DCT_CONST_ROUND_SHIFT(temp10); \ DCT_CONST_ROUND_SHIFT(temp11); \ inA = vec_packs(temp10, temp11); \ @@ -520,95 +578,131 @@ void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD(in1, d_ul, add, shift6); \ vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); -void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, - int stride) { +static void half_idct16x8_vsx(int16x8_t *src) { + int16x8_t tmp0[8], tmp1[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src[0], src[2], src[4], src[6], src[8], src[10], src[12], + src[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src[1], src[3], src[5], src[7], src[9], src[11], src[13], + src[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src[0], src[2], src[4], src[6], src[8], src[10], src[12], src[14], + src[1], src[3], src[5], src[7], src[9], src[11], src[13], src[15]); +} + +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; int32x4_t temp10, temp11, temp20, temp21, temp30; - int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, - src11, src12, src13, src14, src15, src16, src17; - int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, - src31, src32, src33, src34, src35, src36, src37; - int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; - int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; - uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, - dest9, destA, destB, destC, destD, destE, destF; + int16x8_t tmp16_0, tmp16_1; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + 
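The PIXEL_ADD add/shift pairs encode the per-size output rounding of the inverse transform: (x + 8) >> 4 for 4x4, (x + 16) >> 5 for 8x8, and (x + 32) >> 6 for 16x16 and 32x32, followed by a clamped add into the prediction (vec_packsu supplies the 0..255 saturation). Scalar sketch for one pixel:

#include <stdint.h>

/* shift is 4, 5 or 6 by block size; the bias is always 1 << (shift - 1). */
static uint8_t pixel_add_sketch(uint8_t pred, int32_t residual, int shift) {
  const int32_t v = pred + ((residual + (1 << (shift - 1))) >> shift);
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}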
TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); + + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); + + IDCT16(tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); +} + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride) { + uint8x16_t destv[16]; int16x8_t d_uh, d_ul; - int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); - uint16x8_t shift6 = vec_splat_u16(6); uint8x16_t zerov = vec_splat_u8(0); + uint16x8_t shift6 = vec_splat_u16(6); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, destv); + + PIXEL_ADD_STORE16(src0[0], src0[1], destv[0], 0); + PIXEL_ADD_STORE16(src0[2], src0[3], destv[1], stride); + PIXEL_ADD_STORE16(src0[4], src0[5], destv[2], 2 * stride); + PIXEL_ADD_STORE16(src0[6], src0[7], destv[3], 3 * stride); + PIXEL_ADD_STORE16(src0[8], src0[9], destv[4], 4 * stride); + PIXEL_ADD_STORE16(src0[10], src0[11], destv[5], 5 * stride); + PIXEL_ADD_STORE16(src0[12], src0[13], destv[6], 6 * stride); + PIXEL_ADD_STORE16(src0[14], src0[15], destv[7], 7 * stride); + + PIXEL_ADD_STORE16(src1[0], src1[1], destv[8], 8 * stride); + PIXEL_ADD_STORE16(src1[2], src1[3], destv[9], 9 * stride); + PIXEL_ADD_STORE16(src1[4], src1[5], destv[10], 10 * stride); + PIXEL_ADD_STORE16(src1[6], src1[7], destv[11], 11 * stride); + PIXEL_ADD_STORE16(src1[8], src1[9], destv[12], 12 * stride); + PIXEL_ADD_STORE16(src1[10], src1[11], destv[13], 13 * stride); + PIXEL_ADD_STORE16(src1[12], src1[13], destv[14], 14 * stride); + PIXEL_ADD_STORE16(src1[14], src1[15], destv[15], 15 * stride); +} +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[16], src1[16]; + int16x8_t tmp0[8], tmp1[8], tmp2[8], tmp3[8]; + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t tmp16_0, tmp16_1; ROUND_SHIFT_INIT; - // transform rows - // load and transform the upper half of 16x16 matrix - LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, - src11, src02, src12, src03, src13, src04, src14, src05, src15, - src06, src16, src07, src17); - TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, - tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); - TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); - IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, - tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, - src04, src05, src06, src07, src10, src11, 
src12, src13, src14, src15, - src16, src17); - TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, - tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); - TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, - tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); - - // load and transform the lower half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src0); LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), - 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, - src23, src33, src24, src34, src25, src35, src26, src36, src27, - src37); - TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, - tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); - TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); - IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, - tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, - src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, - src36, src37); - TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, - tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); - TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, - tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + 8 * sizeof(*input), src1); + + // transform rows + // transform the upper half of 16x16 matrix + half_idct16x8_vsx(src0); + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + + // transform the lower half of 16x16 matrix + half_idct16x8_vsx(src1); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], + tmp2[6], tmp2[7]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], + tmp3[6], tmp3[7]); // transform columns // left half first - IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, - tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, - src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, - src26, src27); + IDCT16(tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], tmp0[6], tmp0[7], + tmp2[0], tmp2[1], tmp2[2], tmp2[3], tmp2[4], tmp2[5], tmp2[6], tmp2[7], + src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], + src1[12], src1[14]); // right half - IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, - tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, - src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, - src36, src37); + IDCT16(tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], tmp1[6], tmp1[7], + tmp3[0], tmp3[1], tmp3[2], tmp3[3], tmp3[4], tmp3[5], tmp3[6], tmp3[7], + src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], + src1[13], src1[15]); - // load dest - LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, - dest5, dest6, dest7, dest8, dest9, destA, 
destB, destC, destD, - destE, destF); - - PIXEL_ADD_STORE16(src00, src10, dest0, 0); - PIXEL_ADD_STORE16(src01, src11, dest1, stride); - PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); - PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); - PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); - PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); - PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); - PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); - - PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); - PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); - PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); - PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); - PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); - PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); - PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); - PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); + vpx_round_store16x16_vsx(src0, src1, dest, stride); } #define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ @@ -980,15 +1074,15 @@ void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD(in3, d_ul, add, shift6); \ vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); -#define ADD_STORE_BLOCK(in, offset) \ - PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \ - PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \ - PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \ - PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \ - PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \ - PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \ - PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \ - PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7); +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], (offset) + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], (offset) + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], (offset) + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], (offset) + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], (offset) + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], (offset) + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], (offset) + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], (offset) + 7); void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { @@ -1061,3 +1155,674 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, ADD_STORE_BLOCK(src2, 16); ADD_STORE_BLOCK(src3, 24); } + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + 
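TRANSFORM_COLS above is the lossless inverse Walsh-Hadamard butterfly from the scalar code, and the initial vec_sra by two is the UNIT_QUANT_SHIFT == 2 prescale. One 1-D pass in scalar form (sketch; the input order a, c, d, b matches the unpacks below):

#include <stdint.h>

static void iwht4_pass_sketch(const int16_t in[4], int16_t out[4]) {
  int32_t a = in[0], c = in[1], d = in[2], b = in[3], e;
  a += c;
  d -= b;
  e = (a - d) >> 1;
  b = e - b;
  c = e - c;
  a -= b;
  d += c;
  out[0] = (int16_t)a;
  out[1] = (int16_t)b;
  out[2] = (int16_t)c;
  out[3] = (int16_t)d;
}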
uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} + +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out) { + int16x8_t sinpi_1_3_v, sinpi_4_2_v, sinpi_2_3_v, sinpi_1_4_v, sinpi_12_n3_v; + int32x4_t v_v[5], u_v[4]; + int32x4_t zerov = vec_splat_s32(0); + int16x8_t tmp0, tmp1; + int16x8_t zero16v = vec_splat_s16(0); + uint32x4_t shift16 = vec_sl(vec_splat_u32(8), vec_splat_u32(1)); + ROUND_SHIFT_INIT; + + sinpi_1_3_v = vec_mergel(sinpi_1_9_v, sinpi_3_9_v); + sinpi_4_2_v = vec_mergel(sinpi_4_9_v, sinpi_2_9_v); + sinpi_2_3_v = vec_mergel(sinpi_2_9_v, sinpi_3_9_v); + sinpi_1_4_v = vec_mergel(sinpi_1_9_v, sinpi_4_9_v); + sinpi_12_n3_v = vec_mergel(vec_add(sinpi_1_9_v, sinpi_2_9_v), + vec_sub(zero16v, sinpi_3_9_v)); + + tmp0 = (int16x8_t)vec_mergeh((int32x4_t)in[0], (int32x4_t)in[1]); + tmp1 = (int16x8_t)vec_mergel((int32x4_t)in[0], (int32x4_t)in[1]); + in[0] = (int16x8_t)vec_mergeh((int32x4_t)tmp0, (int32x4_t)tmp1); + in[1] = (int16x8_t)vec_mergel((int32x4_t)tmp0, (int32x4_t)tmp1); + + v_v[0] = vec_msum(in[0], sinpi_1_3_v, zerov); + v_v[1] = vec_msum(in[1], sinpi_4_2_v, zerov); + v_v[2] = vec_msum(in[0], sinpi_2_3_v, zerov); + v_v[3] = vec_msum(in[1], sinpi_1_4_v, zerov); + v_v[4] = vec_msum(in[0], sinpi_12_n3_v, zerov); + + in[0] = vec_sub(in[0], in[1]); + in[1] = (int16x8_t)vec_sra((int32x4_t)in[1], shift16); + in[0] = vec_add(in[0], in[1]); + in[0] = (int16x8_t)vec_sl((int32x4_t)in[0], shift16); + + u_v[0] = vec_add(v_v[0], v_v[1]); + u_v[1] = vec_sub(v_v[2], v_v[3]); + u_v[2] = vec_msum(in[0], sinpi_1_3_v, zerov); + u_v[3] = vec_sub(v_v[1], v_v[3]); + u_v[3] = vec_add(u_v[3], v_v[4]); + + DCT_CONST_ROUND_SHIFT(u_v[0]); + DCT_CONST_ROUND_SHIFT(u_v[1]); + DCT_CONST_ROUND_SHIFT(u_v[2]); + DCT_CONST_ROUND_SHIFT(u_v[3]); + + out[0] = vec_packs(u_v[0], u_v[1]); + out[1] = vec_packs(u_v[2], u_v[3]); +} + +#define MSUM_ROUND_SHIFT(a, b, cospi) \ + b = vec_msums(a, cospi, zerov); \ + DCT_CONST_ROUND_SHIFT(b); + +#define IADST_WRAPLOW(in0, in1, tmp0, tmp1, out, cospi) \ + MSUM_ROUND_SHIFT(in0, tmp0, cospi); \ + MSUM_ROUND_SHIFT(in1, tmp1, cospi); \ + out = vec_packs(tmp0, tmp1); + +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[16], tmp1[16]; + + int32x4_t zerov = vec_splat_s32(0); + int16x8_t zero16v = vec_splat_s16(0); + int16x8_t cospi_p02_p30_v = vec_mergel(cospi2_v, cospi30_v); + int16x8_t cospi_p30_m02_v = vec_mergel(cospi30_v, cospi2m_v); + int16x8_t cospi_p10_p22_v = vec_mergel(cospi10_v, cospi22_v); + int16x8_t cospi_p22_m10_v = vec_mergel(cospi22_v, cospi10m_v); + int16x8_t cospi_p18_p14_v = vec_mergel(cospi18_v, cospi14_v); + int16x8_t cospi_p14_m18_v = 
vec_mergel(cospi14_v, cospi18m_v); + int16x8_t cospi_p26_p06_v = vec_mergel(cospi26_v, cospi6_v); + int16x8_t cospi_p06_m26_v = vec_mergel(cospi6_v, cospi26m_v); + int16x8_t cospi_p08_p24_v = vec_mergel(cospi8_v, cospi24_v); + int16x8_t cospi_p24_m08_v = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08_v = vec_mergel(cospi24m_v, cospi8_v); + int16x8_t cospi_p16_m16_v = vec_mergel(cospi16_v, cospi16m_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], out[0], + out[1], out[2], out[3], out[4], out[5], out[6], out[7]); + + // stage 1 + // interleave and multiply/add into 32-bit integer + in[0] = vec_mergeh(out[7], out[0]); + in[1] = vec_mergel(out[7], out[0]); + in[2] = vec_mergeh(out[5], out[2]); + in[3] = vec_mergel(out[5], out[2]); + in[4] = vec_mergeh(out[3], out[4]); + in[5] = vec_mergel(out[3], out[4]); + in[6] = vec_mergeh(out[1], out[6]); + in[7] = vec_mergel(out[1], out[6]); + + tmp1[0] = vec_msum(in[0], cospi_p02_p30_v, zerov); + tmp1[1] = vec_msum(in[1], cospi_p02_p30_v, zerov); + tmp1[2] = vec_msum(in[0], cospi_p30_m02_v, zerov); + tmp1[3] = vec_msum(in[1], cospi_p30_m02_v, zerov); + tmp1[4] = vec_msum(in[2], cospi_p10_p22_v, zerov); + tmp1[5] = vec_msum(in[3], cospi_p10_p22_v, zerov); + tmp1[6] = vec_msum(in[2], cospi_p22_m10_v, zerov); + tmp1[7] = vec_msum(in[3], cospi_p22_m10_v, zerov); + tmp1[8] = vec_msum(in[4], cospi_p18_p14_v, zerov); + tmp1[9] = vec_msum(in[5], cospi_p18_p14_v, zerov); + tmp1[10] = vec_msum(in[4], cospi_p14_m18_v, zerov); + tmp1[11] = vec_msum(in[5], cospi_p14_m18_v, zerov); + tmp1[12] = vec_msum(in[6], cospi_p26_p06_v, zerov); + tmp1[13] = vec_msum(in[7], cospi_p26_p06_v, zerov); + tmp1[14] = vec_msum(in[6], cospi_p06_m26_v, zerov); + tmp1[15] = vec_msum(in[7], cospi_p06_m26_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[8]); + tmp0[1] = vec_add(tmp1[1], tmp1[9]); + tmp0[2] = vec_add(tmp1[2], tmp1[10]); + tmp0[3] = vec_add(tmp1[3], tmp1[11]); + tmp0[4] = vec_add(tmp1[4], tmp1[12]); + tmp0[5] = vec_add(tmp1[5], tmp1[13]); + tmp0[6] = vec_add(tmp1[6], tmp1[14]); + tmp0[7] = vec_add(tmp1[7], tmp1[15]); + tmp0[8] = vec_sub(tmp1[0], tmp1[8]); + tmp0[9] = vec_sub(tmp1[1], tmp1[9]); + tmp0[10] = vec_sub(tmp1[2], tmp1[10]); + tmp0[11] = vec_sub(tmp1[3], tmp1[11]); + tmp0[12] = vec_sub(tmp1[4], tmp1[12]); + tmp0[13] = vec_sub(tmp1[5], tmp1[13]); + tmp0[14] = vec_sub(tmp1[6], tmp1[14]); + tmp0[15] = vec_sub(tmp1[7], tmp1[15]); + + // shift and rounding + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + DCT_CONST_ROUND_SHIFT(tmp0[8]); + DCT_CONST_ROUND_SHIFT(tmp0[9]); + DCT_CONST_ROUND_SHIFT(tmp0[10]); + DCT_CONST_ROUND_SHIFT(tmp0[11]); + DCT_CONST_ROUND_SHIFT(tmp0[12]); + DCT_CONST_ROUND_SHIFT(tmp0[13]); + DCT_CONST_ROUND_SHIFT(tmp0[14]); + DCT_CONST_ROUND_SHIFT(tmp0[15]); + + // back to 16-bit + out[0] = vec_packs(tmp0[0], tmp0[1]); + out[1] = vec_packs(tmp0[2], tmp0[3]); + out[2] = vec_packs(tmp0[4], tmp0[5]); + out[3] = vec_packs(tmp0[6], tmp0[7]); + out[4] = vec_packs(tmp0[8], tmp0[9]); + out[5] = vec_packs(tmp0[10], tmp0[11]); + out[6] = vec_packs(tmp0[12], tmp0[13]); + out[7] = vec_packs(tmp0[14], tmp0[15]); + + // stage 2 + in[0] = vec_add(out[0], out[2]); + in[1] = vec_add(out[1], out[3]); + in[2] = vec_sub(out[0], out[2]); + in[3] = vec_sub(out[1], out[3]); + in[4] = 
vec_mergeh(out[4], out[5]); + in[5] = vec_mergel(out[4], out[5]); + in[6] = vec_mergeh(out[6], out[7]); + in[7] = vec_mergel(out[6], out[7]); + + tmp1[0] = vec_msum(in[4], cospi_p08_p24_v, zerov); + tmp1[1] = vec_msum(in[5], cospi_p08_p24_v, zerov); + tmp1[2] = vec_msum(in[4], cospi_p24_m08_v, zerov); + tmp1[3] = vec_msum(in[5], cospi_p24_m08_v, zerov); + tmp1[4] = vec_msum(in[6], cospi_m24_p08_v, zerov); + tmp1[5] = vec_msum(in[7], cospi_m24_p08_v, zerov); + tmp1[6] = vec_msum(in[6], cospi_p08_p24_v, zerov); + tmp1[7] = vec_msum(in[7], cospi_p08_p24_v, zerov); + + tmp0[0] = vec_add(tmp1[0], tmp1[4]); + tmp0[1] = vec_add(tmp1[1], tmp1[5]); + tmp0[2] = vec_add(tmp1[2], tmp1[6]); + tmp0[3] = vec_add(tmp1[3], tmp1[7]); + tmp0[4] = vec_sub(tmp1[0], tmp1[4]); + tmp0[5] = vec_sub(tmp1[1], tmp1[5]); + tmp0[6] = vec_sub(tmp1[2], tmp1[6]); + tmp0[7] = vec_sub(tmp1[3], tmp1[7]); + + DCT_CONST_ROUND_SHIFT(tmp0[0]); + DCT_CONST_ROUND_SHIFT(tmp0[1]); + DCT_CONST_ROUND_SHIFT(tmp0[2]); + DCT_CONST_ROUND_SHIFT(tmp0[3]); + DCT_CONST_ROUND_SHIFT(tmp0[4]); + DCT_CONST_ROUND_SHIFT(tmp0[5]); + DCT_CONST_ROUND_SHIFT(tmp0[6]); + DCT_CONST_ROUND_SHIFT(tmp0[7]); + + in[4] = vec_packs(tmp0[0], tmp0[1]); + in[5] = vec_packs(tmp0[2], tmp0[3]); + in[6] = vec_packs(tmp0[4], tmp0[5]); + in[7] = vec_packs(tmp0[6], tmp0[7]); + + // stage 3 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[2], cospi16_v); + IADST_WRAPLOW(out[0], out[1], tmp0[0], tmp0[1], in[3], cospi_p16_m16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[6], cospi16_v); + IADST_WRAPLOW(out[2], out[3], tmp0[0], tmp0[1], in[7], cospi_p16_m16_v); + + out[0] = in[0]; + out[2] = in[6]; + out[4] = in[3]; + out[6] = in[5]; + + out[1] = vec_sub(zero16v, in[4]); + out[3] = vec_sub(zero16v, in[2]); + out[5] = vec_sub(zero16v, in[7]); + out[7] = vec_sub(zero16v, in[1]); +} + +static void iadst16x8_vsx(int16x8_t *in, int16x8_t *out) { + int32x4_t tmp0[32], tmp1[32]; + int16x8_t tmp16_0[8]; + int16x8_t cospi_p01_p31 = vec_mergel(cospi1_v, cospi31_v); + int16x8_t cospi_p31_m01 = vec_mergel(cospi31_v, cospi1m_v); + int16x8_t cospi_p05_p27 = vec_mergel(cospi5_v, cospi27_v); + int16x8_t cospi_p27_m05 = vec_mergel(cospi27_v, cospi5m_v); + int16x8_t cospi_p09_p23 = vec_mergel(cospi9_v, cospi23_v); + int16x8_t cospi_p23_m09 = vec_mergel(cospi23_v, cospi9m_v); + int16x8_t cospi_p13_p19 = vec_mergel(cospi13_v, cospi19_v); + int16x8_t cospi_p19_m13 = vec_mergel(cospi19_v, cospi13m_v); + int16x8_t cospi_p17_p15 = vec_mergel(cospi17_v, cospi15_v); + int16x8_t cospi_p15_m17 = vec_mergel(cospi15_v, cospi17m_v); + int16x8_t cospi_p21_p11 = vec_mergel(cospi21_v, cospi11_v); + int16x8_t cospi_p11_m21 = vec_mergel(cospi11_v, cospi21m_v); + int16x8_t cospi_p25_p07 = vec_mergel(cospi25_v, cospi7_v); + int16x8_t cospi_p07_m25 = vec_mergel(cospi7_v, cospi25m_v); + int16x8_t cospi_p29_p03 = vec_mergel(cospi29_v, cospi3_v); + int16x8_t cospi_p03_m29 = vec_mergel(cospi3_v, cospi29m_v); + int16x8_t cospi_p04_p28 = vec_mergel(cospi4_v, cospi28_v); + int16x8_t cospi_p28_m04 = vec_mergel(cospi28_v, cospi4m_v); + int16x8_t cospi_p20_p12 = vec_mergel(cospi20_v, cospi12_v); + int16x8_t cospi_p12_m20 = vec_mergel(cospi12_v, cospi20m_v); + int16x8_t cospi_m28_p04 = vec_mergel(cospi28m_v, cospi4_v); + int16x8_t cospi_m12_p20 = vec_mergel(cospi12m_v, cospi20_v); + int16x8_t cospi_p08_p24 = vec_mergel(cospi8_v, cospi24_v); + int16x8_t 
cospi_p24_m08 = vec_mergel(cospi24_v, cospi8m_v); + int16x8_t cospi_m24_p08 = vec_mergel(cospi24m_v, cospi8_v); + int32x4_t zerov = vec_splat_s32(0); + ROUND_SHIFT_INIT; + + tmp16_0[0] = vec_mergeh(in[15], in[0]); + tmp16_0[1] = vec_mergel(in[15], in[0]); + tmp16_0[2] = vec_mergeh(in[13], in[2]); + tmp16_0[3] = vec_mergel(in[13], in[2]); + tmp16_0[4] = vec_mergeh(in[11], in[4]); + tmp16_0[5] = vec_mergel(in[11], in[4]); + tmp16_0[6] = vec_mergeh(in[9], in[6]); + tmp16_0[7] = vec_mergel(in[9], in[6]); + tmp16_0[8] = vec_mergeh(in[7], in[8]); + tmp16_0[9] = vec_mergel(in[7], in[8]); + tmp16_0[10] = vec_mergeh(in[5], in[10]); + tmp16_0[11] = vec_mergel(in[5], in[10]); + tmp16_0[12] = vec_mergeh(in[3], in[12]); + tmp16_0[13] = vec_mergel(in[3], in[12]); + tmp16_0[14] = vec_mergeh(in[1], in[14]); + tmp16_0[15] = vec_mergel(in[1], in[14]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p01_p31, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p01_p31, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p31_m01, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p31_m01, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p05_p27, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p05_p27, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p27_m05, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p27_m05, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_p09_p23, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_p09_p23, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p23_m09, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p23_m09, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_p13_p19, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_p13_p19, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p19_m13, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p19_m13, zerov); + tmp0[16] = vec_msum(tmp16_0[8], cospi_p17_p15, zerov); + tmp0[17] = vec_msum(tmp16_0[9], cospi_p17_p15, zerov); + tmp0[18] = vec_msum(tmp16_0[8], cospi_p15_m17, zerov); + tmp0[19] = vec_msum(tmp16_0[9], cospi_p15_m17, zerov); + tmp0[20] = vec_msum(tmp16_0[10], cospi_p21_p11, zerov); + tmp0[21] = vec_msum(tmp16_0[11], cospi_p21_p11, zerov); + tmp0[22] = vec_msum(tmp16_0[10], cospi_p11_m21, zerov); + tmp0[23] = vec_msum(tmp16_0[11], cospi_p11_m21, zerov); + tmp0[24] = vec_msum(tmp16_0[12], cospi_p25_p07, zerov); + tmp0[25] = vec_msum(tmp16_0[13], cospi_p25_p07, zerov); + tmp0[26] = vec_msum(tmp16_0[12], cospi_p07_m25, zerov); + tmp0[27] = vec_msum(tmp16_0[13], cospi_p07_m25, zerov); + tmp0[28] = vec_msum(tmp16_0[14], cospi_p29_p03, zerov); + tmp0[29] = vec_msum(tmp16_0[15], cospi_p29_p03, zerov); + tmp0[30] = vec_msum(tmp16_0[14], cospi_p03_m29, zerov); + tmp0[31] = vec_msum(tmp16_0[15], cospi_p03_m29, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[16]); + tmp1[1] = vec_add(tmp0[1], tmp0[17]); + tmp1[2] = vec_add(tmp0[2], tmp0[18]); + tmp1[3] = vec_add(tmp0[3], tmp0[19]); + tmp1[4] = vec_add(tmp0[4], tmp0[20]); + tmp1[5] = vec_add(tmp0[5], tmp0[21]); + tmp1[6] = vec_add(tmp0[6], tmp0[22]); + tmp1[7] = vec_add(tmp0[7], tmp0[23]); + tmp1[8] = vec_add(tmp0[8], tmp0[24]); + tmp1[9] = vec_add(tmp0[9], tmp0[25]); + tmp1[10] = vec_add(tmp0[10], tmp0[26]); + tmp1[11] = vec_add(tmp0[11], tmp0[27]); + tmp1[12] = vec_add(tmp0[12], tmp0[28]); + tmp1[13] = vec_add(tmp0[13], tmp0[29]); + tmp1[14] = vec_add(tmp0[14], tmp0[30]); + tmp1[15] = vec_add(tmp0[15], tmp0[31]); + tmp1[16] = vec_sub(tmp0[0], tmp0[16]); + tmp1[17] = vec_sub(tmp0[1], tmp0[17]); + tmp1[18] = vec_sub(tmp0[2], tmp0[18]); + tmp1[19] = vec_sub(tmp0[3], tmp0[19]); + tmp1[20] = vec_sub(tmp0[4], tmp0[20]); + tmp1[21] = 
vec_sub(tmp0[5], tmp0[21]); + tmp1[22] = vec_sub(tmp0[6], tmp0[22]); + tmp1[23] = vec_sub(tmp0[7], tmp0[23]); + tmp1[24] = vec_sub(tmp0[8], tmp0[24]); + tmp1[25] = vec_sub(tmp0[9], tmp0[25]); + tmp1[26] = vec_sub(tmp0[10], tmp0[26]); + tmp1[27] = vec_sub(tmp0[11], tmp0[27]); + tmp1[28] = vec_sub(tmp0[12], tmp0[28]); + tmp1[29] = vec_sub(tmp0[13], tmp0[29]); + tmp1[30] = vec_sub(tmp0[14], tmp0[30]); + tmp1[31] = vec_sub(tmp0[15], tmp0[31]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + DCT_CONST_ROUND_SHIFT(tmp1[16]); + DCT_CONST_ROUND_SHIFT(tmp1[17]); + DCT_CONST_ROUND_SHIFT(tmp1[18]); + DCT_CONST_ROUND_SHIFT(tmp1[19]); + DCT_CONST_ROUND_SHIFT(tmp1[20]); + DCT_CONST_ROUND_SHIFT(tmp1[21]); + DCT_CONST_ROUND_SHIFT(tmp1[22]); + DCT_CONST_ROUND_SHIFT(tmp1[23]); + DCT_CONST_ROUND_SHIFT(tmp1[24]); + DCT_CONST_ROUND_SHIFT(tmp1[25]); + DCT_CONST_ROUND_SHIFT(tmp1[26]); + DCT_CONST_ROUND_SHIFT(tmp1[27]); + DCT_CONST_ROUND_SHIFT(tmp1[28]); + DCT_CONST_ROUND_SHIFT(tmp1[29]); + DCT_CONST_ROUND_SHIFT(tmp1[30]); + DCT_CONST_ROUND_SHIFT(tmp1[31]); + + in[0] = vec_packs(tmp1[0], tmp1[1]); + in[1] = vec_packs(tmp1[2], tmp1[3]); + in[2] = vec_packs(tmp1[4], tmp1[5]); + in[3] = vec_packs(tmp1[6], tmp1[7]); + in[4] = vec_packs(tmp1[8], tmp1[9]); + in[5] = vec_packs(tmp1[10], tmp1[11]); + in[6] = vec_packs(tmp1[12], tmp1[13]); + in[7] = vec_packs(tmp1[14], tmp1[15]); + in[8] = vec_packs(tmp1[16], tmp1[17]); + in[9] = vec_packs(tmp1[18], tmp1[19]); + in[10] = vec_packs(tmp1[20], tmp1[21]); + in[11] = vec_packs(tmp1[22], tmp1[23]); + in[12] = vec_packs(tmp1[24], tmp1[25]); + in[13] = vec_packs(tmp1[26], tmp1[27]); + in[14] = vec_packs(tmp1[28], tmp1[29]); + in[15] = vec_packs(tmp1[30], tmp1[31]); + + // stage 2 + tmp16_0[0] = vec_mergeh(in[8], in[9]); + tmp16_0[1] = vec_mergel(in[8], in[9]); + tmp16_0[2] = vec_mergeh(in[10], in[11]); + tmp16_0[3] = vec_mergel(in[10], in[11]); + tmp16_0[4] = vec_mergeh(in[12], in[13]); + tmp16_0[5] = vec_mergel(in[12], in[13]); + tmp16_0[6] = vec_mergeh(in[14], in[15]); + tmp16_0[7] = vec_mergel(in[14], in[15]); + + tmp0[0] = vec_msum(tmp16_0[0], cospi_p04_p28, zerov); + tmp0[1] = vec_msum(tmp16_0[1], cospi_p04_p28, zerov); + tmp0[2] = vec_msum(tmp16_0[0], cospi_p28_m04, zerov); + tmp0[3] = vec_msum(tmp16_0[1], cospi_p28_m04, zerov); + tmp0[4] = vec_msum(tmp16_0[2], cospi_p20_p12, zerov); + tmp0[5] = vec_msum(tmp16_0[3], cospi_p20_p12, zerov); + tmp0[6] = vec_msum(tmp16_0[2], cospi_p12_m20, zerov); + tmp0[7] = vec_msum(tmp16_0[3], cospi_p12_m20, zerov); + tmp0[8] = vec_msum(tmp16_0[4], cospi_m28_p04, zerov); + tmp0[9] = vec_msum(tmp16_0[5], cospi_m28_p04, zerov); + tmp0[10] = vec_msum(tmp16_0[4], cospi_p04_p28, zerov); + tmp0[11] = vec_msum(tmp16_0[5], cospi_p04_p28, zerov); + tmp0[12] = vec_msum(tmp16_0[6], cospi_m12_p20, zerov); + tmp0[13] = vec_msum(tmp16_0[7], cospi_m12_p20, zerov); + tmp0[14] = vec_msum(tmp16_0[6], cospi_p20_p12, zerov); + tmp0[15] = vec_msum(tmp16_0[7], cospi_p20_p12, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[8]); + tmp1[1] = vec_add(tmp0[1], tmp0[9]); + 
tmp1[2] = vec_add(tmp0[2], tmp0[10]); + tmp1[3] = vec_add(tmp0[3], tmp0[11]); + tmp1[4] = vec_add(tmp0[4], tmp0[12]); + tmp1[5] = vec_add(tmp0[5], tmp0[13]); + tmp1[6] = vec_add(tmp0[6], tmp0[14]); + tmp1[7] = vec_add(tmp0[7], tmp0[15]); + tmp1[8] = vec_sub(tmp0[0], tmp0[8]); + tmp1[9] = vec_sub(tmp0[1], tmp0[9]); + tmp1[10] = vec_sub(tmp0[2], tmp0[10]); + tmp1[11] = vec_sub(tmp0[3], tmp0[11]); + tmp1[12] = vec_sub(tmp0[4], tmp0[12]); + tmp1[13] = vec_sub(tmp0[5], tmp0[13]); + tmp1[14] = vec_sub(tmp0[6], tmp0[14]); + tmp1[15] = vec_sub(tmp0[7], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + tmp16_0[0] = vec_add(in[0], in[4]); + tmp16_0[1] = vec_add(in[1], in[5]); + tmp16_0[2] = vec_add(in[2], in[6]); + tmp16_0[3] = vec_add(in[3], in[7]); + tmp16_0[4] = vec_sub(in[0], in[4]); + tmp16_0[5] = vec_sub(in[1], in[5]); + tmp16_0[6] = vec_sub(in[2], in[6]); + tmp16_0[7] = vec_sub(in[3], in[7]); + tmp16_0[8] = vec_packs(tmp1[0], tmp1[1]); + tmp16_0[9] = vec_packs(tmp1[2], tmp1[3]); + tmp16_0[10] = vec_packs(tmp1[4], tmp1[5]); + tmp16_0[11] = vec_packs(tmp1[6], tmp1[7]); + tmp16_0[12] = vec_packs(tmp1[8], tmp1[9]); + tmp16_0[13] = vec_packs(tmp1[10], tmp1[11]); + tmp16_0[14] = vec_packs(tmp1[12], tmp1[13]); + tmp16_0[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 3 + in[0] = vec_mergeh(tmp16_0[4], tmp16_0[5]); + in[1] = vec_mergel(tmp16_0[4], tmp16_0[5]); + in[2] = vec_mergeh(tmp16_0[6], tmp16_0[7]); + in[3] = vec_mergel(tmp16_0[6], tmp16_0[7]); + in[4] = vec_mergeh(tmp16_0[12], tmp16_0[13]); + in[5] = vec_mergel(tmp16_0[12], tmp16_0[13]); + in[6] = vec_mergeh(tmp16_0[14], tmp16_0[15]); + in[7] = vec_mergel(tmp16_0[14], tmp16_0[15]); + + tmp0[0] = vec_msum(in[0], cospi_p08_p24, zerov); + tmp0[1] = vec_msum(in[1], cospi_p08_p24, zerov); + tmp0[2] = vec_msum(in[0], cospi_p24_m08, zerov); + tmp0[3] = vec_msum(in[1], cospi_p24_m08, zerov); + tmp0[4] = vec_msum(in[2], cospi_m24_p08, zerov); + tmp0[5] = vec_msum(in[3], cospi_m24_p08, zerov); + tmp0[6] = vec_msum(in[2], cospi_p08_p24, zerov); + tmp0[7] = vec_msum(in[3], cospi_p08_p24, zerov); + tmp0[8] = vec_msum(in[4], cospi_p08_p24, zerov); + tmp0[9] = vec_msum(in[5], cospi_p08_p24, zerov); + tmp0[10] = vec_msum(in[4], cospi_p24_m08, zerov); + tmp0[11] = vec_msum(in[5], cospi_p24_m08, zerov); + tmp0[12] = vec_msum(in[6], cospi_m24_p08, zerov); + tmp0[13] = vec_msum(in[7], cospi_m24_p08, zerov); + tmp0[14] = vec_msum(in[6], cospi_p08_p24, zerov); + tmp0[15] = vec_msum(in[7], cospi_p08_p24, zerov); + + tmp1[0] = vec_add(tmp0[0], tmp0[4]); + tmp1[1] = vec_add(tmp0[1], tmp0[5]); + tmp1[2] = vec_add(tmp0[2], tmp0[6]); + tmp1[3] = vec_add(tmp0[3], tmp0[7]); + tmp1[4] = vec_sub(tmp0[0], tmp0[4]); + tmp1[5] = vec_sub(tmp0[1], tmp0[5]); + tmp1[6] = vec_sub(tmp0[2], tmp0[6]); + tmp1[7] = vec_sub(tmp0[3], tmp0[7]); + tmp1[8] = vec_add(tmp0[8], tmp0[12]); + tmp1[9] = vec_add(tmp0[9], tmp0[13]); + tmp1[10] = vec_add(tmp0[10], tmp0[14]); + tmp1[11] = vec_add(tmp0[11], tmp0[15]); + tmp1[12] = vec_sub(tmp0[8], tmp0[12]); + tmp1[13] = vec_sub(tmp0[9], 
tmp0[13]); + tmp1[14] = vec_sub(tmp0[10], tmp0[14]); + tmp1[15] = vec_sub(tmp0[11], tmp0[15]); + + DCT_CONST_ROUND_SHIFT(tmp1[0]); + DCT_CONST_ROUND_SHIFT(tmp1[1]); + DCT_CONST_ROUND_SHIFT(tmp1[2]); + DCT_CONST_ROUND_SHIFT(tmp1[3]); + DCT_CONST_ROUND_SHIFT(tmp1[4]); + DCT_CONST_ROUND_SHIFT(tmp1[5]); + DCT_CONST_ROUND_SHIFT(tmp1[6]); + DCT_CONST_ROUND_SHIFT(tmp1[7]); + DCT_CONST_ROUND_SHIFT(tmp1[8]); + DCT_CONST_ROUND_SHIFT(tmp1[9]); + DCT_CONST_ROUND_SHIFT(tmp1[10]); + DCT_CONST_ROUND_SHIFT(tmp1[11]); + DCT_CONST_ROUND_SHIFT(tmp1[12]); + DCT_CONST_ROUND_SHIFT(tmp1[13]); + DCT_CONST_ROUND_SHIFT(tmp1[14]); + DCT_CONST_ROUND_SHIFT(tmp1[15]); + + in[0] = vec_add(tmp16_0[0], tmp16_0[2]); + in[1] = vec_add(tmp16_0[1], tmp16_0[3]); + in[2] = vec_sub(tmp16_0[0], tmp16_0[2]); + in[3] = vec_sub(tmp16_0[1], tmp16_0[3]); + in[4] = vec_packs(tmp1[0], tmp1[1]); + in[5] = vec_packs(tmp1[2], tmp1[3]); + in[6] = vec_packs(tmp1[4], tmp1[5]); + in[7] = vec_packs(tmp1[6], tmp1[7]); + in[8] = vec_add(tmp16_0[8], tmp16_0[10]); + in[9] = vec_add(tmp16_0[9], tmp16_0[11]); + in[10] = vec_sub(tmp16_0[8], tmp16_0[10]); + in[11] = vec_sub(tmp16_0[9], tmp16_0[11]); + in[12] = vec_packs(tmp1[8], tmp1[9]); + in[13] = vec_packs(tmp1[10], tmp1[11]); + in[14] = vec_packs(tmp1[12], tmp1[13]); + in[15] = vec_packs(tmp1[14], tmp1[15]); + + // stage 4 + out[0] = vec_mergeh(in[2], in[3]); + out[1] = vec_mergel(in[2], in[3]); + out[2] = vec_mergeh(in[6], in[7]); + out[3] = vec_mergel(in[6], in[7]); + out[4] = vec_mergeh(in[10], in[11]); + out[5] = vec_mergel(in[10], in[11]); + out[6] = vec_mergeh(in[14], in[15]); + out[7] = vec_mergel(in[14], in[15]); +} + +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1) { + int16x8_t tmp0[16], tmp1[16], tmp2[8]; + int32x4_t tmp3, tmp4; + int16x8_t zero16v = vec_splat_s16(0); + int32x4_t zerov = vec_splat_s32(0); + int16x8_t cospi_p16_m16 = vec_mergel(cospi16_v, cospi16m_v); + int16x8_t cospi_m16_p16 = vec_mergel(cospi16m_v, cospi16_v); + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0[0], src0[2], src0[4], src0[6], src0[8], src0[10], src0[12], + src0[14], tmp0[0], tmp0[1], tmp0[2], tmp0[3], tmp0[4], tmp0[5], + tmp0[6], tmp0[7]); + TRANSPOSE8x8(src1[0], src1[2], src1[4], src1[6], src1[8], src1[10], src1[12], + src1[14], tmp1[0], tmp1[1], tmp1[2], tmp1[3], tmp1[4], tmp1[5], + tmp1[6], tmp1[7]); + TRANSPOSE8x8(src0[1], src0[3], src0[5], src0[7], src0[9], src0[11], src0[13], + src0[15], tmp0[8], tmp0[9], tmp0[10], tmp0[11], tmp0[12], + tmp0[13], tmp0[14], tmp0[15]); + TRANSPOSE8x8(src1[1], src1[3], src1[5], src1[7], src1[9], src1[11], src1[13], + src1[15], tmp1[8], tmp1[9], tmp1[10], tmp1[11], tmp1[12], + tmp1[13], tmp1[14], tmp1[15]); + + iadst16x8_vsx(tmp0, tmp2); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[14], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[0], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[8], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[6], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[12], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[2], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[10], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[4], cospi_p16_m16); + + src0[0] = tmp0[0]; + src0[2] = vec_sub(zero16v, tmp0[8]); + src0[4] = tmp0[12]; + src0[6] = vec_sub(zero16v, tmp0[4]); + src1[8] = tmp0[5]; + src1[10] = vec_sub(zero16v, tmp0[13]); + src1[12] = tmp0[9]; + src1[14] = vec_sub(zero16v, tmp0[1]); + + iadst16x8_vsx(tmp1, tmp2); + 
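Each merged cospi_pXX_pYY vector interleaves two constants so that a single vec_msum forms a two-term dot product per 32-bit lane (lane k = a[2k]*c0 + a[2k+1]*c1); IADST_WRAPLOW then rounds and saturates the result back to 16 bits, which is what the calls below rely on. Per-lane scalar sketch:

#include <stdint.h>

static int16_t iadst_wraplow_sketch(int16_t x0, int16_t x1, int16_t c0,
                                    int16_t c1) {
  const int32_t s = x0 * c0 + x1 * c1;     /* vec_msum, zero accumulator */
  const int32_t r = (s + (1 << 13)) >> 14; /* DCT_CONST_ROUND_SHIFT */
  return (int16_t)(r > 32767 ? 32767 : (r < -32768 ? -32768 : r)); /* vec_packs */
}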
IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src0[15], cospi16m_v); + IADST_WRAPLOW(tmp2[0], tmp2[1], tmp3, tmp4, src1[1], cospi_p16_m16); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src0[9], cospi16_v); + IADST_WRAPLOW(tmp2[2], tmp2[3], tmp3, tmp4, src1[7], cospi_m16_p16); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src0[13], cospi16_v); + IADST_WRAPLOW(tmp2[4], tmp2[5], tmp3, tmp4, src1[3], cospi_m16_p16); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src0[11], cospi16m_v); + IADST_WRAPLOW(tmp2[6], tmp2[7], tmp3, tmp4, src1[5], cospi_p16_m16); + + src0[1] = tmp1[0]; + src0[3] = vec_sub(zero16v, tmp1[8]); + src0[5] = tmp1[12]; + src0[7] = vec_sub(zero16v, tmp1[4]); + src1[9] = tmp1[5]; + src1[11] = vec_sub(zero16v, tmp1[13]); + src1[13] = tmp1[9]; + src1[15] = vec_sub(zero16v, tmp1[1]); +} diff --git a/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h new file mode 100644 index 00000000000..7031742c1c9 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/inv_txfm_vsx.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ +#define VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +void vpx_round_store4x4_vsx(int16x8_t *in, int16x8_t *out, uint8_t *dest, + int stride); +void vpx_idct4_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst4_vsx(int16x8_t *in, int16x8_t *out); + +void vpx_round_store8x8_vsx(int16x8_t *in, uint8_t *dest, int stride); +void vpx_idct8_vsx(int16x8_t *in, int16x8_t *out); +void vp9_iadst8_vsx(int16x8_t *in, int16x8_t *out); + +#define LOAD_INPUT16(load, source, offset, step, in) \ + in[0] = load(offset, source); \ + in[1] = load((step) + (offset), source); \ + in[2] = load(2 * (step) + (offset), source); \ + in[3] = load(3 * (step) + (offset), source); \ + in[4] = load(4 * (step) + (offset), source); \ + in[5] = load(5 * (step) + (offset), source); \ + in[6] = load(6 * (step) + (offset), source); \ + in[7] = load(7 * (step) + (offset), source); \ + in[8] = load(8 * (step) + (offset), source); \ + in[9] = load(9 * (step) + (offset), source); \ + in[10] = load(10 * (step) + (offset), source); \ + in[11] = load(11 * (step) + (offset), source); \ + in[12] = load(12 * (step) + (offset), source); \ + in[13] = load(13 * (step) + (offset), source); \ + in[14] = load(14 * (step) + (offset), source); \ + in[15] = load(15 * (step) + (offset), source); + +void vpx_round_store16x16_vsx(int16x8_t *src0, int16x8_t *src1, uint8_t *dest, + int stride); +void vpx_idct16_vsx(int16x8_t *src0, int16x8_t *src1); +void vpx_iadst16_vsx(int16x8_t *src0, int16x8_t *src1); + +#endif // VPX_VPX_DSP_PPC_INV_TXFM_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libs/libvpx/vpx_dsp/ppc/quantize_vsx.c new file mode 100644 index 00000000000..d85e63bd148 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/quantize_vsx.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+  const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+  return vec_xor(vec_add(a, mask), mask);
+}
+
+// Sets each 32-bit element to 1 when the corresponding element in a is
+// negative.
+static INLINE int32x4_t vec_is_neg(int32x4_t a) {
+  return vec_sr(a, vec_shift_sign_s32);
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate
+// 32-bit integers, and return the high 16 bits of the intermediate integers:
+// (a * b) >> 16.
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds does ((A * B) >> 15) + C; we need >> 16, so we perform an extra
+  // right shift.
+  return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16);
+}
+
+// Quantization function used for 4x4, 8x8 and 16x16 blocks.
+static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
+                                       int16x8_t round, int16x8_t quant,
+                                       int16x8_t quant_shift, bool16x8_t mask) {
+  const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+  int16x8_t qcoeff = vec_mulhi(rounded, quant);
+  qcoeff = vec_add(qcoeff, rounded);
+  qcoeff = vec_mulhi(qcoeff, quant_shift);
+  qcoeff = vec_sign(qcoeff, coeff);
+  return vec_and(qcoeff, mask);
+}
+
+// Quantization function used for 32x32 blocks.
+static INLINE int16x8_t quantize_coeff_32(int16x8_t coeff, int16x8_t coeff_abs,
+                                          int16x8_t round, int16x8_t quant,
+                                          int16x8_t quant_shift,
+                                          bool16x8_t mask) {
+  const int16x8_t rounded = vec_vaddshs(coeff_abs, round);
+  int16x8_t qcoeff = vec_mulhi(rounded, quant);
+  qcoeff = vec_add(qcoeff, rounded);
+  // 32x32 blocks require an extra multiplication by 2; this compensates for
+  // the extra right shift added in vec_mulhi, so vec_madds can be used
+  // directly instead of vec_mulhi:
+  //   (((a * b) >> 15) >> 1) << 1 == (a * b) >> 15
+  qcoeff = vec_madds(qcoeff, quant_shift, vec_zeros_s16);
+  qcoeff = vec_sign(qcoeff, coeff);
+  return vec_and(qcoeff, mask);
+}
+
+// Dequantization function used for 32x32 blocks. Quantized coefficients of
+// 32x32 blocks are twice as big as for other block sizes. As such, using
+// vec_mladd results in overflow.
+static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
+                                            int16x8_t dequant) {
+  int32x4_t dqcoeffe = vec_mule(qcoeff, dequant);
+  int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant);
+  // Add 1 if negative to round towards zero, because the C code uses division.
+  dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe));
+  dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo));
+  dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32);
+  dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32);
+  return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
+}
+
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+                                          const int16_t *iscan_ptr, int index) {
+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+  scan = vec_sub(scan, mask);
+  return vec_andc(scan, zero_coeff);
+}
+
+// Compare packed 16-bit integers across a, and return the maximum value in
+// every element. Returns a vector containing the biggest value across vector a.
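Taken together, the quantize helpers above mirror the scalar quantizer: coefficients inside the zbin dead zone are zeroed, the rest are rounded, scaled twice keeping only the high product bits, and re-signed; 32x32 dequantization halves the reconstruction, truncating toward zero like the C division it replaces. Per-coefficient sketch:

#include <stdint.h>
#include <stdlib.h>

static int16_t quantize_coeff_sketch(int16_t coeff, int16_t zbin,
                                     int16_t round, int16_t quant,
                                     int16_t quant_shift) {
  const int32_t abs_coeff = abs(coeff);
  int32_t tmp;
  if (abs_coeff < zbin) return 0;           /* the bool16x8_t mask */
  tmp = abs_coeff + round;                  /* vec_vaddshs (saturating add) */
  tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16; /* two vec_mulhi */
  return (int16_t)(coeff < 0 ? -tmp : tmp); /* vec_sign */
}

static int16_t dequantize_32_sketch(int16_t qcoeff, int16_t dequant) {
  return (int16_t)((qcoeff * dequant) / 2); /* C division truncates to zero */
}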
+static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + // First set of 8 coeff starts with DC + 7 AC + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + qcoeff0 = + quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + quant_shift = vec_splat(quant_shift, 1); + qcoeff1 = + quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + + if (n_coeffs > 16) { + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2; + bool16x8_t zero_mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = + vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + 
off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} + +void vpx_quantize_b_32x32_vsx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, eob; + bool16x8_t zero_mask0, zero_mask1; + + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + + int16x8_t coeff0_abs = vec_abs(coeff0); + int16x8_t coeff1_abs = vec_abs(coeff1); + + (void)scan_ptr; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + // 32x32 quantization requires that zbin and round be divided by 2 + zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zbin = vec_splat(zbin, 1); // remove DC from zbin + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + round = vec_splat(round, 1); // remove DC from round + quant = vec_splat(quant, 1); // remove DC from quant + quant_shift = vec_splat(quant_shift, 1); // remove DC from quant_shift + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), 0, dqcoeff_ptr); + dequant = vec_splat(dequant, 1); // remove DC from dequant + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr); + + eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0), + nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16)); + + do { + int16x8_t coeff2, coeff2_abs, qcoeff2, eob2; + bool16x8_t zero_mask2; + + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + + coeff0_abs = vec_abs(coeff0); + coeff1_abs = vec_abs(coeff1); + coeff2_abs = vec_abs(coeff2); + + zero_mask0 = vec_cmpge(coeff0_abs, zbin); + zero_mask1 = vec_cmpge(coeff1_abs, zbin); + zero_mask2 = vec_cmpge(coeff2_abs, zbin); + + qcoeff0 = quantize_coeff_32(coeff0, coeff0_abs, round, quant, quant_shift, + zero_mask0); + qcoeff1 = quantize_coeff_32(coeff1, coeff1_abs, round, quant, quant_shift, + zero_mask1); + qcoeff2 = quantize_coeff_32(coeff2, coeff2_abs, round, quant, quant_shift, + zero_mask2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + vec_vsx_st(dequantize_coeff_32(qcoeff0, dequant), off0, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr); + vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff0, 
zero_mask0, iscan_ptr, off0)); + eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1), + nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2)); + eob = vec_max(eob, eob2); + + // 24 int16_t is 48 bytes + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} diff --git a/libs/libvpx/vpx_dsp/ppc/sad_vsx.c b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c index bb49addae17..a08ae12413b 100644 --- a/libs/libvpx/vpx_dsp/ppc/sad_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -17,71 +17,75 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#define PROCESS16(offset) \ - v_a = vec_vsx_ld(offset, a); \ - v_b = vec_vsx_ld(offset, b); \ - v_ah = unpack_to_s16_h(v_a); \ - v_al = unpack_to_s16_l(v_a); \ - v_bh = unpack_to_s16_h(v_b); \ - v_bl = unpack_to_s16_l(v_b); \ - v_subh = vec_sub(v_ah, v_bh); \ - v_subl = vec_sub(v_al, v_bl); \ - v_absh = vec_abs(v_subh); \ - v_absl = vec_abs(v_subl); \ - v_sad = vec_sum4s(v_absh, v_sad); \ - v_sad = vec_sum4s(v_absl, v_sad); +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_abs = vec_absd(v_a, v_b); \ + v_sad = vec_sum4s(v_abs, v_sad); + +#define SAD8(height) \ + unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0) \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[1] + v_sad[0]; \ + } #define SAD16(height) \ unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } #define SAD32(height) \ unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ PROCESS16(16); \ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } #define SAD64(height) \ unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ PROCESS16(16); \ PROCESS16(32); \ @@ -89,12 +93,15 @@ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + 
y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } +SAD8(4); +SAD8(8); +SAD8(16); SAD16(8); SAD16(16); SAD16(32); @@ -108,7 +115,7 @@ SAD64(64); unsigned int vpx_sad16x##height##_avg_vsx( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * (height)]); \ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ ref_stride); \ \ @@ -119,7 +126,7 @@ SAD64(64); unsigned int vpx_sad32x##height##_avg_vsx( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * (height)]); \ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ ref_stride); \ \ @@ -130,7 +137,7 @@ SAD64(64); unsigned int vpx_sad64x##height##_avg_vsx( \ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ const uint8_t *second_pred) { \ - DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * (height)]); \ vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ ref_stride); \ return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ diff --git a/libs/libvpx/vpx_dsp/ppc/subtract_vsx.c b/libs/libvpx/vpx_dsp/ppc/subtract_vsx.c new file mode 100644 index 00000000000..76ad302da60 --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/subtract_vsx.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static VPX_FORCE_INLINE void subtract_block4x4( + int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { + int16_t *diff1 = diff + 2 * diff_stride; + const uint8_t *src1 = src + 2 * src_stride; + const uint8_t *pred1 = pred + 2 * pred_stride; + + const int16x8_t d0 = vec_vsx_ld(0, diff); + const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride); + const int16x8_t d2 = vec_vsx_ld(0, diff1); + const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride); + + const uint8x16_t s0 = read4x2(src, (int)src_stride); + const uint8x16_t p0 = read4x2(pred, (int)pred_stride); + const uint8x16_t s1 = read4x2(src1, (int)src_stride); + const uint8x16_t p1 = read4x2(pred1, (int)pred_stride); + + const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + + vec_vsx_st(xxpermdi(da, d0, 1), 0, diff); + vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride); + vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1); + vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride); +} + +void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r = rows, c; + + switch (cols) { + case 64: + case 32: + do { + for (c = 0; c < cols; c += 32) { + const uint8x16_t s0 = vec_vsx_ld(0, src + c); + const uint8x16_t s1 = vec_vsx_ld(16, src + c); + const uint8x16_t p0 = vec_vsx_ld(0, pred + c); + const uint8x16_t p1 = vec_vsx_ld(16, pred + c); + const int16x8_t d0l = + vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = + vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t d1l = + vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1)); + const int16x8_t d1h = + vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + vec_vsx_st(d0h, 0, diff + c); + vec_vsx_st(d0l, 16, diff + c); + vec_vsx_st(d1h, 0, diff + c + 16); + vec_vsx_st(d1l, 16, diff + c + 16); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 16: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + vec_vsx_st(d0l, 16, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 8: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 4: + subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride); + if (r > 4) { + diff += 4 * diff_stride; + pred += 4 * pred_stride; + src += 4 * src_stride; + + subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride); + } + break; + default: assert(0); // unreachable + } +}
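The next diff adds transpose_8x8 to transpose_vsx.h, building an 8x8 16-bit transpose from three rounds of vec_mergeh/vec_mergel interleaves. For clarity, a scalar sketch of the contract those merges must satisfy (the reference function and its name are ours, for illustration only):

#include <stdint.h>

/* Scalar reference: out[i][j] = in[j][i] for an 8x8 block of int16_t. */
static void transpose_8x8_ref(const int16_t in[8][8], int16_t out[8][8]) {
  int i, j;
  for (i = 0; i < 8; i++)
    for (j = 0; j < 8; j++) out[i][j] = in[j][i];
}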
diff --git a/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h index f02556d5223..4883b734ad2 100644 --- a/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h +++ b/libs/libvpx/vpx_dsp/ppc/transpose_vsx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PPC_TRANSPOSE_VSX_H_ -#define VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#ifndef VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ +#define VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ #include "./vpx_config.h" #include "vpx_dsp/ppc/types_vsx.h" @@ -98,4 +98,36 @@ static INLINE void vpx_transpose_s16_8x8(int16x8_t v[8]) { // v[7]: 07 17 27 37 47 57 67 77 } -#endif // VPX_DSP_PPC_TRANSPOSE_VSX_H_ +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Stage 1 + const int16x8_t s1_0 = vec_mergeh(a[0], a[4]); + const int16x8_t s1_1 = vec_mergel(a[0], a[4]); + const int16x8_t s1_2 = vec_mergeh(a[1], a[5]); + const int16x8_t s1_3 = vec_mergel(a[1], a[5]); + const int16x8_t s1_4 = vec_mergeh(a[2], a[6]); + const int16x8_t s1_5 = vec_mergel(a[2], a[6]); + const int16x8_t s1_6 = vec_mergeh(a[3], a[7]); + const int16x8_t s1_7 = vec_mergel(a[3], a[7]); + + // Stage 2 + const int16x8_t s2_0 = vec_mergeh(s1_0, s1_4); + const int16x8_t s2_1 = vec_mergel(s1_0, s1_4); + const int16x8_t s2_2 = vec_mergeh(s1_1, s1_5); + const int16x8_t s2_3 = vec_mergel(s1_1, s1_5); + const int16x8_t s2_4 = vec_mergeh(s1_2, s1_6); + const int16x8_t s2_5 = vec_mergel(s1_2, s1_6); + const int16x8_t s2_6 = vec_mergeh(s1_3, s1_7); + const int16x8_t s2_7 = vec_mergel(s1_3, s1_7); + + // Stage 3 + b[0] = vec_mergeh(s2_0, s2_4); + b[1] = vec_mergel(s2_0, s2_4); + b[2] = vec_mergeh(s2_1, s2_5); + b[3] = vec_mergel(s2_1, s2_5); + b[4] = vec_mergeh(s2_2, s2_6); + b[5] = vec_mergel(s2_2, s2_6); + b[6] = vec_mergeh(s2_3, s2_7); + b[7] = vec_mergel(s2_3, s2_7); +} + +#endif // VPX_VPX_DSP_PPC_TRANSPOSE_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h b/libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h new file mode 100644 index 00000000000..2907a1fe40f --- /dev/null +++ b/libs/libvpx/vpx_dsp/ppc/txfm_common_vsx.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ +#define VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ + +#include "vpx_dsp/ppc/types_vsx.h" + +static const int32x4_t vec_dct_const_rounding = { 8192, 8192, 8192, 8192 }; + +static const uint32x4_t vec_dct_const_bits = { 14, 14, 14, 14 }; + +static const uint16x8_t vec_dct_scale_log2 = { 2, 2, 2, 2, 2, 2, 2, 2 }; + +static const int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static const int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static const int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static const int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static const int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static const int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static const int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static const int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static const int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static const int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static const int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static const int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static const int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static const int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static const int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static const int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static const int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static const int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static const int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static const int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static const int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, + 9760, 9760, 9760, 9760 }; +static const int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, + 9102, 9102, 9102, 9102 }; +static const int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static const int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, + 8423, 8423, 8423, 8423 }; +static const int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, + 7723, 7723, 7723, 7723 }; +static const int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, + 7005, 7005, 7005, 7005 }; +static const int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, + 6270, 6270, 6270, 6270 }; +static const int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, + 5520, 5520, 5520, 5520 }; +static const int16x8_t cospi26_v = { 4756, 4756, 4756, 4756, + 4756, 4756, 4756, 4756 }; +static const int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, + 3981, 3981, 3981, 3981 }; +static const int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, + 3196, 3196, 3196, 3196 }; +static const int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, + 2404, 2404, 2404, 2404 }; +static const int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, + 1606, 1606, 1606, 1606 }; +static const int16x8_t cospi31_v = { 804, 804, 
804, 804, 804, 804, 804, 804 }; + +#endif // VPX_VPX_DSP_PPC_TXFM_COMMON_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/types_vsx.h b/libs/libvpx/vpx_dsp/ppc/types_vsx.h index f611d02d2d5..b891169245c 100644 --- a/libs/libvpx/vpx_dsp/ppc/types_vsx.h +++ b/libs/libvpx/vpx_dsp/ppc/types_vsx.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PPC_TYPES_VSX_H_ -#define VPX_DSP_PPC_TYPES_VSX_H_ +#ifndef VPX_VPX_DSP_PPC_TYPES_VSX_H_ +#define VPX_VPX_DSP_PPC_TYPES_VSX_H_ #include <altivec.h> @@ -19,8 +19,11 @@ typedef vector signed short int16x8_t; typedef vector unsigned short uint16x8_t; typedef vector signed int int32x4_t; typedef vector unsigned int uint32x4_t; +typedef vector bool char bool8x16_t; +typedef vector bool short bool16x8_t; +typedef vector bool int bool32x4_t; -#ifdef __clang__ +#if defined(__clang__) && __clang_major__ < 6 static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; @@ -61,8 +64,45 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, #define unpack_to_s16_l(v) \ (int16x8_t) vec_mergel((uint8x16_t)v, vec_splat_u8(0)) #ifndef xxpermdi -#define xxpermdi(a, b, c) vec_xxpermdi(b, a, ((c >> 1) | (c & 1) << 1) ^ 3) +#define xxpermdi(a, b, c) vec_xxpermdi(b, a, (((c) >> 1) | ((c)&1) << 1) ^ 3) #endif #endif -#endif // VPX_DSP_PPC_TYPES_VSX_H_ +static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + +#ifndef __POWER9_VECTOR__ +#define vec_absd(a, b) vec_sub(vec_max(a, b), vec_min(a, b)) +#endif + +static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const int16x8_t vec_twos_s16 = { 2, 2, 2, 2, 2, 2, 2, 2 }; +static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 }; +static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 }; +static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 }; +static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 }; +static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07 }; +static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03 }; +static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, + 0x0E, 0x0F, 0x00, 0x01 }; + +static const uint8x16_t vec_perm_odd_even_pack = { 0x00, 0x01, 0x10, 0x11, + 0x04, 0x05, 0x14, 0x15, + 0x08, 0x09, 0x18, 0x19, + 0x0C, 0x0D, 0x1C, 0x1D }; + +#endif // VPX_VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/libs/libvpx/vpx_dsp/ppc/variance_vsx.c b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c index 1efe2f00569..be9614a3584 100644 --- a/libs/libvpx/vpx_dsp/ppc/variance_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/variance_vsx.c @@ -10,24 +10,20 @@ #include <assert.h> +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/ppc/types_vsx.h" -static inline uint8x16_t read4x2(const uint8_t *a, int stride) { - const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); - const uint32x4_t a1 = 
(uint32x4_t)vec_vsx_ld(0, a + stride); - - return (uint8x16_t)vec_mergeh(a0, a1); -} - -uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { +uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int distortion; - const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride)); - const int16x8_t a1 = unpack_to_s16_h(read4x2(a + a_stride * 2, a_stride)); - const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride)); - const int16x8_t b1 = unpack_to_s16_h(read4x2(b + b_stride * 2, b_stride)); + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t a1 = + unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t b1 = + unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride)); const int16x8_t d0 = vec_sub(a0, b0); const int16x8_t d1 = vec_sub(a1, b1); const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0))); @@ -39,12 +35,12 @@ uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, } // TODO(lu_zero): Unroll -uint32_t vpx_get_mb_ss_vsx(const int16_t *a) { +uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) { unsigned int i, sum = 0; int32x4_t s = vec_splat_s32(0); for (i = 0; i < 256; i += 8) { - const int16x8_t v = vec_vsx_ld(0, a + i); + const int16x8_t v = vec_vsx_ld(0, src_ptr + i); s = vec_msum(v, v, s); } @@ -101,3 +97,175 @@ void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width, } } } + +static INLINE void variance_inner_32(const uint8_t *src_ptr, + const uint8_t *ref_ptr, + int32x4_t *sum_squared, int32x4_t *sum) { + int32x4_t s = *sum; + int32x4_t ss = *sum_squared; + + const uint8x16_t va0 = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr); + const uint8x16_t va1 = vec_vsx_ld(16, src_ptr); + const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr); + + const int16x8_t a0 = unpack_to_s16_h(va0); + const int16x8_t b0 = unpack_to_s16_h(vb0); + const int16x8_t a1 = unpack_to_s16_l(va0); + const int16x8_t b1 = unpack_to_s16_l(vb0); + const int16x8_t a2 = unpack_to_s16_h(va1); + const int16x8_t b2 = unpack_to_s16_h(vb1); + const int16x8_t a3 = unpack_to_s16_l(va1); + const int16x8_t b3 = unpack_to_s16_l(vb1); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + const int16x8_t d2 = vec_sub(a2, b2); + const int16x8_t d3 = vec_sub(a3, b3); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + s = vec_sum4s(d2, s); + ss = vec_msum(d2, d2, ss); + s = vec_sum4s(d3, s); + ss = vec_msum(d3, d3, ss); + *sum = s; + *sum_squared = ss; +} + +static INLINE void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { + int i; + + int32x4_t s = vec_splat_s32(0); + int32x4_t ss = vec_splat_s32(0); + + switch (w) { + case 4: + for (i = 0; i < h / 2; ++i) { + const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride)); + const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride)); + const int16x8_t d = vec_sub(a0, b0); + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride * 2; + ref_ptr += ref_stride * 2; + } + break; + case 8: + for (i = 0; i < h; ++i) { + const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr)); + const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr)); + const int16x8_t d = 
vec_sub(a0, b0); + + s = vec_sum4s(d, s); + ss = vec_msum(d, d, ss); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 16: + for (i = 0; i < h; ++i) { + const uint8x16_t va = vec_vsx_ld(0, src_ptr); + const uint8x16_t vb = vec_vsx_ld(0, ref_ptr); + const int16x8_t a0 = unpack_to_s16_h(va); + const int16x8_t b0 = unpack_to_s16_h(vb); + const int16x8_t a1 = unpack_to_s16_l(va); + const int16x8_t b1 = unpack_to_s16_l(vb); + const int16x8_t d0 = vec_sub(a0, b0); + const int16x8_t d1 = vec_sub(a1, b1); + + s = vec_sum4s(d0, s); + ss = vec_msum(d0, d0, ss); + s = vec_sum4s(d1, s); + ss = vec_msum(d1, d1, ss); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 32: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + case 64: + for (i = 0; i < h; ++i) { + variance_inner_32(src_ptr, ref_ptr, &ss, &s); + variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } + break; + } + + s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3); + + vec_ste(s, 0, sum); + + ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3); + + vec_ste((uint32x4_t)ss, 0, sse); +} + +/* Identical to the variance call except it takes an additional parameter, sum, + * and returns that value using pass-by-reference instead of returning + * sse - sum^2 / (w * h) + */ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ + }
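+/* Worked example (ours): for an 8x8 block whose 64 pixel differences sum to
+ * 64 with sse = 1088, the VAR macro below yields
+ * 1088 - (64 * 64) / (8 * 8) = 1024. */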
+/* Identical to the variance call except it does not calculate the + * sse - sum^2 / (w * h) and returns sse in addition to modifying the passed-in + * variable. + */ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ + } + +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H))); \ + } + +#define VARIANCES(W, H) VAR(W, H) + +VARIANCES(64, 64) +VARIANCES(64, 32) +VARIANCES(32, 64) +VARIANCES(32, 32) +VARIANCES(32, 16) +VARIANCES(16, 32) +VARIANCES(16, 16) +VARIANCES(16, 8) +VARIANCES(8, 16) +VARIANCES(8, 8) +VARIANCES(8, 4) +VARIANCES(4, 8) +VARIANCES(4, 4) + +GET_VAR(16, 16) +GET_VAR(8, 8) + +MSE(16, 16) +MSE(16, 8) +MSE(8, 16) +MSE(8, 8) diff --git a/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c index 5c3ba4576fa..2dc66055cc9 100644 --- a/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c +++ b/libs/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -9,13 +9,16 @@ */ #include <assert.h> #include <string.h> + #include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_filter.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" +#include "vpx_dsp/vpx_filter.h" // TODO(lu_zero): unroll -static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -25,8 +28,9 @@ static inline void copy_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i; for (i = h; i--;) { @@ -37,8 +41,9 @@ static inline void copy_w32(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int32_t h) { +static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + int32_t h) { int i;
for (i = h; i--;) { @@ -155,8 +163,8 @@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve_line(uint8_t *dst, const int16x8_t s, - const int16x8_t f) { +static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s, + const int16x8_t f) { const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0)); const int32x4_t bias = vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1)); @@ -166,8 +174,9 @@ static inline void convolve_line(uint8_t *dst, const int16x8_t s, vec_ste(v, 0, dst); } -static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, - const int16_t *const x_filter) { +static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst, + const uint8_t *const src_x, + const int16_t *const x_filter) { const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x)); const int16x8_t f = vec_vsx_ld(0, x_filter); @@ -175,10 +184,12 @@ static inline void convolve_line_h(uint8_t *dst, const uint8_t *const src_x, } // TODO(lu_zero): Implement 8x8 and bigger block special cases -static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *x_filters, + int x0_q4, int x_step_q4, int w, + int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; @@ -194,10 +205,10 @@ static inline void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_avg_horiz( + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; @@ -230,9 +241,10 @@ static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b, return (uint8x16_t)vec_mergeh(abcd, efgh); } -static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, - ptrdiff_t src_stride, - const int16_t *const y_filter) { +static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst, + const uint8_t *const src_y, + ptrdiff_t src_stride, + const int16_t *const y_filter) { uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride); uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride); uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride); @@ -250,10 +262,12 @@ static inline void convolve_line_v(uint8_t *dst, const uint8_t *const src_y, convolve_line(dst, unpack_to_s16_h(s), f); } -static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *y_filters, + int y0_q4, int y_step_q4, int w, + int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -270,10 +284,10 @@ static inline void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static VPX_FORCE_INLINE void convolve_avg_vert( 
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { int x, y; src -= src_stride * (SUBPEL_TAPS / 2 - 1); @@ -291,11 +305,11 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { +static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *const filter, + int x0_q4, int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. diff --git a/libs/libvpx/vpx_dsp/prob.h b/libs/libvpx/vpx_dsp/prob.h index f1cc0eaa105..7a71c0041f2 100644 --- a/libs/libvpx/vpx_dsp/prob.h +++ b/libs/libvpx/vpx_dsp/prob.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_PROB_H_ -#define VPX_DSP_PROB_H_ +#ifndef VPX_VPX_DSP_PROB_H_ +#define VPX_VPX_DSP_PROB_H_ #include <assert.h> @@ -32,7 +32,7 @@ typedef int8_t vpx_tree_index; #define TREE_SIZE(leaf_count) (2 * (leaf_count)-2) -#define vpx_complement(x) (255 - x) +#define vpx_complement(x) (255 - (x)) #define MODE_MV_COUNT_SAT 20 @@ -103,4 +103,4 @@ DECLARE_ALIGNED(16, extern const uint8_t, vpx_norm[256]); } // extern "C" #endif -#endif // VPX_DSP_PROB_H_ +#endif // VPX_VPX_DSP_PROB_H_ diff --git a/libs/libvpx/vpx_dsp/psnr.c b/libs/libvpx/vpx_dsp/psnr.c index 47afd4388ab..48bac04508f 100644 --- a/libs/libvpx/vpx_dsp/psnr.c +++ b/libs/libvpx/vpx_dsp/psnr.c @@ -1,12 +1,12 @@ /* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. -* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ #include <assert.h> #include <math.h> @@ -24,8 +24,8 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { } /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() -* and highbd_8_variance(). It should not. -*/ + * and highbd_8_variance(). It should not. + */ static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) {
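The psnr.c hunks above only reflow comments; vpx_sse_to_psnr itself implements the textbook SSE-to-PSNR conversion. A minimal sketch of that relationship (code and name ours; the 100 dB cap mirrors the library's MAX_PSNR behaviour and is an assumption here):

#include <math.h>

/* PSNR from SSE (sketch): samples = pixel count, peak = max pixel value. */
static double sse_to_psnr_sketch(double samples, double peak, double sse) {
  if (sse <= 0.0) return 100.0; /* assumed cap for a perfect match */
  return 10.0 * log10(samples * peak * peak / sse);
}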
diff --git a/libs/libvpx/vpx_dsp/psnr.h b/libs/libvpx/vpx_dsp/psnr.h index f321131d0b9..a5563557e97 100644 --- a/libs/libvpx/vpx_dsp/psnr.h +++ b/libs/libvpx/vpx_dsp/psnr.h @@ -1,15 +1,15 @@ /* -* Copyright (c) 2016 The WebM project authors. All Rights Reserved. -* -* Use of this source code is governed by a BSD-style license -* that can be found in the LICENSE file in the root of the source -* tree. An additional intellectual property rights grant can be found -* in the file PATENTS. All contributing project authors may -* be found in the AUTHORS file in the root of the source tree. -*/ - -#ifndef VPX_DSP_PSNR_H_ -#define VPX_DSP_PSNR_H_ + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_PSNR_H_ +#define VPX_VPX_DSP_PSNR_H_ #include "vpx_scale/yv12config.h" @@ -28,13 +28,13 @@ typedef struct { // TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t /*!\brief Converts SSE to PSNR -* -* Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). -* -* \param[in] samples Number of samples -* \param[in] peak Max sample value -* \param[in] sse Sum of squared errors -*/ + * + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR). + * + * \param[in] samples Number of samples + * \param[in] peak Max sample value + * \param[in] sse Sum of squared errors + */ double vpx_sse_to_psnr(double samples, double peak, double sse); int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b); #if CONFIG_VP9_HIGHBITDEPTH @@ -54,4 +54,4 @@ double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source, #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_PSNR_H_ +#endif // VPX_VPX_DSP_PSNR_H_ diff --git a/libs/libvpx/vpx_dsp/psnrhvs.c b/libs/libvpx/vpx_dsp/psnrhvs.c index b3910152c47..d7ec1a429ac 100644 --- a/libs/libvpx/vpx_dsp/psnrhvs.c +++ b/libs/libvpx/vpx_dsp/psnrhvs.c @@ -126,8 +126,10 @@ static double calc_psnrhvs(const unsigned char *src, int _systride, const uint8_t *_dst8 = dst; const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src); const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst); - int16_t dct_s[8 * 8], dct_d[8 * 8]; - tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8]; + DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]); + DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]); + DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]); double mask[8][8]; int pixels; int x; @@ -142,7 +144,7 @@ been normalized and then squared." Their CSF matrix (from PSNR-HVS) was also constructed from the JPEG matrices. I can not find any obvious scheme of normalizing to produce their table, but if I multiply their - CSF by 0.38857 and square the result I get their masking table. + CSF by 0.3885746225901003 and square the result I get their masking table. I have no idea where this constant comes from, but deviating from it too greatly hurts MOS agreement. @@ -150,11 +152,15 @@ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking of DCT basis functions", CD-ROM Proceedings of the Third International Workshop on Video Processing and Quality Metrics for Consumer - Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/ + Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p. + + Suggested in aomedia issue #2363: + 0.3885746225901003 is the reciprocal of the maximum coefficient (2.573509) + of the old JPEG-based matrix from the paper. Since you are not using that, + divide by the actual maximum coefficient. */ for (x = 0; x < 8; x++) for (y = 0; y < 8; y++) - mask[x][y] = - (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003); + mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]); for (y = 0; y < _h - 7; y += _step) { for (x = 0; x < _w - 7; x += _step) { int i;
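A quick numeric check of the masking change above (arithmetic ours): 1 / 2.573509 ≈ 0.3885746225901003, so for any CSF entry c the old weight (c * 0.3885746225901003)^2 equals (c / 2.573509)^2 exactly; the patch simply replaces the hard-coded 2.573509, the old matrix's maximum, with _csf[1][0], the maximum coefficient of the matrix actually in use.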
diff --git a/libs/libvpx/vpx_dsp/quantize.c b/libs/libvpx/vpx_dsp/quantize.c index e37ca92ad45..0e6a0b83fac 100644 --- a/libs/libvpx/vpx_dsp/quantize.c +++ b/libs/libvpx/vpx_dsp/quantize.c @@ -12,12 +12,13 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" +#include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int rc = 0; const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); @@ -31,7 +32,7 @@ void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 16; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant; if (tmp) eob = 0; } *eob_ptr = eob + 1; @@ -41,7 +42,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr) { int eob = -1; @@ -55,7 +56,7 @@ void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int64_t tmp = abs_coeff + round_ptr[0]; const int abs_qcoeff = (int)((tmp * quant) >> 16); qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant; if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -65,7 +66,7 @@ void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; const int rc = 0; const int coeff = coeff_ptr[rc]; @@ -81,7 +82,7 @@ INT16_MIN, INT16_MAX); tmp = (tmp * quant) >> 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2; if (tmp) eob = 0; } *eob_ptr = eob + 1; @@ -92,8 +93,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, - uint16_t *eob_ptr) { + const int16_t dequant, uint16_t *eob_ptr) { const int n_coeffs = 1024; int eob = -1; @@ -107,7 +107,7 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], 1); const int abs_qcoeff = (int)((tmp * quant) >> 15); qcoeff_ptr[0]
= (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / 2; + dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2; if (abs_qcoeff) eob = 0; } *eob_ptr = eob + 1; @@ -260,7 +260,15 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, 15; qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; +#if (ARCH_X86 || ARCH_X86_64) && !CONFIG_VP9_HIGHBITDEPTH + // When tran_low_t is only 16 bits dqcoeff can outrange it. Rather than + // truncating with a cast, saturate the value. This is easier to implement + // on x86 and preserves the sign of the value. + dqcoeff_ptr[rc] = + clamp(qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2, INT16_MIN, INT16_MAX); +#else dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; +#endif // ARCH_X86 && CONFIG_VP9_HIGHBITDEPTH if (tmp) eob = idx_arr[i]; } diff --git a/libs/libvpx/vpx_dsp/quantize.h b/libs/libvpx/vpx_dsp/quantize.h index e1328454633..7cac140e9de 100644 --- a/libs/libvpx/vpx_dsp/quantize.h +++ b/libs/libvpx/vpx_dsp/quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_QUANTIZE_H_ -#define VPX_DSP_QUANTIZE_H_ +#ifndef VPX_VPX_DSP_QUANTIZE_H_ +#define VPX_VPX_DSP_QUANTIZE_H_ #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -19,30 +19,29 @@ extern "C" { #endif void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block, const int16_t *round_ptr, - const int16_t quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, + const int16_t quant, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t dequant, uint16_t *eob_ptr); void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, - const int16_t *round_ptr, - const int16_t quant_ptr, + const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t dequant_ptr, uint16_t *eob_ptr); + const int16_t dequant, uint16_t *eob_ptr); #endif #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_QUANTIZE_H_ +#endif // VPX_VPX_DSP_QUANTIZE_H_ diff --git a/libs/libvpx/vpx_dsp/sad.c b/libs/libvpx/vpx_dsp/sad.c index 18b6dc6e09f..873ddca0938 100644 --- a/libs/libvpx/vpx_dsp/sad.c +++ b/libs/libvpx/vpx_dsp/sad.c @@ -17,54 +17,55 @@ #include "vpx_ports/mem.h" /* Sum the difference between every corresponding element of the buffers. 
*/ -static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int width, int height) { +static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int width, int height) { int y, x; unsigned int sad = 0; for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src_ptr[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return sad; } -#define sadMxN(m, n) \ - unsigned int vpx_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride) { \ - return sad(src, src_stride, ref, ref_stride, m, n); \ - } \ - unsigned int vpx_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ - vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return sad(src, src_stride, comp_pred, m, m, n); \ +#define sadMxN(m, n) \ + unsigned int vpx_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride) { \ + return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + } \ + unsigned int vpx_sad##m##x##n##_avg_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ + return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } // depending on call sites, pass **ref_array to avoid & in subsequent call and // de-dup with 4D below. -#define sadMxNxK(m, n, k) \ - void vpx_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref_array, int ref_stride, \ - uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, &ref_array[i], ref_stride); \ +#define sadMxNxK(m, n, k) \ + void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \ } // This appears to be equivalent to the above when k == 4 and refs is const -#define sadMxNx4D(m, n) \ - void vpx_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) \ - sad_array[i] = \ - vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \ +#define sadMxNx4D(m, n) \ + void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ } /* clang-format off */ @@ -133,59 +134,61 @@ sadMxNx4D(4, 4) #if CONFIG_VP9_HIGHBITDEPTH static INLINE - unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8, - int b_stride, int width, int height) { + unsigned int highbd_sad(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int width, + int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); - const uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const 
uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } -static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, - const uint16_t *b, int b_stride, +static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, + const uint16_t *ref_ptr, int ref_stride, int width, int height) { int y, x; unsigned int sad = 0; - const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) sad += abs(a[x] - b[x]); + for (x = 0; x < width; x++) sad += abs(src[x] - ref_ptr[x]); - a += a_stride; - b += b_stride; + src += src_stride; + ref_ptr += ref_stride; } return sad; } #define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, \ - int ref_stride) { \ - return highbd_sad(src, src_stride, ref, ref_stride, m, n); \ + unsigned int vpx_highbd_sad##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return highbd_sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ } \ unsigned int vpx_highbd_sad##m##x##n##_avg_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ - const uint8_t *second_pred) { \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, const uint8_t *second_pred) { \ DECLARE_ALIGNED(16, uint16_t, comp_pred[m * n]); \ - vpx_highbd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride); \ - return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ + vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ + n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ + return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/libs/libvpx/vpx_dsp/skin_detection.h b/libs/libvpx/vpx_dsp/skin_detection.h index a2e99baf7e7..91640c33d5a 100644 --- a/libs/libvpx/vpx_dsp/skin_detection.h +++ b/libs/libvpx/vpx_dsp/skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_SKIN_DETECTION_H_ -#define VPX_DSP_SKIN_DETECTION_H_ +#ifndef VPX_VPX_DSP_SKIN_DETECTION_H_ +#define VPX_VPX_DSP_SKIN_DETECTION_H_ #ifdef __cplusplus extern "C" { @@ -21,4 +21,4 @@ int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); } // extern "C" #endif -#endif // VPX_DSP_SKIN_DETECTION_H_ +#endif // VPX_VPX_DSP_SKIN_DETECTION_H_ diff --git a/libs/libvpx/vpx_dsp/ssim.c b/libs/libvpx/vpx_dsp/ssim.c index 7a29bd29f9f..7c3c31bad86 100644 --- a/libs/libvpx/vpx_dsp/ssim.c +++ b/libs/libvpx/vpx_dsp/ssim.c @@ -73,7 +73,7 @@ static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, uint32_t sum_sq_r, uint32_t sum_sxr, int count, uint32_t bd) { - int64_t ssim_n, ssim_d; + double ssim_n, ssim_d; int64_t c1, c2; if (bd == 8) { // scale the constants by number of pixels @@ -90,14 +90,14 @@ static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, assert(0); } - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + ssim_n = (2.0 * sum_s * sum_r + c1) * + (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2); - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) * + ((double)count * sum_sq_s - (double)sum_s * sum_s + + (double)count * sum_sq_r - (double)sum_r * sum_r + c2); - return ssim_n * 1.0 / ssim_d; + return ssim_n / ssim_d; } static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { @@ -284,7 +284,7 @@ double vpx_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, for (i = 0; i < height; i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { for (j = 0; j < width; j += 4, ++c) { - Ssimv sv = { 0 }; + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; double ssim; double ssim2; double dssim; diff --git a/libs/libvpx/vpx_dsp/ssim.h b/libs/libvpx/vpx_dsp/ssim.h index 4f2bb1d556c..c382237fc6d 100644 --- a/libs/libvpx/vpx_dsp/ssim.h +++ b/libs/libvpx/vpx_dsp/ssim.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_SSIM_H_ -#define VPX_DSP_SSIM_H_ +#ifndef VPX_VPX_DSP_SSIM_H_ +#define VPX_VPX_DSP_SSIM_H_ #define MAX_SSIM_DB 100.0; @@ -84,4 +84,4 @@ double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, } // extern "C" #endif -#endif // VPX_DSP_SSIM_H_ +#endif // VPX_VPX_DSP_SSIM_H_ diff --git a/libs/libvpx/vpx_dsp/subtract.c b/libs/libvpx/vpx_dsp/subtract.c index 95e7071b27e..45c819e67a7 100644 --- a/libs/libvpx/vpx_dsp/subtract.c +++ b/libs/libvpx/vpx_dsp/subtract.c @@ -16,37 +16,37 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -void vpx_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src, - ptrdiff_t src_stride, const uint8_t *pred, +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src_ptr, + ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c]; + for (c = 0; c < cols; c++) diff_ptr[c] = src_ptr[c] - pred_ptr[c]; - diff += diff_stride; - pred += pred_stride; - src += src_stride; + diff_ptr += diff_stride; + pred_ptr += pred_stride; + src_ptr += src_stride; } } #if CONFIG_VP9_HIGHBITDEPTH -void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff, - ptrdiff_t diff_stride, const uint8_t *src8, - ptrdiff_t src_stride, const uint8_t *pred8, +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, + ptrdiff_t diff_stride, const uint8_t *src8_ptr, + ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd) { int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); + uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr); (void)bd; for (r = 0; r < rows; r++) { for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; + diff_ptr[c] = src[c] - pred[c]; } - diff += diff_stride; + diff_ptr += diff_stride; pred += pred_stride; src += src_stride; } diff --git a/libs/libvpx/vpx_dsp/sum_squares.c b/libs/libvpx/vpx_dsp/sum_squares.c index 7c535ac2db6..b80cd588e42 100644 --- a/libs/libvpx/vpx_dsp/sum_squares.c +++ b/libs/libvpx/vpx_dsp/sum_squares.c @@ -10,8 +10,7 @@ #include "./vpx_dsp_rtcd.h" -uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) { int r, c; uint64_t ss = 0; @@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, const int16_t v = src[c]; ss += v * v; } - src += src_stride; + src += stride; } return ss; diff --git a/libs/libvpx/vpx_dsp/txfm_common.h b/libs/libvpx/vpx_dsp/txfm_common.h index d01d7085a27..25f4fdb3276 100644 --- a/libs/libvpx/vpx_dsp/txfm_common.h +++ b/libs/libvpx/vpx_dsp/txfm_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
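The two kernels touched above compose naturally: vpx_subtract_block_c forms the residual diff = src - pred, and vpx_sum_squares_2d_i16_c measures its energy. A self-contained sketch combining both loop shapes (residual_energy_4x4 is an illustrative name, not a libvpx symbol):

#include <stddef.h>
#include <stdint.h>

/* Residual then energy for a 4x4 block: first loop mirrors
 * vpx_subtract_block_c, second mirrors vpx_sum_squares_2d_i16_c. */
static uint64_t residual_energy_4x4(const uint8_t *src_ptr, ptrdiff_t src_stride,
                                    const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  int16_t diff[4 * 4];
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++) {
      diff[r * 4 + c] = (int16_t)(src_ptr[c] - pred_ptr[c]);
    }
    src_ptr += src_stride;
    pred_ptr += pred_stride;
  }
  for (r = 0; r < 4 * 4; r++) {
    const int v = diff[r];
    ss += (uint64_t)(v * v);
  }
  return ss;
}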
*/ -#ifndef VPX_DSP_TXFM_COMMON_H_ -#define VPX_DSP_TXFM_COMMON_H_ +#ifndef VPX_VPX_DSP_TXFM_COMMON_H_ +#define VPX_VPX_DSP_TXFM_COMMON_H_ #include "vpx_dsp/vpx_dsp_common.h" @@ -63,4 +63,4 @@ static const tran_coef_t sinpi_2_9 = 9929; static const tran_coef_t sinpi_3_9 = 13377; static const tran_coef_t sinpi_4_9 = 15212; -#endif // VPX_DSP_TXFM_COMMON_H_ +#endif // VPX_VPX_DSP_TXFM_COMMON_H_ diff --git a/libs/libvpx/vpx_dsp/variance.c b/libs/libvpx/vpx_dsp/variance.c index 93bd8f30de0..30b55dcb407 100644 --- a/libs/libvpx/vpx_dsp/variance.c +++ b/libs/libvpx/vpx_dsp/variance.c @@ -21,36 +21,37 @@ static const uint8_t bilinear_filters[8][2] = { { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; -uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride) { +uint32_t vpx_get4x4sse_cs_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int distortion = 0; int r, c; for (r = 0; r < 4; ++r) { for (c = 0; c < 4; ++c) { - int diff = a[c] - b[c]; + int diff = src_ptr[c] - ref_ptr[c]; distortion += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } return distortion; } -uint32_t vpx_get_mb_ss_c(const int16_t *a) { +uint32_t vpx_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; for (i = 0; i < 256; ++i) { - sum += a[i] * a[i]; + sum += src_ptr[i] * src_ptr[i]; } return sum; } -static void variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, uint32_t *sse, int *sum) { +static void variance(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, int w, int h, + uint32_t *sse, int *sum) { int i, j; *sum = 0; @@ -58,13 +59,13 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } @@ -76,24 +77,23 @@ static void variance(const uint8_t *a, int a_stride, const uint8_t *b, // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr, uint16_t *ref_ptr, unsigned int src_pixels_per_line, + int pixel_step, unsigned int output_height, unsigned int output_width, + const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); - ++a; + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } @@ -106,91 +106,90 @@ static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). 
It defines the offset required to move from one input // to the next. Output is 8-bit. -static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, uint8_t *ref_ptr, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; ++j) { - b[j] = ROUND_POWER_OF_TWO( - (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); - ++a; + ref_ptr[j] = ROUND_POWER_OF_TWO( + (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1], + FILTER_BITS); + ++src_ptr; } - a += src_pixels_per_line - output_width; - b += output_width; + src_ptr += src_pixels_per_line - output_width; + ref_ptr += output_width; } } -#define VAR(W, H) \ - uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ +#define VAR(W, H) \ + uint32_t vpx_variance##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_variance##W##x##H##_c(temp2, W, ref_ptr, ref_stride, sse); \ } -#define SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ - \ - return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int 
y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ + W, bilinear_filters[x_offset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_c(temp3, W, ref_ptr, ref_stride, sse); \ } /* Identical to the variance call except it takes an additional parameter, sum, * and returns that value using pass-by-reference instead of returning * sse - sum^2 / w*h */ -#define GET_VAR(W, H) \ - void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, uint32_t *sse, \ - int *sum) { \ - variance(a, a_stride, b, b_stride, W, H, sse, sum); \ +#define GET_VAR(W, H) \ + void vpx_get##W##x##H##var_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse, int *sum) { \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum); \ } /* Identical to the variance call except it does not calculate the * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in * variable. */ -#define MSE(W, H) \ - uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse; \ +#define MSE(W, H) \ + uint32_t vpx_mse##W##x##H##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ + uint32_t *sse) { \ + int sum; \ + variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum); \ + return *sse; \ } /* All three forms of the variance are available in the same sizes. 
*/ @@ -237,128 +236,140 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, } #if CONFIG_VP9_HIGHBITDEPTH -static void highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint64_t *sse, int64_t *sum) { +static void highbd_variance64(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint64_t *sse, int64_t *sum) { int i, j; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); + uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); *sum = 0; *sse = 0; for (i = 0; i < h; ++i) { for (j = 0; j < w; ++j) { - const int diff = a[j] - b[j]; + const int diff = src_ptr[j] - ref_ptr[j]; *sum += diff; *sse += diff * diff; } - a += a_stride; - b += b_stride; + src_ptr += src_stride; + ref_ptr += ref_stride; } } -static void highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)sse_long; *sum = (int)sum_long; } -static void highbd_10_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); } -static void highbd_12_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, int h, - uint32_t *sse, int *sum) { +static void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, + const uint8_t *ref8_ptr, int ref_stride, int w, + int h, uint32_t *sse, int *sum) { uint64_t sse_long = 0; int64_t sum_long = 0; - highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long); + highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, + &sum_long); *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? 
(uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \ - const uint8_t *b, int b_stride, \ - uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define HIGHBD_VAR(W, H) \ + uint32_t vpx_highbd_8_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ + } \ + \ + uint32_t vpx_highbd_10_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } \ + \ + uint32_t vpx_highbd_12_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse, int *sum) { \ - highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \ +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ + sum); \ } -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ - } \ - \ - uint32_t 
vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \ - const uint8_t *ref, int ref_stride, \ - uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \ - return *sse; \ +#define HIGHBD_MSE(W, H) \ + uint32_t vpx_highbd_8_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ + &sum); \ + return *sse; \ } static void highbd_var_filter_block2d_bil_first_pass( @@ -403,111 +414,111 @@ static void highbd_var_filter_block2d_bil_second_pass( } } -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return 
vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \ + ref_ptr, ref_stride, sse); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src, fdata3, src_stride, 1, H + 1, W, bilinear_filters[xoffset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[yoffset]); \ - \ - vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ - \ - return 
vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ - dst, dst_stride, sse); \ +#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ + } \ + \ + uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint16_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ + \ + highbd_var_filter_block2d_bil_first_pass( \ + src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[y_offset]); \ + \ + vpx_highbd_comp_avg_pred_c(temp3, CONVERT_TO_SHORTPTR(second_pred), W, H, \ + temp2, W); \ + \ + return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ + ref_ptr, ref_stride, sse); \ } /* All three forms of the variance are available in the same sizes. */ @@ -538,12 +549,10 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, +void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, int ref_stride) { int i, j; - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { const int tmp = pred[j] + ref[j]; diff --git a/libs/libvpx/vpx_dsp/variance.h b/libs/libvpx/vpx_dsp/variance.h index 100573299b9..6d0e1b8a6b3 100644 --- a/libs/libvpx/vpx_dsp/variance.h +++ b/libs/libvpx/vpx_dsp/variance.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
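For reference, the compound prediction consumed by the _avg_ variance kernels above is a per-sample rounded mean; the vpx_highbd_comp_avg_pred signature change here just makes it take uint16_t pointers directly instead of converting internally. A sketch, assuming ROUND_POWER_OF_TWO(x, 1) == (x + 1) >> 1 as in vpx_dsp_common.h (comp_avg_sketch is an illustrative name):

#include <stdint.h>

/* comp_pred is the rounded per-sample mean of pred and ref; comp_pred
 * and pred are packed at width, ref walks its own stride. */
static void comp_avg_sketch(uint16_t *comp_pred, const uint16_t *pred,
                            int width, int height, const uint16_t *ref,
                            int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}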
*/ -#ifndef VPX_DSP_VARIANCE_H_ -#define VPX_DSP_VARIANCE_H_ +#ifndef VPX_VPX_DSP_VARIANCE_H_ +#define VPX_VPX_DSP_VARIANCE_H_ #include "./vpx_config.h" @@ -22,37 +22,38 @@ extern "C" { #define FILTER_BITS 7 #define FILTER_WEIGHT 128 -typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b_ptr, int b_stride); +typedef unsigned int (*vpx_sad_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); -typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride, - const uint8_t *b_ptr, int b_stride, +typedef unsigned int (*vpx_sad_avg_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b, - int b_stride, int n); +typedef void (*vp8_copy32xn_fn_t)(const uint8_t *src_ptr, int src_stride, + uint8_t *ref_ptr, int ref_stride, int n); -typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, +typedef void (*vpx_sad_multi_fn_t)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); -typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride, +typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *src_ptr, int src_stride, const uint8_t *const b_array[], - int b_stride, unsigned int *sad_array); + int ref_stride, unsigned int *sad_array); -typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, unsigned int *sse); -typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); +typedef unsigned int (*vpx_subpixvariance_fn_t)( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); typedef unsigned int (*vpx_subp_avg_variance_fn_t)( - const uint8_t *a_ptr, int a_stride, int xoffset, int yoffset, - const uint8_t *b_ptr, int b_stride, unsigned int *sse, + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred); + #if CONFIG_VP8 typedef struct variance_vtable { vpx_sad_fn_t sdf; @@ -82,4 +83,4 @@ typedef struct vp9_variance_vtable { } // extern "C" #endif -#endif // VPX_DSP_VARIANCE_H_ +#endif // VPX_VPX_DSP_VARIANCE_H_ diff --git a/libs/libvpx/vpx_dsp/vpx_convolve.h b/libs/libvpx/vpx_dsp/vpx_convolve.h index 7979268a954..d5793e17ad5 100644 --- a/libs/libvpx/vpx_dsp/vpx_convolve.h +++ b/libs/libvpx/vpx_dsp/vpx_convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
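All of the function-pointer types renamed above return variants of one identity: accumulate sum and sse of diff = src - ref, then variance = sse - sum^2/(w*h), exactly as in the VAR() macros earlier in this diff. Reduced to a one-liner (illustrative name):

#include <stdint.h>

/* Variance from the two accumulated moments; matches the VAR() macro body. */
static uint32_t variance_from_moments(uint32_t sse, int sum, int w, int h) {
  return sse - (uint32_t)(((int64_t)sum * sum) / (w * h));
}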
*/ -#ifndef VPX_DSP_VPX_CONVOLVE_H_ -#define VPX_DSP_VPX_CONVOLVE_H_ +#ifndef VPX_VPX_DSP_VPX_CONVOLVE_H_ +#define VPX_VPX_DSP_VPX_CONVOLVE_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -35,4 +35,4 @@ typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, } // extern "C" #endif -#endif // VPX_DSP_VPX_CONVOLVE_H_ +#endif // VPX_VPX_DSP_VPX_CONVOLVE_H_ diff --git a/libs/libvpx/vpx_dsp/vpx_dsp.mk b/libs/libvpx/vpx_dsp/vpx_dsp.mk index 3b1a873cd20..f013fa59220 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libs/libvpx/vpx_dsp/vpx_dsp.mk @@ -47,13 +47,11 @@ endif # intra predictions DSP_SRCS-yes += intrapred.c -DSP_SRCS-$(HAVE_SSE) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c @@ -69,6 +67,8 @@ DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/post_proc_sse2.c +DSP_SRCS-$(HAVE_VSX) += ppc/deblock_vsx.c endif # CONFIG_POSTPROC DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM) @@ -81,16 +81,19 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/intrapred16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c +DSP_SRCS-yes += vpx_filter.h +ifeq ($(CONFIG_VP9),yes) # interpolation filters DSP_SRCS-yes += vpx_convolve.c DSP_SRCS-yes += vpx_convolve.h -DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c + +DSP_SRCS-$(HAVE_SSE2) += x86/convolve_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_4t_intrin_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_bilinear_ssse3.asm @@ -111,9 +114,17 @@ DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) -DSP_SRCS-yes += arm/vpx_convolve8_avg_neon_asm$(ASM) -DSP_SRCS-yes += arm/vpx_convolve8_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_vert_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type2_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_horiz_filter_type1_neon$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_avg_vert_filter_type1_neon$(ASM) DSP_SRCS-yes += arm/vpx_convolve_avg_neon_asm$(ASM) +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.c +DSP_SRCS-yes += arm/vpx_convolve8_neon_asm.h DSP_SRCS-yes += arm/vpx_convolve_neon.c else ifeq ($(HAVE_NEON),yes) @@ -134,6 +145,7 @@ DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve8_vert_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_avg_msa.c DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_copy_msa.c 
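A note on the vpx_dsp.mk idiom above, following the usual obj-$(CONFIG)-style make pattern: each $(HAVE_*) or $(CONFIG_*) flag expands to yes or no, so a line such as DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm appends to DSP_SRCS-yes when SSE2 is enabled and to the discarded DSP_SRCS-no list otherwise; only the -yes list is compiled. That is why deleting the duplicate DSP_SRCS-$(HAVE_SSE) lines is safe (the HAVE_SSE2 lines already add the same files), and why wrapping the convolve and loop-filter sources in ifeq ($(CONFIG_VP9),yes) drops them entirely from builds without VP9.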
DSP_SRCS-$(HAVE_MSA) += mips/vpx_convolve_msa.h +DSP_SRCS-$(HAVE_MMI) += mips/vpx_convolve8_mmi.c # common (dspr2) DSP_SRCS-$(HAVE_DSPR2) += mips/convolve_common_dspr2.h @@ -153,8 +165,8 @@ DSP_SRCS-$(HAVE_VSX) += ppc/vpx_convolve_vsx.c # loop filters DSP_SRCS-yes += loopfilter.c -DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c -DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c +DSP_SRCS-$(HAVE_SSE2) += x86/loopfilter_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) @@ -180,6 +192,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH +endif # CONFIG_VP9 DSP_SRCS-yes += txfm_common.h DSP_SRCS-$(HAVE_SSE2) += x86/txfm_common_sse2.h @@ -204,7 +217,12 @@ DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c + +ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c +endif # !CONFIG_VP9_HIGHBITDEPTH + +DSP_SRCS-$(HAVE_VSX) += ppc/fdct32x32_vsx.c endif # CONFIG_VP9_ENCODER # inverse transform @@ -242,6 +260,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_135_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_1024_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct_neon.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c @@ -279,11 +298,13 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h -DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c +DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif @@ -310,6 +331,7 @@ ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c @@ -330,13 +352,12 @@ DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c -DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm -DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c +DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm @@ -358,7 +379,6 @@ DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c -DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c @@ -368,7 +388,6 @@ ifeq ($(ARCH_X86_64),yes) DSP_SRCS-$(HAVE_SSE2) += x86/ssim_opt_x86_64.asm 
endif # ARCH_X86_64 -DSP_SRCS-$(HAVE_SSE) += x86/subpel_variance_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subpel_variance_sse2.asm # Contains SSE2 and SSSE3 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) @@ -386,6 +405,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h # PPC VSX utilities DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h +DSP_SRCS-$(HAVE_VSX) += ppc/txfm_common_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_common.h b/libs/libvpx/vpx_dsp/vpx_dsp_common.h index c8c852374f5..2de4495465e 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/libs/libvpx/vpx_dsp/vpx_dsp_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_VPX_DSP_COMMON_H_ -#define VPX_DSP_VPX_DSP_COMMON_H_ +#ifndef VPX_VPX_DSP_VPX_DSP_COMMON_H_ +#define VPX_VPX_DSP_VPX_DSP_COMMON_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,8 +25,8 @@ extern "C" { #define VPX_SWAP(type, a, b) \ do { \ type c = (b); \ - b = a; \ - a = c; \ + (b) = a; \ + (a) = c; \ } while (0) #if CONFIG_VP9_HIGHBITDEPTH @@ -57,6 +57,10 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } +static INLINE int64_t lclamp(int64_t value, int64_t low, int64_t high) { + return value < low ? low : (value > high ? high : value); +} + static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -70,4 +74,4 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { } // extern "C" #endif -#endif // VPX_DSP_VPX_DSP_COMMON_H_ +#endif // VPX_VPX_DSP_VPX_DSP_COMMON_H_ diff --git a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1a743d910e3..797ef7fe0d5 100644 --- a/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libs/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -37,325 +37,333 @@ () # Intra prediction # -add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_4x4 sse2/; -add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; -add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_4x4 ssse3/; -add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_h_predictor_4x4/, "uint8_t *dst, 
ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/; -add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; -add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_4x4 ssse3/; -add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; -add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_ve_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. 
+specialize qw/vpx_tm_predictor_4x4 neon dspr2 msa sse2/; -add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_predictor_4x4 dspr2 msa neon sse2/; -add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; -add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_8x8 ssse3/; -add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_8x8 neon sse2 vsx/; +add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_d45_predictor_8x8 neon sse2/; -add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_8x8 ssse3 vsx/; +add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_d63_predictor_8x8 ssse3/; -add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. 
+specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; -add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; -add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_8x8 ssse3/; -add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2 vsx/; +add_proto qw/void vpx_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. +specialize qw/vpx_tm_predictor_8x8 neon dspr2 msa sse2/; -add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 vsx/; +add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +# TODO(crbug.com/webm/1522): Re-enable vsx implementation. 
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/; -add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_left_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; -add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d207_predictor_16x16 ssse3/; -add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; -add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; -add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_16x16 neon/; -add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_16x16 ssse3/; -add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; -add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +add_proto qw/void vpx_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; 
 specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/;
-add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d207_predictor_32x32 ssse3/;
-add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/;
-add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/;
-add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/;
-add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d135_predictor_32x32 neon/;
-add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_d153_predictor_32x32 ssse3/;
-add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/;
-add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_tm_predictor_32x32 neon msa sse2 vsx/;
-add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_predictor_32x32 msa neon sse2 vsx/;
-add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_32x32 msa neon sse2 vsx/;
-add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_left_predictor_32x32 msa neon sse2 vsx/;
-add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+add_proto qw/void vpx_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/;
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_4x4 sse2/;
-  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/;
-  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_4x4 sse2/;
-  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_4x4 sse2/;
-  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_4x4 sse2/;
-  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/;
-  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/;
-  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/;
-  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/;
-  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/;
-  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/;
-  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/;
-  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/;
-  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/;
-  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/;
-  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/;
-  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/;
-  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/;
-  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/;
-  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/;
-  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/;
-  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/;
-  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/;
-  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/;
-  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/;
-  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/;
-  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/;
-  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/;
-  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/;
-  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd";
   specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/;
 } # CONFIG_VP9_HIGHBITDEPTH
+if (vpx_config("CONFIG_VP9") eq "yes") {
 #
 # Sub Pixel Filters
 #
@@ -363,25 +371,25 @@ ()
 specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;
 add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/;
 add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
 add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
 add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
 add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
 add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
 add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_scaled_2d ssse3 neon msa/;
@@ -395,36 +403,38 @@ ()
 add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+} #CONFIG_VP9
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
   # Sub Pixel Filters
   #
-  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/;
-  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/;
-  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64";
-  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64";
-  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64";
-  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64";
-  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64";
-  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps";
+  add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd";
   specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64";
 } # CONFIG_VP9_HIGHBITDEPTH
+if (vpx_config("CONFIG_VP9") eq "yes") {
 #
 # Loopfilter
 #
@@ -463,6 +473,7 @@ ()
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
 specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+} #CONFIG_VP9
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
@@ -583,7 +594,7 @@ ()
   specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/;
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa/;
+  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx/;
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
@@ -626,6 +637,7 @@ ()
   specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
   specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
   specialize qw/vpx_idct32x32_1_add neon sse2/;
+  specialize qw/vpx_iwht4x4_16_add sse2 vsx/;
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
     # Note that these specializations are appended to the above ones.
@@ -646,7 +658,7 @@ ()
     $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
     specialize qw/vpx_idct32x32_34_add dspr2 msa/;
     specialize qw/vpx_idct32x32_1_add dspr2 msa/;
-    specialize qw/vpx_iwht4x4_16_add msa sse2/;
+    specialize qw/vpx_iwht4x4_16_add msa/;
     specialize qw/vpx_iwht4x4_1_add msa/;
   } # !CONFIG_VP9_HIGHBITDEPTH
 } # !CONFIG_EMULATE_HARDWARE
@@ -654,7 +666,6 @@ ()
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
   # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
-  specialize qw/vpx_iwht4x4_16_add sse2/;
   add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
   add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
@@ -699,10 +710,10 @@ ()
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
   add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b neon sse2 ssse3 avx/;
+  specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/;
   add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/;
+  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/;
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
@@ -718,7 +729,7 @@ ()
 #
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
 #
 # Single block SAD
 #
@@ -748,13 +759,13 @@ ()
 specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/;
 add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x16 neon msa sse2 mmi/;
+specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/;
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2 mmi/;
+specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/;
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x4 neon msa sse2 mmi/;
+specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;
 add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad4x8 neon msa sse2 mmi/;
@@ -782,8 +793,23 @@ ()
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
+    add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+
+    add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_8x8 avx2/;
+
+    add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_16x16 avx2/;
+
+    add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+    specialize qw/vpx_highbd_hadamard_32x32 avx2/;
+
     add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon/;
+
+    add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length";
+    specialize qw/vpx_highbd_satd avx2/;
   } else {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
     specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
@@ -791,6 +817,9 @@ ()
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
     specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
+    add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+    specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+
     add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
     specialize qw/vpx_satd avx2 sse2 neon msa/;
   }
@@ -882,47 +911,47 @@ ()
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 sse2 msa/;
+specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
 #
 # Structured Similarity (SSIM)
@@ -939,7 +968,7 @@ ()
   #
   # Block subtraction
   #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
   #
   # Single block SAD
@@ -984,9 +1013,13 @@ ()
   #
   # Avg
   #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p";
+  specialize qw/vpx_highbd_avg_8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p";
+  specialize qw/vpx_highbd_avg_4x4 sse2/;
+
+  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
   add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
   specialize qw/vpx_highbd_sad64x64_avg sse2/;
@@ -1028,43 +1061,43 @@ ()
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad64x64x4d sse2/;
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad64x32x4d sse2/;
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x64x4d sse2/;
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x32x4d sse2/;
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad32x16x4d sse2/;
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x32x4d sse2/;
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x16x4d sse2/;
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad16x8x4d sse2/;
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x16x4d sse2/;
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x8x4d sse2/;
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad8x4x4d sse2/;
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x8x4d sse2/;
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
   specialize qw/vpx_highbd_sad4x4x4d sse2/;
   #
@@ -1081,70 +1114,70 @@ ()
 #
 # Variance
 #
-add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x64 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x32 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x8 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x8 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance4x4 sse2 neon msa mmi/;
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/;
 #
 # Specialty Variance
 #
-add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/;
-add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var sse2 neon msa/;
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get8x8var sse2 neon msa vsx/;
-add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/;
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/;
-add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa mmi/;
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
-add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa mmi/;
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x16 sse2 msa mmi vsx/;
-add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa mmi/;
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x8 sse2 msa mmi vsx/;
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
 specialize qw/vpx_get_mb_ss sse2 msa vsx/;
-add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride";
 specialize qw/vpx_get4x4sse_cs neon msa vsx/;
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
@@ -1153,440 +1186,449 @@ ()
 #
 # Subpixel Variance
 #
-add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/;
-add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/;
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance64x64 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance64x32 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance32x64 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance32x32 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance32x16 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance16x32 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance16x16 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance16x8 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance8x16 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance8x8 sse2/;
-  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_10_variance64x64 sse2/;
-  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_10_variance64x32 sse2/;
-  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_10_variance32x64 sse2/;
-  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance64x64 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance64x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x64 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance32x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x32 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance16x8 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance8x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_variance8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned 
int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + + add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_8_get16x16var sse2/; + + add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_8_get8x8var sse2/; + + add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_10_get16x16var sse2/; - add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_10_get8x8var sse2/; - add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_12_get16x16var sse2/; - add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; + specialize qw/vpx_highbd_12_get8x8var sse2/; - add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_8_mse8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_10_mse8x8 sse2/; - add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_mse16x16 sse2/; - add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; + add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_highbd_12_mse8x8 sse2/; - add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride"; + add_proto 
qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride"; # # Subpixel Variance # - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, 
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize 
qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize 
qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t 
vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; + add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; } # CONFIG_VP9_HIGHBITDEPTH @@ -1598,13 +1640,13 @@ () specialize qw/vpx_plane_add_noise sse2 msa/; add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_down sse2 neon msa/; + specialize qw/vpx_mbpost_proc_down sse2 neon msa vsx/; - add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/; + add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *src, int pitch, int rows, int cols,int flimit"; + specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/; add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; - specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/; + specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/; }
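In vpx_dsp_rtcd_defs.pl, each add_proto line declares the C prototype of one DSP routine and the paired specialize line names the SIMD flavors implemented for it; libvpx's build turns these entries into a run-time CPU detection (RTCD) dispatch header. The renames above (source_stride to src_stride, xoffset/yoffset to x_offset/y_offset, recon_stride to ref_stride) therefore only bring the generated prototypes in line with the argument names used in the implementations; behavior is unchanged. As a rough sketch of what the generator emits for one entry, using vpx_sub_pixel_variance8x8 from this hunk (the flag values and the setup function name below are illustrative, not the actual generated code):

#include <stdint.h>

/* Prototypes as implied by add_proto/specialize for
 * vpx_sub_pixel_variance8x8 (neon msa mmi sse2 ssse3);
 * only the x86 flavors are shown here. */
uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride,
                                     int x_offset, int y_offset,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride,
                                        int x_offset, int y_offset,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        uint32_t *sse);
uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride,
                                         int x_offset, int y_offset,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         uint32_t *sse);

/* On x86 the generated header routes calls through a function pointer
 * that is bound once, after CPU feature detection. */
uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride,
                                      int x_offset, int y_offset,
                                      const uint8_t *ref_ptr, int ref_stride,
                                      uint32_t *sse) = vpx_sub_pixel_variance8x8_c;

enum { HAS_SSE2 = 1 << 2, HAS_SSSE3 = 1 << 4 }; /* illustrative flag values */

static void setup_rtcd_sketch(int flags) { /* hypothetical name */
  if (flags & HAS_SSE2) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_sse2;
  if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3;
}

Later assignments win, so the strongest flavor supported by the detected flags ends up bound; builds targeting a single fixed flavor typically get a plain #define instead of a function pointer.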
*/ -#ifndef VPX_DSP_VPX_FILTER_H_ -#define VPX_DSP_VPX_FILTER_H_ +#ifndef VPX_VPX_DSP_VPX_FILTER_H_ +#define VPX_VPX_DSP_VPX_FILTER_H_ +#include <assert.h> #include "vpx/vpx_integer.h" #ifdef __cplusplus @@ -26,8 +27,16 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; +static INLINE int vpx_get_filter_taps(const int16_t *const filter) { + assert(filter[3] != 128); + if (!filter[0] && !filter[1] && !filter[2]) + return 2; + else + return 8; +} + #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_DSP_VPX_FILTER_H_ +#endif // VPX_VPX_DSP_VPX_FILTER_H_ diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c index ff19ea6470d..3f4f577a21d 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -15,6 +15,209 @@ #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" #include "vpx_ports/mem.h" +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_hadamard_col8_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi32(a0, a1); + __m256i b1 = _mm256_sub_epi32(a0, a1); + __m256i b2 = _mm256_add_epi32(a2, a3); + __m256i b3 = _mm256_sub_epi32(a2, a3); + __m256i b4 = _mm256_add_epi32(a4, a5); + __m256i b5 = _mm256_sub_epi32(a4, a5); + __m256i b6 = _mm256_add_epi32(a6, a7); + __m256i b7 = _mm256_sub_epi32(a6, a7); + + a0 = _mm256_add_epi32(b0, b2); + a1 = _mm256_add_epi32(b1, b3); + a2 = _mm256_sub_epi32(b0, b2); + a3 = _mm256_sub_epi32(b1, b3); + a4 = _mm256_add_epi32(b4, b6); + a5 = _mm256_add_epi32(b5, b7); + a6 = _mm256_sub_epi32(b4, b6); + a7 = _mm256_sub_epi32(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi32(a0, a4); + b7 = _mm256_add_epi32(a1, a5); + b3 = _mm256_add_epi32(a2, a6); + b4 = _mm256_add_epi32(a3, a7); + b2 = _mm256_sub_epi32(a0, a4); + b6 = _mm256_sub_epi32(a1, a5); + b1 = _mm256_sub_epi32(a2, a6); + b5 = _mm256_sub_epi32(a3, a7); + + a0 = _mm256_unpacklo_epi32(b0, b1); + a1 = _mm256_unpacklo_epi32(b2, b3); + a2 = _mm256_unpackhi_epi32(b0, b1); + a3 = _mm256_unpackhi_epi32(b2, b3); + a4 = _mm256_unpacklo_epi32(b4, b5); + a5 = _mm256_unpacklo_epi32(b6, b7); + a6 = _mm256_unpackhi_epi32(b4, b5); + a7 = _mm256_unpackhi_epi32(b6, b7); + + b0 = _mm256_unpacklo_epi64(a0, a1); + b1 = _mm256_unpacklo_epi64(a4, a5); + b2 = _mm256_unpackhi_epi64(a0, a1); + b3 = _mm256_unpackhi_epi64(a4, a5); + b4 = _mm256_unpacklo_epi64(a2, a3); + b5 = _mm256_unpacklo_epi64(a6, a7); + b6 = _mm256_unpackhi_epi64(a2, a3); + b7 = _mm256_unpackhi_epi64(a6, a7); + + in[0] = _mm256_permute2x128_si256(b0, b1, 0x20); + in[1] = _mm256_permute2x128_si256(b0, b1, 0x31); + in[2] = _mm256_permute2x128_si256(b2, b3, 0x20); + in[3] = _mm256_permute2x128_si256(b2, b3, 0x31); + in[4] = _mm256_permute2x128_si256(b4, b5, 0x20); + in[5] = _mm256_permute2x128_si256(b4, b5, 0x31); + in[6] = _mm256_permute2x128_si256(b6, b7, 0x20); + in[7] = _mm256_permute2x128_si256(b6, b7, 0x31); + } else { + in[0] = _mm256_add_epi32(a0, a4); + in[7] = _mm256_add_epi32(a1, a5); + in[3] = _mm256_add_epi32(a2, a6); + in[4] = _mm256_add_epi32(a3, a7); + in[2] = _mm256_sub_epi32(a0, a4); + in[6] = _mm256_sub_epi32(a1, a5); + in[1] = _mm256_sub_epi32(a2, a6); + in[5] = _mm256_sub_epi32(a3, a7); + } +} + +void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + __m128i src16[8]; + __m256i src32[8]; + + src16[0] = _mm_loadu_si128((const
__m128i *)src_diff); + src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride)); + + src32[0] = _mm256_cvtepi16_epi32(src16[0]); + src32[1] = _mm256_cvtepi16_epi32(src16[1]); + src32[2] = _mm256_cvtepi16_epi32(src16[2]); + src32[3] = _mm256_cvtepi16_epi32(src16[3]); + src32[4] = _mm256_cvtepi16_epi32(src16[4]); + src32[5] = _mm256_cvtepi16_epi32(src16[5]); + src32[6] = _mm256_cvtepi16_epi32(src16[6]); + src32[7] = _mm256_cvtepi16_epi32(src16[7]); + + highbd_hadamard_col8_avx2(src32, 0); + highbd_hadamard_col8_avx2(src32, 1); + + _mm256_storeu_si256((__m256i *)coeff, src32[0]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[1]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[2]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[3]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[4]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[5]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[6]); + coeff += 8; + _mm256_storeu_si256((__m256i *)coeff, src32[7]); +} + +void vpx_highbd_hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; + vpx_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 1); + b1 = _mm256_srai_epi32(b1, 1); + b2 = _mm256_srai_epi32(b2, 1); + b3 = _mm256_srai_epi32(b3, 1); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3); + + coeff += 8; + t_coeff += 8; + } +} + +void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int idx; + tran_low_t *t_coeff = coeff; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256); + } + + for (idx = 0; idx < 256; idx += 8) { + __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + __m256i coeff3 = 
_mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi32(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi32(coeff0, coeff1); + __m256i b2 = _mm256_add_epi32(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi32(coeff2, coeff3); + + b0 = _mm256_srai_epi32(b0, 2); + b1 = _mm256_srai_epi32(b1, 2); + b2 = _mm256_srai_epi32(b2, 2); + b3 = _mm256_srai_epi32(b3, 2); + + coeff0 = _mm256_add_epi32(b0, b2); + coeff1 = _mm256_add_epi32(b1, b3); + coeff2 = _mm256_sub_epi32(b0, b2); + coeff3 = _mm256_sub_epi32(b1, b3); + + _mm256_storeu_si256((__m256i *)coeff, coeff0); + _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1); + _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2); + _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3); + + coeff += 8; + t_coeff += 8; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -91,7 +294,7 @@ static void hadamard_col8x2_avx2(__m256i *in, int iter) { } } -static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, +static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { __m256i src[8]; src[0] = _mm256_loadu_si256((const __m256i *)src_diff); @@ -131,18 +334,19 @@ static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, _mm256_permute2x128_si256(src[6], src[7], 0x31)); } -void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, - tran_low_t *coeff) { - int idx; +static INLINE void hadamard_16x16_avx2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); int16_t *t_coeff = temp_coeff; #else int16_t *t_coeff = coeff; #endif - + int16_t *coeff16 = (int16_t *)coeff; + int idx; for (idx = 0; idx < 2; ++idx) { - int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + const int16_t *src_ptr = src_diff + idx * 8 * src_stride; hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); } @@ -161,11 +365,69 @@ void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, b1 = _mm256_srai_epi16(b1, 1); b2 = _mm256_srai_epi16(b2, 1); b3 = _mm256_srai_epi16(b3, 1); + if (is_final) { + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + coeff += 16; + } else { + _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3)); + _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2)); + _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3)); + coeff16 += 16; + } + t_coeff += 16; + } +} + +void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_avx2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
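// The combine loop that follows is the standard recursive Hadamard
// construction: each output coefficient mixes the four co-located 16x16
// sub-transform outputs, with an arithmetic shift by 2 so the 32x32 result
// stays within 16-bit dynamic range. A minimal scalar sketch of the same
// stage is below; the helper name is illustrative only, and it assumes the
// four 16x16 outputs sit back to back in `sub` (256 values each).
#include <stdint.h>
static void hadamard_32x32_combine_sketch(const int16_t *sub, int16_t *out) {
  int i;
  for (i = 0; i < 256; ++i) {
    /* First butterfly stage over the four quadrants, normalized by >> 2. */
    const int16_t b0 = (int16_t)((sub[i] + sub[i + 256]) >> 2);
    const int16_t b1 = (int16_t)((sub[i] - sub[i + 256]) >> 2);
    const int16_t b2 = (int16_t)((sub[i + 512] + sub[i + 768]) >> 2);
    const int16_t b3 = (int16_t)((sub[i + 512] - sub[i + 768]) >> 2);
    /* Second butterfly stage produces the four co-located outputs. */
    out[i] = (int16_t)(b0 + b2);
    out[i + 256] = (int16_t)(b1 + b3);
    out[i + 512] = (int16_t)(b0 - b2);
    out[i + 768] = (int16_t)(b1 - b3);
  }
}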
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + for (idx = 0; idx < 4; ++idx) { + // src_diff: 9 bit, dynamic range [-255, 255] + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_avx2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 2); + b1 = _mm256_srai_epi16(b1, 2); + b2 = _mm256_srai_epi16(b2, 2); + b3 = _mm256_srai_epi16(b3, 2); store_tran_low(_mm256_add_epi16(b0, b2), coeff); - store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); - store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); - store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768); coeff += 16; t_coeff += 16; @@ -195,3 +457,26 @@ int vpx_satd_avx2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum_128); } } + +#if CONFIG_VP9_HIGHBITDEPTH +int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) { + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 8, coeff += 8) { + const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i abs = _mm256_abs_epi32(src_line); + accum = _mm256_add_epi32(accum, abs); + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index a235ba41df8..5aba903a2dd 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -138,6 +138,56 @@ unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) { return (avg + 8) >> 4; } +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + const __m128i zero = _mm_setzero_si128(); + s0 = _mm_loadu_si128((const __m128i *)(s)); + s1 = _mm_loadu_si128((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 4 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 5 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 6 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadu_si128((const __m128i *)(s + 7 * p)); + 
s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_unpackhi_epi16(s0, zero); + s0 = _mm_unpacklo_epi16(s0, zero); + s0 = _mm_add_epi32(s0, s1); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8)); + s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4)); + avg = _mm_cvtsi128_si32(s0); + + return (avg + 32) >> 6; +} + +unsigned int vpx_highbd_avg_4x4_sse2(const uint8_t *s8, int p) { + __m128i s0, s1; + unsigned int avg; + const uint16_t *s = CONVERT_TO_SHORTPTR(s8); + s0 = _mm_loadl_epi64((const __m128i *)(s)); + s1 = _mm_loadl_epi64((const __m128i *)(s + p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 2 * p)); + s0 = _mm_adds_epu16(s0, s1); + s1 = _mm_loadl_epi64((const __m128i *)(s + 3 * p)); + s0 = _mm_adds_epu16(s0, s1); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 4)); + s0 = _mm_add_epi16(s0, _mm_srli_si128(s0, 2)); + avg = _mm_extract_epi16(s0, 0); + + return (avg + 8) >> 4; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static void hadamard_col8_sse2(__m128i *in, int iter) { __m128i a0 = in[0]; __m128i a1 = in[1]; @@ -214,8 +264,9 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, - tran_low_t *coeff) { +static INLINE void hadamard_8x8_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); @@ -229,37 +280,74 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); - store_tran_low(src[0], coeff); - coeff += 8; - store_tran_low(src[1], coeff); - coeff += 8; - store_tran_low(src[2], coeff); - coeff += 8; - store_tran_low(src[3], coeff); - coeff += 8; - store_tran_low(src[4], coeff); - coeff += 8; - store_tran_low(src[5], coeff); - coeff += 8; - store_tran_low(src[6], coeff); - coeff += 8; - store_tran_low(src[7], coeff); + if (is_final) { + store_tran_low(src[0], coeff); + coeff += 8; + store_tran_low(src[1], coeff); + coeff += 8; + store_tran_low(src[2], coeff); + coeff += 8; + store_tran_low(src[3], coeff); + coeff += 8; + store_tran_low(src[4], coeff); + coeff += 8; + store_tran_low(src[5], coeff); + coeff += 8; + store_tran_low(src[6], coeff); + coeff += 8; + store_tran_low(src[7], coeff); + } else { + int16_t *coeff16 = (int16_t *)coeff; + _mm_store_si128((__m128i *)coeff16, src[0]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[1]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[2]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[3]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[4]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[5]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[6]); + coeff16 += 8; + _mm_store_si128((__m128i *)coeff16, src[7]); + } } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, - tran_low_t *coeff) { +void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_8x8_sse2(src_diff, src_stride, coeff, 1); +} + +static INLINE void hadamard_16x16_sse2(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff, + int is_final) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. 
Output to an intermediate buffer first, then store_tran_low() + // in the final stage. + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int16_t *coeff16 = (int16_t *)coeff; int idx; for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = + const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8; - vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64), + 0); } for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = load_tran_low(coeff); - __m128i coeff1 = load_tran_low(coeff + 64); - __m128i coeff2 = load_tran_low(coeff + 128); - __m128i coeff3 = load_tran_low(coeff + 192); + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192)); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); @@ -271,17 +359,82 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, b2 = _mm_srai_epi16(b2, 1); b3 = _mm_srai_epi16(b3, 1); + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + + if (is_final) { + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); + coeff += 8; + } else { + _mm_store_si128((__m128i *)coeff16, coeff0); + _mm_store_si128((__m128i *)(coeff16 + 64), coeff1); + _mm_store_si128((__m128i *)(coeff16 + 128), coeff2); + _mm_store_si128((__m128i *)(coeff16 + 192), coeff3); + coeff16 += 8; + } + + t_coeff += 8; + } +} + +void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + hadamard_16x16_sse2(src_diff, src_stride, coeff, 1); +} + +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { +#if CONFIG_VP9_HIGHBITDEPTH + // For high bitdepths, it is unnecessary to store_tran_low + // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the + // next stage. Output to an intermediate buffer first, then store_tran_low() + // in the final stage. 
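// What the is_final plumbing buys (a sketch, not part of the patch): when
// CONFIG_VP9_HIGHBITDEPTH is set, tran_low_t is a 32-bit type, so every
// store_tran_low()/load_tran_low() pair widens and re-narrows the data.
// Intermediate stages therefore write plain int16_t and only the final stage
// widens, as modeled below (the helper name and the int32_t stand-in for
// tran_low_t are assumptions for illustration).
#include <stdint.h>
static void store_stage_sketch(const int16_t *in, int32_t *out, int n,
                               int is_final) {
  int i;
  for (i = 0; i < n; ++i) {
    if (is_final) {
      out[i] = in[i]; /* widen once, in the final stage */
    } else {
      ((int16_t *)out)[i] = in[i]; /* stay 16-bit between stages */
    }
  }
}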
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + int idx; + for (idx = 0; idx < 4; ++idx) { + const int16_t *src_ptr = + src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; + hadamard_16x16_sse2(src_ptr, src_stride, + (tran_low_t *)(t_coeff + idx * 256), 0); + } + + for (idx = 0; idx < 256; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + b0 = _mm_srai_epi16(b0, 2); + b1 = _mm_srai_epi16(b1, 2); + b2 = _mm_srai_epi16(b2, 2); + b3 = _mm_srai_epi16(b3, 2); + coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); store_tran_low(coeff0, coeff); - store_tran_low(coeff1, coeff + 64); + store_tran_low(coeff1, coeff + 256); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - store_tran_low(coeff2, coeff + 128); - store_tran_low(coeff3, coeff + 192); + store_tran_low(coeff2, coeff + 512); + store_tran_low(coeff3, coeff + 768); coeff += 8; + t_coeff += 8; } } @@ -311,7 +464,7 @@ int vpx_satd_sse2(const tran_low_t *coeff, int length) { return _mm_cvtsi128_si32(accum); } -void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, +void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height) { int idx; __m128i zero = _mm_setzero_si128(); @@ -360,7 +513,7 @@ void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, _mm_storeu_si128((__m128i *)hbuf, s1); } -int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { +int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width) { __m128i zero = _mm_setzero_si128(); __m128i src_line = _mm_load_si128((const __m128i *)ref); __m128i s0 = _mm_sad_epu8(src_line, zero); @@ -380,7 +533,7 @@ int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) { return _mm_extract_epi16(s0, 0); } -int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) { +int vpx_vector_var_sse2(const int16_t *ref, const int16_t *src, const int bwl) { int idx; int width = 4 << bwl; int16_t mean; diff --git a/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c index f83b26490e7..e4e1e0e7a2c 100644 --- a/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/avg_pred_sse2.c @@ -13,11 +13,12 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" -void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, +void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { - /* comp and pred must be 16 byte aligned. */ - assert(((intptr_t)comp & 0xf) == 0); + /* comp_pred and pred must be 16 byte aligned. 
*/ + assert(((intptr_t)comp_pred & 0xf) == 0); assert(((intptr_t)pred & 0xf) == 0); if (width > 8) { int x, y; @@ -26,17 +27,17 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, const __m128i p = _mm_load_si128((const __m128i *)(pred + x)); const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x)); const __m128i avg = _mm_avg_epu8(p, r); - _mm_store_si128((__m128i *)(comp + x), avg); + _mm_store_si128((__m128i *)(comp_pred + x), avg); } - comp += width; + comp_pred += width; pred += width; ref += ref_stride; } } else { // width must be 4 or 8. int i; - // Process 16 elements at a time. comp and pred have width == stride and - // therefore live in contigious memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are all - // divisible by 16 so just ref needs to be massaged when loading. + // Process 16 elements at a time. comp_pred and pred have width == stride + // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are + // all divisible by 16 so just ref needs to be massaged when loading. for (i = 0; i < width * height; i += 16) { const __m128i p = _mm_load_si128((const __m128i *)pred); __m128i r; @@ -45,10 +46,9 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, r = _mm_loadu_si128((const __m128i *)ref); ref += 16; } else if (width == 4) { - r = _mm_set_epi32(*(const uint32_t *)(ref + 3 * ref_stride), - *(const uint32_t *)(ref + 2 * ref_stride), - *(const uint32_t *)(ref + ref_stride), - *(const uint32_t *)(ref)); + r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride), + loadu_uint32(ref + 2 * ref_stride), + loadu_uint32(ref + ref_stride), loadu_uint32(ref)); ref += 4 * ref_stride; } else { @@ -60,10 +60,10 @@ void vpx_comp_avg_pred_sse2(uint8_t *comp, const uint8_t *pred, int width, ref += 2 * ref_stride; } avg = _mm_avg_epu8(p, r); - _mm_store_si128((__m128i *)comp, avg); + _mm_store_si128((__m128i *)comp_pred, avg); pred += 16; - comp += 16; + comp_pred += 16; } } } diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h index 3552c07cd36..c02b47a3ebf 100644 --- a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ -#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ #include <immintrin.h> @@ -41,4 +41,4 @@ static INLINE void store_tran_low(__m256i a, tran_low_t *b) { _mm256_storeu_si256((__m256i *)b, a); #endif } -#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h index 5d1d7795723..74dde656b1d 100644 --- a/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ -#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#define VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ #include <emmintrin.h> @@ -53,4 +53,4 @@ static INLINE void store_zero_tran_low(tran_low_t *a) { _mm_store_si128((__m128i *)(a), zero); #endif } -#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ +#endif // VPX_VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve.h b/libs/libvpx/vpx_dsp/x86/convolve.h index 68d7589d45d..b75d4d7216d 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve.h +++ b/libs/libvpx/vpx_dsp/x86/convolve.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_CONVOLVE_H_ -#define VPX_DSP_X86_CONVOLVE_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_H_ #include <assert.h> @@ -16,56 +16,83 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" +// TODO(chiyotsai@google.com): Refactor the code here. Currently this is pretty +// hacky and awful to read. Note that there is a filter_x[3] == 128 check in +// HIGHBD_FUN_CONV_2D to avoid a segfault, since the C function assumes the +// filter is always 8-tap. typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ +// TODO(chiyotsai@google.com): Remove the is_avg argument to the MACROS once we +// have a 4-tap vertical avg filter. +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, is_avg) \ void vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ - const int16_t *filter = filter_kernel[offset]; \ + const int16_t *filter_row = filter[offset]; \ (void)x0_q4; \ (void)x_step_q4; \ (void)y0_q4; \ (void)y_step_q4; \ - assert(filter[3] != 128); \ + assert(filter_row[3] != 128); \ assert(step_q4 == 16); \ - if (filter[0] | filter[1] | filter[2]) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ while (w >= 16) { \ vpx_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ vpx_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ vpx_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \ - dst_stride, h, filter); \ + dst_stride, h, filter_row); \ } \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ?
8 : 4; \ + while (w >= 16) { \ + vpx_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + if (w == 8) { \ + vpx_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } else if (w == 4) { \ + vpx_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ + } \ + (void)num_taps; \ } else { \ + const int num_taps = 2; \ while (w >= 16) { \ - vpx_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d16_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ src += 16; \ dst += 16; \ w -= 16; \ } \ if (w == 8) { \ - vpx_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d8_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ } else if (w == 4) { \ - vpx_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \ - dst_stride, h, filter); \ + vpx_filter_block1d4_##dir##2_##avg##opt(src_start, src_stride, dst, \ + dst_stride, h, filter_row); \ } \ + (void)num_taps; \ } \ } -#define FUN_CONV_2D(avg, opt) \ +#define FUN_CONV_2D(avg, opt, is_avg) \ void vpx_convolve8_##avg##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ @@ -79,7 +106,7 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, assert(h <= 64); \ assert(x_step_q4 == 16); \ assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + if (filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) { \ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ @@ -87,6 +114,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ filter, x0_q4, x_step_q4, y0_q4, \ y_step_q4, w, h); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 
8 : 4; \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 64 * (num_taps / 2 - 1), 64, \ + dst, dst_stride, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h); \ } else { \ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ @@ -106,57 +142,86 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt, \ + is_avg) \ void vpx_highbd_convolve8_##name##_##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ - const int16_t *filter = filter_kernel[offset]; \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) { \ + const int16_t *filter_row = filter_kernel[offset]; \ + if (step_q4 == 16 && filter_row[3] != 128) { \ + if (filter_row[0] | filter_row[1] | filter_row[6] | filter_row[7]) { \ + const int num_taps = 8; \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ + (void)num_taps; \ + } else if (filter_row[2] | filter_row[5]) { \ + const int num_taps = is_avg ? 
8 : 4; \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ + (void)num_taps; \ } else { \ + const int num_taps = 2; \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 16; \ dst += 16; \ w -= 16; \ } \ while (w >= 8) { \ vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 8; \ dst += 8; \ w -= 8; \ } \ while (w >= 4) { \ vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ + src_start, src_stride, dst, dst_stride, h, filter_row, bd); \ src += 4; \ dst += 4; \ w -= 4; \ } \ + (void)num_taps; \ } \ } \ if (w) { \ @@ -166,7 +231,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, } \ } -#define HIGH_FUN_CONV_2D(avg, opt) \ +#define HIGH_FUN_CONV_2D(avg, opt, is_avg) \ void vpx_highbd_convolve8_##avg##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ @@ -175,7 +240,8 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, assert(w <= 64); \ assert(h <= 64); \ if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + if ((filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]) || \ + filter_x[3] == 128) { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ fdata2, 64, filter, x0_q4, x_step_q4, \ @@ -183,6 +249,16 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, vpx_highbd_convolve8_##avg##vert_##opt( \ fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ y0_q4, y_step_q4, w, h, bd); \ + } else if (filter_x[2] | filter_x[5]) { \ + const int num_taps = is_avg ? 8 : 4; \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt( \ + src - (num_taps / 2 - 1) * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h + num_taps - 1, \ + bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 64 * (num_taps / 2 - 1), 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } else { \ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ @@ -198,6 +274,6 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, bd); \ } \ } -#endif // CONFIG_VP9_HIGHBITDEPTH -#endif // VPX_DSP_X86_CONVOLVE_H_ +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // VPX_VPX_DSP_X86_CONVOLVE_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve_avx2.h b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h index bc96b738f46..99bc9637fcb 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve_avx2.h +++ b/libs/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ -#define VPX_DSP_X86_CONVOLVE_AVX2_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ #include <immintrin.h> // AVX2 @@ -100,6 +100,63 @@ static INLINE __m128i convolve8_8_avx2(const __m256i *const s, return sum1; } +static INLINE __m256i mm256_loadu2_si128(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadu_si128((const __m128i *)hi), 1); +} + +static INLINE __m256i mm256_loadu2_epi64(const void *lo, const void *hi) { + const __m256i tmp = + _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)lo)); + return _mm256_inserti128_si256(tmp, _mm_loadl_epi64((const __m128i *)hi), 1); +} + +static INLINE void mm256_store2_si128(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_store_si128(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_store_si128(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + _mm_storel_epi64(dst_ptr_1, _mm256_castsi256_si128(*src)); + _mm_storel_epi64(dst_ptr_2, _mm256_extractf128_si256(*src, 1)); +} + +static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, + __m128i *const dst_ptr_2, + const __m256i *const src) { + *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src)); + *((uint32_t *)(dst_ptr_2)) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); +} + +static INLINE __m256i mm256_round_epi32(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth); + return _mm256_srai_epi32(nearest_src, depth); +} + +static INLINE __m256i mm256_round_epi16(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_adds_epi16(*src, *half_depth); + return _mm256_srai_epi16(nearest_src, depth); +} + +static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0, + const __m256i *const src_1, + const __m256i *const ker_0, + const __m256i *const ker_1) { + const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0); + const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1); + return _mm256_add_epi32(tmp_0, tmp_1); +} + #undef MM256_BROADCASTSI128_SI256 -#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_ +#endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve_sse2.h b/libs/libvpx/vpx_dsp/x86/convolve_sse2.h new file mode 100644 index 00000000000..84435463949 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/convolve_sse2.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// the values at indices 2 and 3 to produce 3 2 3 2 3 2 3 2 as 16-bit words. +static INLINE __m128i extract_quarter_2_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpacklo_epi32(*reg, *reg); + return _mm_unpackhi_epi64(tmp, tmp); +} + +// Interprets the input register as 16-bit words 7 6 5 4 3 2 1 0, then returns +// the values at indices 4 and 5 to produce 5 4 5 4 5 4 5 4 as 16-bit words. +static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { + __m128i tmp = _mm_unpackhi_epi32(*reg, *reg); + return _mm_unpacklo_epi64(tmp, tmp); +} + +// Interprets src as 8-bit words, zero extends to form 16-bit words, then +// multiplies with ker and adds the adjacent results to form 32-bit words. +// Finally adds the results from 1 and 2 together. +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); + const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); + const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(src_2_half, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +// Interprets src as 16-bit words, then multiplies with ker and adds the +// adjacent results to form 32-bit words. Finally adds the results from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { + const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); + const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); + return _mm_packs_epi32(madd_1, madd_2); +} + +// Interleaves src_1 and src_2 +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { + const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); + const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); + return _mm_packs_epi32(tmp_1, tmp_2); +} + +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); + return _mm_srai_epi16(nearest_src, depth); +} + +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h index e5d452f99ea..8a4b1651336 100644 --- a/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h +++ b/libs/libvpx/vpx_dsp/x86/convolve_ssse3.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree.
*/ -#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_ -#define VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#ifndef VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#define VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ #include #include // SSSE3 @@ -109,4 +109,4 @@ static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, return temp; } -#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#endif // VPX_VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm index 97cb43b6711..9d8e5e3e090 100644 --- a/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -232,237 +232,6 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_sse2(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) -extern sym(vpx_rv) -global sym(vpx_mbpost_proc_down_sse2) PRIVATE -sym(vpx_mbpost_proc_down_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 128+16 - - ; unsigned char d[16][8] at [rsp] - ; create flimit2 at [rsp+128] - mov eax, dword ptr arg(4) ;flimit - mov [rsp+128], eax - mov [rsp+128+4], eax - mov [rsp+128+8], eax - mov [rsp+128+12], eax -%define flimit4 [rsp+128] - -%if ABI_IS_32BIT=0 - lea r8, [GLOBAL(sym(vpx_rv))] -%endif - - ;rows +=8; - add dword arg(2), 8 - - ;for(c=0; c - #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" // ----------------------------------------------------------------------------- // Copy and average @@ -20,7 +20,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int width, int h, int bd) { + int w, int h, int bd) { (void)filter; (void)x0_q4; (void)x_step_q4; @@ -28,8 +28,8 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, (void)y_step_q4; (void)bd; - assert(width % 4 == 0); - if (width > 32) { // width = 64 + assert(w % 4 == 0); + if (w > 32) { // w = 64 do { const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); @@ -43,7 +43,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 16) { // width = 32 + } else if (w > 16) { // w = 32 do { const __m256i p0 = _mm256_loadu_si256((const __m256i *)src); const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16)); @@ -53,7 +53,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 8) { // width = 16 + } else if (w > 8) { // w = 16 __m256i p0, p1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -67,7 +67,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h -= 2; } while (h > 0); - } else if (width > 4) { // width = 8 + } else if (w > 4) { // w = 8 __m128i p0, p1; do { p0 = _mm_loadu_si128((const __m128i *)src); @@ -81,7 +81,7 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h -= 2; } while (h > 0); - } else { // width = 4 + } else { // w = 4 __m128i p0, p1; do { p0 = _mm_loadl_epi64((const __m128i *)src); @@ -102,7 +102,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t 
dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, - int width, int h, int bd) { + int w, int h, int bd) { (void)filter; (void)x0_q4; (void)x_step_q4; @@ -110,8 +110,8 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, (void)y_step_q4; (void)bd; - assert(width % 4 == 0); - if (width > 32) { // width = 64 + assert(w % 4 == 0); + if (w > 32) { // w = 64 __m256i p0, p1, p2, p3, u0, u1, u2, u3; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -130,7 +130,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 16) { // width = 32 + } else if (w > 16) { // w = 32 __m256i p0, p1, u0, u1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -143,7 +143,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride; h--; } while (h > 0); - } else if (width > 8) { // width = 16 + } else if (w > 8) { // w = 16 __m256i p0, p1, u0, u1; do { p0 = _mm256_loadu_si256((const __m256i *)src); @@ -158,7 +158,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride << 1; h -= 2; } while (h > 0); - } else if (width > 4) { // width = 8 + } else if (w > 4) { // w = 8 __m128i p0, p1, u0, u1; do { p0 = _mm_loadu_si128((const __m128i *)src); @@ -172,7 +172,7 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, dst += dst_stride << 1; h -= 2; } while (h > 0); - } else { // width = 4 + } else { // w = 4 __m128i p0, p1, u0, u1; do { p0 = _mm_loadl_epi64((const __m128i *)src); @@ -192,8 +192,6 @@ void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, // ----------------------------------------------------------------------------- // Horizontal and vertical filtering -#define CONV8_ROUNDING_BITS (7) - static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 }; @@ -210,6 +208,9 @@ static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + // ----------------------------------------------------------------------------- // Horizontal Filtering @@ -923,6 +924,196 @@ static void vpx_highbd_filter_block1d16_h8_avg_avx2( } while (height > 0); } +static void vpx_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We extract the middle four elements of the kernel into two registers in + // the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add on the two + // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we + // can do this two rows at a time. 
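// A scalar model of one output pixel on this 4-tap path (illustrative helper,
// assuming <stdint.h> types): only taps k[2..5] are nonzero, the source
// window is s[-1..2] around the output position, and the 32-bit sum is
// rounded by CONV8_ROUNDING_BITS (7) and clamped to [0, (1 << bd) - 1],
// mirroring the packus/min at the end of the vector loop below.
#include <stdint.h>
static uint16_t highbd_filter4_pixel_sketch(const uint16_t *s,
                                            const int16_t *kernel, int bd) {
  /* madd pairs (s[-1], s[0])*(k2, k3) and (s[1], s[2])*(k4, k5), then add. */
  const int32_t sum = s[-1] * kernel[2] + s[0] * kernel[3] +
                      s[1] * kernel[4] + s[2] * kernel[5];
  const int32_t rounded = (sum + (1 << 6)) >> 7; /* CONV8_ROUNDING_NUM = 64 */
  const int32_t max = (1 << bd) - 1;
  return (uint16_t)(rounded < 0 ? 0 : (rounded > max ? max : rounded));
}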
+ + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + } +} + +static void vpx_highbd_filter_block1d8_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will extract the middle four elements of the kernel into two registers + // in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum of the first half. + // Calling add gives us the first half of the output. Repeat again to get the + // whole output. Since avx2 allows us to use a 256-bit buffer, we can do this + // two rows at a time.
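// The mm256_round_epi32() calls in the body below reduce to plain
// round-half-up followed by an arithmetic shift; a one-lane scalar
// equivalent (sketch only, helper name is illustrative):
#include <stdint.h>
static int32_t round_shift_sketch(int32_t v, int bits) {
  /* bits is CONV8_ROUNDING_BITS (7) here, so the bias is 64 */
  return (v + (1 << (bits - 1))) >> bits;
}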
+ + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg, res_first, res_last; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for first half + res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for second half + res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round each result + res_first = mm256_round_epi32(&res_first, &reg_round, CONV8_ROUNDING_BITS); + res_last = mm256_round_epi32(&res_last, &reg_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_first, res_last); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + 4), &res_reg); + } +} + +static void vpx_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + static void vpx_highbd_filter_block1d8_v8_avg_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch,
uint32_t height, const int16_t *filter, int bd) { @@ -1058,39 +1249,235 @@ static void vpx_highbd_filter_block1d8_v2_avg_avx2( } while (height > 0); } -void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); +static void vpx_highbd_filter_block1d4_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get a partial + // output. Then we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + // Result after multiply and add + __m256i res_reg; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + + // Output + res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg = mm256_round_epi32(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + 
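// Reuse the interleaved rows: the bottom pair from this iteration becomes + // the top pair of the next, so only two new rows are loaded each time. +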
src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get a partial + // output. Then we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel + + // Result after multiply and add + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23); + + // Output from first half + res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo, + &kernel_reg_23, &kernel_reg_45); + + // Output from second half + res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg_lo = + mm256_round_epi32(&res_reg_lo, &reg_round, CONV8_ROUNDING_BITS); + res_reg_hi = + mm256_round_epi32(&res_reg_hi, &reg_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void 
vpx_highbd_filter_block1d16_v4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; + #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); -HIGH_FUN_CONV_2D(, avx2); - -void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); +// Use the [vh]8 version because there is no [vh]4 implementation. +#define vpx_highbd_filter_block1d16_v4_avg_avx2 \ + vpx_highbd_filter_block1d16_v8_avg_avx2 +#define vpx_highbd_filter_block1d16_h4_avg_avx2 \ + vpx_highbd_filter_block1d16_h8_avg_avx2 +#define vpx_highbd_filter_block1d8_v4_avg_avx2 \ + vpx_highbd_filter_block1d8_v8_avg_avx2 +#define vpx_highbd_filter_block1d8_h4_avg_avx2 \ + vpx_highbd_filter_block1d8_h8_avg_avx2 +#define vpx_highbd_filter_block1d4_v4_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_avx2 +#define vpx_highbd_filter_block1d4_h4_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_avx2 + +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , avx2, 0); +HIGH_FUN_CONV_2D(, avx2, 0); + +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. 
+highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; + #define vpx_highbd_filter_block1d4_h8_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_sse2 #define vpx_highbd_filter_block1d4_h2_avg_avx2 \ @@ -1100,9 +1487,9 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, - avx2); -HIGH_FUN_CONV_2D(avg_, avx2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1); +HIGH_FUN_CONV_2D(avg_, avx2, 1); #undef HIGHBD_FUNC diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c index de097c66a69..7898ee12c84 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -53,7 +53,7 @@ static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, out[15] = in[15]; } -static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/) { __m128i step1[16], step2[16]; // stage 2 @@ -233,7 +233,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, in = all[i]; highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); - highbd_idct16_4col(in); + vpx_highbd_idct16_4col_sse4_1(in); input += 4 * 16; } @@ -243,7 +243,7 @@ void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, transpose_32bit_4x4(all[1] + i, out + 4); transpose_32bit_4x4(all[2] + i, out + 8); transpose_32bit_4x4(all[3] + i, out + 12); - highbd_idct16_4col(out); + vpx_highbd_idct16_4col_sse4_1(out); for (j = 0; j < 16; ++j) { highbd_write_buffer_4(dest + j * stride, out[j], bd); diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c index 38e64f3bc94..fe74d272add 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -16,28 +16,6 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -static INLINE void highbd_idct4(__m128i *const io) { - __m128i temp[2], step[4]; - - transpose_32bit_4x4(io, io); - - // stage 1 - temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] - extend_64bit(temp[0], temp); - step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); - temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] - extend_64bit(temp[0], temp); - step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); - highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], - &step[3]); - - // stage 2 - io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] - io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] - io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] - io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] -} - void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, int stride, int bd) { __m128i io[4]; @@ -59,8 +37,8 @@ void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[0] = _mm_srai_epi16(io_short[0], 4); io[1] = _mm_srai_epi16(io_short[1], 
4); } else { - highbd_idct4(io); - highbd_idct4(io); + highbd_idct4_sse4_1(io); + highbd_idct4_sse4_1(io); io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 909a6b79485..bb7a510e15b 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -124,8 +124,8 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; diff --git a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c index ae391b2c02c..8b2e3d2415e 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -17,7 +17,7 @@ #include "vpx_dsp/x86/inv_txfm_ssse3.h" #include "vpx_dsp/x86/transpose_sse2.h" -static void highbd_idct8x8_half1d(__m128i *const io) { +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io) { __m128i step1[8], step2[8]; transpose_32bit_4x4x2(io, io); @@ -126,13 +126,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io_short[6] = _mm_packs_epi32(io[10], io[14]); io_short[7] = _mm_packs_epi32(io[11], io[15]); - idct8_sse2(io_short); - idct8_sse2(io_short); + vpx_idct8_sse2(io_short); + vpx_idct8_sse2(io_short); round_shift_8x8(io_short, io); } else { __m128i temp[4]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); @@ -142,7 +142,7 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); temp[0] = io[4]; temp[1] = io[5]; @@ -152,13 +152,13 @@ void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, io[5] = io[9]; io[6] = io[10]; io[7] = io[11]; - highbd_idct8x8_half1d(io); + vpx_highbd_idct8x8_half1d_sse4_1(io); io[8] = temp[0]; io[9] = temp[1]; io[10] = temp[2]; io[11] = temp[3]; - highbd_idct8x8_half1d(&io[8]); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); highbd_idct8x8_final_round(io); } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c index 2051381aa88..43634aea3af 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -460,7 +460,8 @@ void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, const int J = left[1]; const int K = left[2]; const int L = left[3]; - const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i XXXXXABC = _mm_castps_si128( + _mm_loadh_pi(_mm_setzero_ps(), (const __m64 *)(above - 1))); const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); diff --git 
a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c index b9dcef205bf..d673fac493d 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -170,9 +170,9 @@ void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, } } -DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 -}; +DECLARE_ALIGNED(16, static const uint8_t, + rotate_right_epu16[16]) = { 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 0, 1 }; static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { *a = _mm_shuffle_epi8(*a, *rotrw); diff --git a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm index c61b62104f8..caf506ac07e 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm @@ -256,7 +256,7 @@ cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps +cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bd movd m1, [aboveq-2] movq m0, [aboveq] pshuflw m1, m1, 0x0 @@ -264,7 +264,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps movlhps m1, m1 ; tl tl tl tl tl tl tl tl ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd psubw m0, m1 ; t1-tl t2-tl t3-tl t4-tl psllw m3, m4 pcmpeqw m2, m2 @@ -295,7 +295,7 @@ cglobal highbd_tm_predictor_4x4, 5, 5, 6, dst, stride, above, left, bps RET INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one +cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bd, one movd m1, [aboveq-2] mova m0, [aboveq] pshuflw m1, m1, 0x0 @@ -304,7 +304,7 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one pxor m3, m3 pxor m4, m4 pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 + pinsrw m4, bdd, 0 pshuflw m3, m3, 0x0 DEFINE_ARGS dst, stride, line, left punpcklqdq m3, m3 @@ -339,14 +339,14 @@ cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bd movd m2, [aboveq-2] mova m0, [aboveq] mova m1, [aboveq+16] pshuflw m2, m2, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m3, m3 - movd m4, bpsd + movd m4, bdd punpcklqdq m2, m2 psllw m3, m4 pcmpeqw m5, m5 @@ -386,7 +386,7 @@ cglobal highbd_tm_predictor_16x16, 5, 5, 8, dst, stride, above, left, bps REP_RET INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps +cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bd movd m0, [aboveq-2] mova m1, [aboveq] mova m2, [aboveq+16] @@ -395,7 +395,7 @@ cglobal highbd_tm_predictor_32x32, 5, 5, 8, dst, stride, above, left, bps pshuflw m0, m0, 0x0 ; Get the values to compute the maximum value at this bit depth pcmpeqw m5, m5 - movd m6, bpsd + movd m6, bdd psllw m5, m6 pcmpeqw m7, m7 pxor m6, m6 ; min possible value diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h index e0f7495521a..78cf9111d93 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ 
b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -19,6 +19,10 @@ #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +// Note: There is no 64-bit bit-level shifting SIMD instruction. All +// coefficients are left shifted by 2, so that dct_const_round_shift() can be +// done by right shifting 2 bytes. + static INLINE void extend_64bit(const __m128i in, __m128i *const out /*out[2]*/) { out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 @@ -397,4 +401,4 @@ static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, recon_and_store_4(out, dest, bd); } -#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h index 9c8eef40f70..f446bb13f37 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h +++ b/libs/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ -#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#ifndef VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ #include <smmintrin.h> // SSE4.1 @@ -84,4 +84,29 @@ static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, *out1 = multiplication_round_shift_sse4_1(temp, c1); } -#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +static INLINE void highbd_idct4_sse4_1(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct8x8_half1d_sse4_1(__m128i *const io); +void vpx_highbd_idct16_4col_sse4_1(__m128i *const io /*io[16]*/); + +#endif // VPX_VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index ec22db9f4cd..d265fc1a921 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -47,13 +47,13 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { // TODO(debargha, peter): Break up large functions into smaller ones // in this file.
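// The 8-bit blimit/limit/thresh inputs below are widened to the coded bit // depth: left-shifted by 2 for bd == 10 and by 4 for bd == 12, so the // comparisons operate on full-range pixel values.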
-void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0; __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0; __m128i ps1, qs1, ps0, qs0; @@ -70,35 +70,35 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, __m128i eight, four; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); } - q4 = _mm_load_si128((__m128i *)(s + 4 * p)); - p4 = _mm_load_si128((__m128i *)(s - 5 * p)); - q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - q0 = _mm_load_si128((__m128i *)(s + 0 * p)); - p0 = _mm_load_si128((__m128i *)(s - 1 * p)); + q4 = _mm_load_si128((__m128i *)(s + 4 * pitch)); + p4 = _mm_load_si128((__m128i *)(s - 5 * pitch)); + q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); + p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); // highbd_filter_mask abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); @@ -111,14 +111,14 @@ void 
vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // highbd_hev_mask (in C code this is actually called from highbd_filter4) flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2 abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2 - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); work = _mm_max_epi16( _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)), _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1))); @@ -132,7 +132,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // return ~mask // lp filter @@ -207,12 +207,12 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // (because, in both vars, each block of 16 either all 1s or all 0s) flat = _mm_and_si128(flat, mask); - p5 = _mm_load_si128((__m128i *)(s - 6 * p)); - q5 = _mm_load_si128((__m128i *)(s + 5 * p)); - p6 = _mm_load_si128((__m128i *)(s - 7 * p)); - q6 = _mm_load_si128((__m128i *)(s + 6 * p)); - p7 = _mm_load_si128((__m128i *)(s - 8 * p)); - q7 = _mm_load_si128((__m128i *)(s + 7 * p)); + p5 = _mm_load_si128((__m128i *)(s - 6 * pitch)); + q5 = _mm_load_si128((__m128i *)(s + 5 * pitch)); + p6 = _mm_load_si128((__m128i *)(s - 7 * pitch)); + q6 = _mm_load_si128((__m128i *)(s + 6 * pitch)); + p7 = _mm_load_si128((__m128i *)(s - 8 * pitch)); + q7 = _mm_load_si128((__m128i *)(s + 7 * pitch)); // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7 // but referred to as p0-p4 & q0-q4 in fn) @@ -389,8 +389,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q6 = _mm_and_si128(flat2, flat2_q6); // get values for when (flat2 && flat && mask) q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values - _mm_store_si128((__m128i *)(s - 7 * p), p6); - _mm_store_si128((__m128i *)(s + 6 * p), q6); + _mm_store_si128((__m128i *)(s - 7 * pitch), p6); + _mm_store_si128((__m128i *)(s + 6 * pitch), q6); p5 = _mm_andnot_si128(flat2, p5); // p5 remains unchanged if !(flat2 && flat && mask) @@ -404,8 +404,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, // get values for when (flat2 && flat && mask) q5 = _mm_or_si128(q5, flat2_q5); // full list of q5 values - _mm_store_si128((__m128i *)(s - 6 * p), p5); - _mm_store_si128((__m128i *)(s + 5 * p), q5); + _mm_store_si128((__m128i *)(s - 6 * pitch), p5); + _mm_store_si128((__m128i *)(s + 5 * pitch), q5); p4 = _mm_andnot_si128(flat2, p4); // p4 remains unchanged if !(flat2 && flat && mask) @@ -417,8 +417,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q4 = _mm_and_si128(flat2, flat2_q4); // get values for when (flat2 && flat && mask) q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values - _mm_store_si128((__m128i *)(s - 5 * p), p4); - _mm_store_si128((__m128i *)(s + 4 * p), q4); + _mm_store_si128((__m128i *)(s - 5 * pitch), p4); + _mm_store_si128((__m128i *)(s + 4 * pitch), q4); p3 = _mm_andnot_si128(flat2, p3); // p3 takes value from 
highbd_filter8 if !(flat2 && flat && mask) @@ -430,8 +430,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q3 = _mm_and_si128(flat2, flat2_q3); // get values for when (flat2 && flat && mask) q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values - _mm_store_si128((__m128i *)(s - 4 * p), p3); - _mm_store_si128((__m128i *)(s + 3 * p), q3); + _mm_store_si128((__m128i *)(s - 4 * pitch), p3); + _mm_store_si128((__m128i *)(s + 3 * pitch), q3); p2 = _mm_andnot_si128(flat2, p2); // p2 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -444,8 +444,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q2 = _mm_and_si128(flat2, flat2_q2); // get values for when (flat2 && flat && mask) q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); p1 = _mm_andnot_si128(flat2, p1); // p1 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -457,8 +457,8 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q1 = _mm_and_si128(flat2, flat2_q1); // get values for when (flat2 && flat && mask) q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s + 1 * p), q1); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); p0 = _mm_andnot_si128(flat2, p0); // p0 takes value from highbd_filter8 if !(flat2 && flat && mask) @@ -470,22 +470,22 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p, flat2_q0 = _mm_and_si128(flat2, flat2_q0); // get values for when (flat2 && flat && mask) q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s - 0 * p), q0); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s - 0 * pitch), q0); } -void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { - vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd); - vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + vpx_highbd_lpf_horizontal_16_sse2(s, pitch, blimit, limit, thresh, bd); + vpx_highbd_lpf_horizontal_16_sse2(s + 8, pitch, blimit, limit, thresh, bd); } -void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, flat_op2[16]); DECLARE_ALIGNED(16, uint16_t, flat_op1[16]); DECLARE_ALIGNED(16, uint16_t, flat_op0[16]); @@ -493,16 +493,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]); DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p)); - __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p)); - __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p)); - 
__m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p)); - __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p)); - __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p)); - __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p)); + __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch)); + __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * pitch)); + __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * pitch)); + __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * pitch)); + __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * pitch)); + __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * pitch)); + __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * pitch)); const __m128i one = _mm_set1_epi16(1); const __m128i ffff = _mm_cmpeq_epi16(one, one); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; @@ -519,25 +519,25 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_set1_epi16(0x200); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_set1_epi16(0x800); } @@ -553,16 +553,16 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1)); flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; 
// So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; mask = _mm_max_epi16(abs_q1q0, mask); @@ -576,7 +576,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // flat_mask4 @@ -674,7 +674,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -694,43 +694,43 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)(s - 3 * p), p2); - _mm_store_si128((__m128i *)(s - 2 * p), p1); - _mm_store_si128((__m128i *)(s - 1 * p), p0); - _mm_store_si128((__m128i *)(s + 0 * p), q0); - _mm_store_si128((__m128i *)(s + 1 * p), q1); - _mm_store_si128((__m128i *)(s + 2 * p), q2); + _mm_store_si128((__m128i *)(s - 3 * pitch), p2); + _mm_store_si128((__m128i *)(s - 2 * pitch), p1); + _mm_store_si128((__m128i *)(s - 1 * pitch), p0); + _mm_store_si128((__m128i *)(s + 0 * pitch), q0); + _mm_store_si128((__m128i *)(s + 1 * pitch), q1); + _mm_store_si128((__m128i *)(s + 2 * pitch), q2); } void vpx_highbd_lpf_horizontal_8_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } -void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, - const uint8_t *_blimit, - const uint8_t *_limit, - const uint8_t *_thresh, int bd) { +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { const __m128i zero = _mm_set1_epi16(0); - __m128i blimit, limit, thresh; + __m128i blimit_v, limit_v, thresh_v; __m128i mask, hev, flat; - __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - __m128i q3 = 
_mm_loadu_si128((__m128i *)(s + 3 * p)); + __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)); const __m128i abs_q1q0 = @@ -760,57 +760,57 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i filter1, filter2; if (bd == 8) { - blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); - limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); - thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + blimit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero); + limit_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero); + thresh_v = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero); t80 = _mm_set1_epi16(0x80); - tff80 = _mm_set1_epi16(0xff80); - tffe0 = _mm_set1_epi16(0xffe0); + tff80 = _mm_set1_epi16((int16_t)0xff80); + tffe0 = _mm_set1_epi16((int16_t)0xffe0); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); } else if (bd == 10) { - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 2); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 2); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 2); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 2); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); } else { // bd == 12 - blimit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); - limit = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); - thresh = _mm_slli_epi16( - _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + blimit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)blimit), zero), 4); + limit_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)limit), zero), 4); + thresh_v = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)thresh), zero), 4); t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); - tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); - tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16((int16_t)0xffe0), 4); t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); } - ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * 
p)), t80); - ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu16(flat, thresh); + hev = _mm_subs_epu16(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff); abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); - mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; // So taking maximums continues to work: - mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one)); + mask = _mm_and_si128(mask, _mm_adds_epu16(limit_v, one)); mask = _mm_max_epi16(flat, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; @@ -822,7 +822,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)), _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3))); mask = _mm_max_epi16(work, mask); - mask = _mm_subs_epu16(mask, limit); + mask = _mm_subs_epu16(mask, limit_v); mask = _mm_cmpeq_epi16(mask, zero); // filter4 @@ -872,18 +872,18 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } void vpx_highbd_lpf_horizontal_4_dual_sse2( - uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0, - const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1, - const uint8_t *_thresh1, int bd) { - vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd); - vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd); + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_sse2(s, pitch, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_sse2(s + 8, pitch, blimit1, limit1, thresh1, bd); } static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[], @@ -998,9 +998,9 @@ static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p, highbd_transpose(src1, in_p, dest1, out_p, 1); } -void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); 
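// Vertical filtering reuses the horizontal kernel: an 8x8 tile is transposed // into t_dst, filtered with the horizontal code, then transposed back.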
uint16_t *src[1]; uint16_t *dst[1]; @@ -1009,7 +1009,7 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1018,11 +1018,11 @@ void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_4_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1030,7 +1030,7 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1038,15 +1038,15 @@ void vpx_highbd_lpf_vertical_4_dual_sse2( src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, - const uint8_t *limit, const uint8_t *thresh, - int bd) { +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]); uint16_t *src[1]; uint16_t *dst[1]; @@ -1055,7 +1055,7 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, src[0] = s - 4; dst[0] = t_dst; - highbd_transpose(src, p, dst, 8, 1); + highbd_transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd); @@ -1064,11 +1064,11 @@ void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[0] = s - 4; // Transpose back - highbd_transpose(src, 8, dst, p, 1); + highbd_transpose(src, 8, dst, pitch, 1); } void vpx_highbd_lpf_vertical_8_dual_sse2( - uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]); @@ -1076,7 +1076,7 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( uint16_t *dst[2]; // Transpose 8x16 - highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + highbd_transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, @@ -1085,13 +1085,14 @@ void vpx_highbd_lpf_vertical_8_dual_sse2( src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - highbd_transpose(src, 16, dst, p, 2); + highbd_transpose(src, 16, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, + const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { 
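// The wide filter touches p7..q7, eight pixels on each side of the edge, so // a 16x8 block spanning the edge is transposed into t_dst before filtering.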
DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]); @@ -1104,7 +1105,7 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - highbd_transpose(src, p, dst, 8, 2); + highbd_transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh, @@ -1115,24 +1116,25 @@ void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit, dst[1] = s; // Transpose back - highbd_transpose(src, 8, dst, p, 2); + highbd_transpose(src, 8, dst, pitch, 2); } -void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p, +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { DECLARE_ALIGNED(16, uint16_t, t_dst[256]); // Transpose 16x16 - highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + highbd_transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + highbd_transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh, bd); // Transpose back - highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, + pitch); } diff --git a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index cedf98aff4b..7149e4fb740 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -11,6 +11,7 @@ #include #include +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" diff --git a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index d9a6932e0b8..cefde0f57db 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -32,12 +32,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. 
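; For reference (a sketch, not part of this interface): callers typically ; derive the variance from the two outputs as ; variance = sse - (sum * sum) / (width * height) ; with the product taken in 64 bits to avoid overflow.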
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 mova %4, %3 ; make copies to manipulate to calc sum @@ -91,81 +91,65 @@ SECTION .text %define filter_idx_shift 5 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else - cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, \ - sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, \ + ref, ref_stride, height, sse %define block_height heightd + %endif - ; Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx + ; Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx - LOAD_IF_USED 0, 1 ; load eax, ecx back - %endif + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg - cglobal highbd_sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - %endif + %define second_str second_stridemp %else cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, height, sse + x_offset, y_offset, \ + ref, ref_stride, height, sse %define block_height heightd %endif @@ -181,7 +165,7 @@ SECTION .text sar block_height, 1 %endif %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif ; FIXME(rbultje) replace by jumptable? 
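; A sketch of the bilinear step used in the filtering paths below: the taps ; fa and fb from bilin_filter_m sum to 16, pw_8 supplies the rounding term, ; and each output is out = (fa * s[i] + fb * s[i + 1] + 8) >> 4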
@@ -196,35 +180,35 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m2, [srcq + 16] - mova m1, [dstq] - mova m3, [dstq + 16] + mova m1, [refq] + mova m3, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m2, [secq+16] + pavgw m0, [second_predq] + pavgw m2, [second_predq+16] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m2, [srcq + src_strideq*2] - mova m1, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m1, [refq] + mova m3, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -242,40 +226,40 @@ SECTION .text movu m1, [srcq+16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pavgw m0, m1 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -284,14 +268,14 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -308,7 +292,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -318,8 +302,8 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*2+16] - mova m2, [dstq] - mova m3, [dstq+16] + mova m2, [refq] + mova m3, [refq+16] ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). 
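Making the algebra behind that FIXME explicit (as written, the second form's (num-x) factor appears to be a slip; the identity needs x): (num-x)*in1 + x*in2 = num*in1 + x*(in2-in1), and because num*in1 is an exact multiple of num = 2^log2(num), the rounding shift distributes, giving out = in1 + ((x*(in2-in1) + rnd) >> log2(num)). A small self-check, assuming an arithmetic (flooring) right shift as the SIMD code uses:

#include <assert.h>

static int blend_two_muls(int in1, int in2, int x, int num, int log2num,
                          int rnd) {
  return ((num - x) * in1 + x * in2 + rnd) >> log2num;
}

static int blend_one_mul(int in1, int in2, int x, int num, int log2num,
                         int rnd) {
  /* num*in1 is exactly divisible by num, so it moves outside the shift. */
  return in1 + ((x * (in2 - in1) + rnd) >> log2num);
}

int main(void) {
  int in1, in2, x;
  for (in1 = 0; in1 < 256; ++in1)
    for (in2 = 0; in2 < 256; ++in2)
      for (x = 0; x <= 16; ++x)
        assert(blend_two_muls(in1, in2, x, 16, 4, 8) ==
               blend_one_mul(in1, in2, x, 16, 4, 8));
  return 0;
}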
Total number of ; instructions is the same (5), but it is 1 mul instead of 2, so might be @@ -336,23 +320,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m5, [srcq+src_strideq*4] mova m4, m1 - mova m2, [dstq] - mova m3, [dstq+dst_strideq*2] + mova m2, [refq] + mova m3, [refq+ref_strideq*2] pmullw m1, filter_y_a pmullw m5, filter_y_b paddw m1, filter_rnd @@ -364,16 +348,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -397,41 +381,41 @@ SECTION .text movu m1, [srcq + 16] movu m4, [srcq + 2] movu m5, [srcq + 18] - mova m2, [dstq] - mova m3, [dstq + 16] + mova m2, [refq] + mova m3, [refq + 16] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq + src_strideq*2] movu m4, [srcq + 2] movu m5, [srcq + src_strideq*2 + 2] - mova m2, [dstq] - mova m3, [dstq + dst_strideq*2] + mova m2, [refq] + mova m3, [refq + ref_strideq*2] pavgw m0, m4 pavgw m1, m5 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m2, m1, m3, m6, m7 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -460,20 +444,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m1, m3 - mova m4, [dstq] - mova m5, [dstq + 16] + mova m4, [refq] + mova m5, [refq + 16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -489,20 +473,20 @@ SECTION .text pavgw m3, m5 pavgw m0, m2 pavgw m2, m3 - mova m4, [dstq] - mova m5, [dstq + dst_strideq*2] + mova m4, [refq] + mova m5, [refq + ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -511,14 
+495,14 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+y_offsetq] mova m9, [bilin_filter+y_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -535,7 +519,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -565,21 +549,21 @@ SECTION .text paddw m0, filter_rnd psrlw m1, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 lea srcq, [srcq + src_strideq*2] - lea dstq, [dstq + dst_strideq*2] + lea refq, [refq + ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -604,21 +588,21 @@ SECTION .text paddw m0, filter_rnd psrlw m4, 4 paddw m0, m2 - mova m2, [dstq] + mova m2, [refq] psrlw m0, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 lea srcq, [srcq + src_strideq*4] - lea dstq, [dstq + dst_strideq*4] + lea refq, [refq + ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -633,14 +617,14 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -657,7 +641,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -667,8 +651,8 @@ SECTION .text movu m1, [srcq+16] movu m2, [srcq+2] movu m3, [srcq+18] - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -680,23 +664,23 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] movu m1, [srcq+src_strideq*2] movu m2, [srcq+2] movu m3, [srcq+src_strideq*2+2] - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] pmullw m1, filter_x_a pmullw m3, filter_x_b paddw m1, filter_rnd @@ -708,16 +692,16 @@ SECTION .text psrlw m1, 4 psrlw m0, 4 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m1, [secq] + 
pavgw m0, [second_predq] + add second_predq, second_str + pavgw m1, [second_predq] %endif SUM_SSE m0, m4, m1, m5, m6, m7 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -732,14 +716,14 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && mmsize == 16 mova m8, [bilin_filter+x_offsetq] mova m9, [bilin_filter+x_offsetq+16] - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -756,7 +740,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -789,24 +773,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+16] + mova m4, [refq] + mova m5, [refq+16] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m1, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m4, m1, m5, m6, m7 mova m0, m2 mova m1, m3 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -830,24 +814,24 @@ SECTION .text paddw m3, filter_rnd paddw m2, m4 paddw m3, m5 - mova m4, [dstq] - mova m5, [dstq+dst_strideq*2] + mova m4, [refq] + mova m5, [refq+ref_strideq*2] psrlw m2, 4 psrlw m3, 4 pavgw m0, m2 pavgw m2, m3 %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m2, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m2, [second_predq] %endif SUM_SSE m0, m4, m2, m5, m6, m7 mova m0, m3 lea srcq, [srcq+src_strideq*4] - lea dstq, [dstq+dst_strideq*4] + lea refq, [refq+ref_strideq*4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height @@ -859,8 +843,8 @@ SECTION .text .x_nonhalf_y_nonhalf: ; loading filter - this is same as in 8-bit depth -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5 shl y_offsetd, filter_idx_shift @@ -869,7 +853,7 @@ SECTION .text mova m9, [bilin_filter+x_offsetq+16] mova m10, [bilin_filter+y_offsetq] mova m11, [bilin_filter+y_offsetq+16] - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -897,7 +881,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif ; end of load filter @@ -945,23 +929,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m1, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m1, m3 psrlw m0, 4 psrlw m1, 4 - mova m3, [dstq+16] + mova m3, [refq+16] %if %2 == 1 ; avg - pavgw m0, [secq] - pavgw m1, [secq+16] + pavgw m0, [second_predq] + pavgw m1, [second_predq+16] %endif SUM_SSE m0, m2, m1, m3, m6, m7 mova m0, m4 mova m1, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 2] + lea refq, [refq + ref_strideq * 2] %if %2 == 1 ; avg - add secq, sec_str + 
add second_predq, second_str %endif %else ; %1 < 16 movu m0, [srcq] @@ -999,23 +983,23 @@ SECTION .text pmullw m3, filter_y_b paddw m0, m2 paddw m4, filter_rnd - mova m2, [dstq] + mova m2, [refq] paddw m4, m3 psrlw m0, 4 psrlw m4, 4 - mova m3, [dstq+dst_strideq*2] + mova m3, [refq+ref_strideq*2] %if %2 == 1 ; avg - pavgw m0, [secq] - add secq, sec_str - pavgw m4, [secq] + pavgw m0, [second_predq] + add second_predq, second_str + pavgw m4, [second_predq] %endif SUM_SSE m0, m2, m4, m3, m6, m7 mova m0, m5 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq + dst_strideq * 4] + lea refq, [refq + ref_strideq * 4] %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif %endif dec block_height diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index e646767e190..a256a59ec0e 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -16,9 +16,9 @@ SECTION .text ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) @@ -36,8 +36,8 @@ sym(vpx_highbd_calc16x16var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes @@ -169,9 +169,9 @@ sym(vpx_highbd_calc16x16var_sse2): ;unsigned int vpx_highbd_calc8x8var_sse2 ;( ; unsigned char * src_ptr, -; int source_stride, +; int src_stride, ; unsigned char * ref_ptr, -; int recon_stride, +; int ref_stride, ; unsigned int * SSE, ; int * Sum ;) @@ -189,8 +189,8 @@ sym(vpx_highbd_calc8x8var_sse2): mov rsi, arg(0) ;[src_ptr] mov rdi, arg(2) ;[ref_ptr] - movsxd rax, DWORD PTR arg(1) ;[source_stride] - movsxd rdx, DWORD PTR arg(3) ;[recon_stride] + movsxd rax, DWORD PTR arg(1) ;[src_stride] + movsxd rdx, DWORD PTR arg(3) ;[ref_stride] add rax, rax ; source stride in bytes add rdx, rdx ; recon stride in bytes diff --git a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c index a6f7c3d25df..dd6cfbb2c4c 100644 --- a/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/highbd_variance_sse2.c @@ -7,8 +7,9 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "./vpx_config.h" +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, @@ -89,9 +90,9 @@ static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, } #define HIGH_GET_VAR(S) \ - void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \ - const uint8_t *ref8, int ref_stride, \ - uint32_t *sse, int *sum) { \ + void vpx_highbd_8_get##S##x##S##var_sse2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse, int *sum) { \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \ @@ -135,7 +136,7 @@ HIGH_GET_VAR(8); highbd_8_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> (shift)); \ } \ \ uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \ @@ -148,7 +149,7 @@ HIGH_GET_VAR(8); highbd_10_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ return (var >= 0) ? (uint32_t)var : 0; \ } \ \ @@ -162,7 +163,7 @@ HIGH_GET_VAR(8); highbd_12_variance_sse2( \ src, src_stride, ref, ref_stride, w, h, sse, &sum, \ vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> (shift)); \ return (var >= 0) ? 
(uint32_t)var : 0; \ } @@ -251,7 +252,7 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, #define DECL(w, opt) \ int vpx_highbd_sub_pixel_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, int height, \ + const uint16_t *ref, ptrdiff_t ref_stride, int height, \ unsigned int *sse, void *unused0, void *unused); #define DECLS(opt) \ DECL(8, opt); \ @@ -265,28 +266,28 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ unsigned int sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -298,29 +299,29 @@ DECLS(sse2); \ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, h, &sse, NULL, \ NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, h, \ &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -335,40 +336,40 @@ DECLS(sse2); \ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \ const uint8_t *src8, int 
src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr) { \ int start_row; \ uint32_t sse; \ int se = 0; \ int64_t var; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \ + ref + (start_row * ref_stride), ref_stride, height, &sse2, NULL, \ NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, height, \ &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ @@ -404,8 +405,8 @@ FNS(sse2); #define DECL(w, opt) \ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt( \ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ + const uint16_t *ref, ptrdiff_t ref_stride, const uint16_t *second, \ + ptrdiff_t second_stride, int height, unsigned int *sse, void *unused0, \ void *unused); #define DECLS(opt1) \ DECL(16, opt1) \ @@ -418,30 +419,30 @@ DECLS(sse2); #define FN(w, h, wf, wlog2, hlog2, opt, cast) \ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, 
x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -453,31 +454,31 @@ DECLS(sse2); \ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int64_t var; \ uint32_t sse; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ + src, src_stride, x_offset, y_offset, ref, ref_stride, sec, w, h, &sse, \ NULL, NULL); \ if (w > wf) { \ uint32_t sse2; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ + src + 16, src_stride, x_offset, y_offset, ref + 16, ref_stride, \ sec + 16, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ + src + 32, src_stride, x_offset, y_offset, ref + 32, ref_stride, \ sec + 32, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ + src + 48, src_stride, x_offset, y_offset, ref + 48, ref_stride, \ sec + 48, w, h, &sse2, NULL, NULL); \ se += se2; \ sse += sse2; \ @@ -492,7 +493,7 @@ DECLS(sse2); \ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \ + const uint8_t *ref8, int ref_stride, uint32_t *sse_ptr, \ const uint8_t *sec8) { \ int start_row; \ int64_t var; \ @@ -500,34 +501,34 @@ DECLS(sse2); int se = 0; \ uint64_t long_sse = 0; \ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \ for (start_row = 0; start_row < h; start_row += 16) { \ uint32_t sse2; \ int height = h - start_row < 16 ? 
h - start_row : 16; \ int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + (start_row * src_stride), src_stride, x_offset, y_offset, \ - dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \ + ref + (start_row * ref_stride), ref_stride, sec + (start_row * w), \ w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 16 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 16 + (start_row * ref_stride), ref_stride, \ sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ if (w > wf * 2) { \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 32 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 32 + (start_row * ref_stride), ref_stride, \ sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \ src + 48 + (start_row * src_stride), src_stride, x_offset, \ - y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \ + y_offset, ref + 48 + (start_row * ref_stride), ref_stride, \ sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \ se += se2; \ long_sse += sse2; \ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index f6e56b6f9ec..4b02da96664 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -100,49 +100,44 @@ void idct4_sse2(__m128i *const in) { } void iadst4_sse2(__m128i *const in) { - const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); - const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); - const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8], in7; - - transpose_16bit_4(in); - in7 = _mm_srli_si128(in[1], 8); - in7 = _mm_add_epi16(in7, in[0]); - in7 = _mm_sub_epi16(in7, in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpackhi_epi16(in[0], in[1]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpackhi_epi16(in[0], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4 - v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2 + const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9); + const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9); + const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9); + const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9); + const __m128i k__sinpi_12_n3 = + pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9); + __m128i u[4], v[5]; + + // 00 01 20 21 02 03 22 23 + // 10 11 30 31 12 13 32 33 + const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]); + + // 00 01 10 11 20 21 30 31 + // 02 03 12 13 22 23 32 33 + in[0] = _mm_unpacklo_epi32(tr0_0, 
tr0_1); + in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1); + + v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3); // s_1 * x0 + s_3 * x1 + v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2); // s_4 * x2 + s_2 * x3 + v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3); // s_2 * x0 + s_3 * x1 + v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4); // s_1 * x2 + s_4 * x3 + v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3); // (s_1 + s_2) * x0 - s_3 * x1 + in[0] = _mm_sub_epi16(in[0], in[1]); // x0 - x2 + in[1] = _mm_srli_epi32(in[1], 16); + in[0] = _mm_add_epi16(in[0], in[1]); + in[0] = _mm_slli_epi32(in[0], 16); // x0 - x2 + x3 u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_add_epi32(v[3], v[4]); - u[2] = v[2]; - u[3] = _mm_add_epi32(u[0], u[1]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_add_epi32(u[3], v[5]); - u[6] = _mm_sub_epi32(u[5], u[4]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); + u[1] = _mm_sub_epi32(v[2], v[3]); + u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3); + u[3] = _mm_sub_epi32(v[1], v[3]); + u[3] = _mm_add_epi32(u[3], v[4]); + + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); in[0] = _mm_packs_epi32(u[0], u[1]); in[1] = _mm_packs_epi32(u[2], u[3]); @@ -170,7 +165,7 @@ void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, // 2-D for (i = 0; i < 2; i++) { - idct8_sse2(in); + vpx_idct8_sse2(in); } write_buffer_8x8(in, dest, stride); @@ -226,7 +221,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, recon_and_store_8_dual(dest, dc_value, stride); } -void idct8_sse2(__m128i *const in) { +void vpx_idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() transpose_16bit_8x8(in, in); @@ -248,191 +243,149 @@ void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; + const __m128i kZero = _mm_set1_epi16(0); + __m128i s[8], u[16], v[8], w[16]; // transpose transpose_16bit_8x8(in, in); - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - // column transformation // stage 1 // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, 
k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); + s[0] = _mm_unpacklo_epi16(in[7], in[0]); + s[1] = _mm_unpackhi_epi16(in[7], in[0]); + s[2] = _mm_unpacklo_epi16(in[5], in[2]); + s[3] = _mm_unpackhi_epi16(in[5], in[2]); + s[4] = _mm_unpacklo_epi16(in[3], in[4]); + s[5] = _mm_unpackhi_epi16(in[3], in[4]); + s[6] = _mm_unpacklo_epi16(in[1], in[6]); + s[7] = _mm_unpackhi_epi16(in[1], in[6]); + + u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30); + u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30); + u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02); + u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02); + u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22); + u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22); + u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10); + u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10); + u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14); + u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14); + u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18); + u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18); + u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06); + u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06); + u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26); + u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26); // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); + w[0] = _mm_add_epi32(u[0], u[8]); + w[1] = _mm_add_epi32(u[1], u[9]); + w[2] = _mm_add_epi32(u[2], u[10]); + w[3] = _mm_add_epi32(u[3], u[11]); + w[4] = _mm_add_epi32(u[4], u[12]); + w[5] = _mm_add_epi32(u[5], u[13]); + w[6] = _mm_add_epi32(u[6], u[14]); + w[7] = _mm_add_epi32(u[7], u[15]); + w[8] = _mm_sub_epi32(u[0], u[8]); + w[9] = _mm_sub_epi32(u[1], u[9]); + w[10] = _mm_sub_epi32(u[2], u[10]); + w[11] = _mm_sub_epi32(u[3], u[11]); + w[12] = _mm_sub_epi32(u[4], u[12]); + w[13] = _mm_sub_epi32(u[5], u[13]); + w[14] = _mm_sub_epi32(u[6], u[14]); + w[15] = _mm_sub_epi32(u[7], u[15]); // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); 
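The long add-then-shift runs being removed here (and in the stages below) are what the new dct_const_round_shift_sse2() helper factors out. Its definition is not part of this hunk; a plausible sketch, using libvpx's DCT_CONST_BITS = 14 convention:

#include <emmintrin.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Round each packed 32-bit lane and shift back down: the pattern the
 * deleted _mm_add_epi32/_mm_srai_epi32 pairs spelled out by hand. */
static __m128i dct_const_round_shift_sse2(const __m128i in) {
  const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
  return _mm_srai_epi32(t, DCT_CONST_BITS);
}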
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); + u[8] = dct_const_round_shift_sse2(w[8]); + u[9] = dct_const_round_shift_sse2(w[9]); + u[10] = dct_const_round_shift_sse2(w[10]); + u[11] = dct_const_round_shift_sse2(w[11]); + u[12] = dct_const_round_shift_sse2(w[12]); + u[13] = dct_const_round_shift_sse2(w[13]); + u[14] = dct_const_round_shift_sse2(w[14]); + u[15] = dct_const_round_shift_sse2(w[15]); // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); + in[0] = _mm_packs_epi32(u[0], u[1]); + in[1] = _mm_packs_epi32(u[2], u[3]); + in[2] = _mm_packs_epi32(u[4], u[5]); + in[3] = _mm_packs_epi32(u[6], u[7]); + in[4] = _mm_packs_epi32(u[8], u[9]); + in[5] = _mm_packs_epi32(u[10], u[11]); + in[6] = _mm_packs_epi32(u[12], u[13]); + in[7] = _mm_packs_epi32(u[14], u[15]); // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, 
k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + s[0] = _mm_add_epi16(in[0], in[2]); + s[1] = _mm_add_epi16(in[1], in[3]); + s[2] = _mm_sub_epi16(in[0], in[2]); + s[3] = _mm_sub_epi16(in[1], in[3]); + u[0] = _mm_unpacklo_epi16(in[4], in[5]); + u[1] = _mm_unpackhi_epi16(in[4], in[5]); + u[2] = _mm_unpacklo_epi16(in[6], in[7]); + u[3] = _mm_unpackhi_epi16(in[6], in[7]); + + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); + + w[0] = _mm_add_epi32(v[0], v[4]); + w[1] = _mm_add_epi32(v[1], v[5]); + w[2] = _mm_add_epi32(v[2], v[6]); + w[3] = _mm_add_epi32(v[3], v[7]); + w[4] = _mm_sub_epi32(v[0], v[4]); + w[5] = _mm_sub_epi32(v[1], v[5]); + w[6] = _mm_sub_epi32(v[2], v[6]); + w[7] = _mm_sub_epi32(v[3], v[7]); + + u[0] = dct_const_round_shift_sse2(w[0]); + u[1] = dct_const_round_shift_sse2(w[1]); + u[2] = dct_const_round_shift_sse2(w[2]); + u[3] = dct_const_round_shift_sse2(w[3]); + u[4] = dct_const_round_shift_sse2(w[4]); + u[5] = dct_const_round_shift_sse2(w[5]); + u[6] = dct_const_round_shift_sse2(w[6]); + u[7] = dct_const_round_shift_sse2(w[7]); // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); + s[4] = _mm_packs_epi32(u[0], u[1]); + s[5] = _mm_packs_epi32(u[2], u[3]); + s[6] = _mm_packs_epi32(u[4], u[5]); + s[7] = _mm_packs_epi32(u[6], u[7]); // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16); - s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16); - s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16); - s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16); - - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); + u[0] = _mm_unpacklo_epi16(s[2], s[3]); + u[1] = _mm_unpackhi_epi16(s[2], s[3]); + u[2] = _mm_unpacklo_epi16(s[6], s[7]); + u[3] = _mm_unpackhi_epi16(s[6], s[7]); + + s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16); + s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16); + + in[0] = s[0]; + in[1] = _mm_sub_epi16(kZero, s[4]); + in[2] = s[6]; + in[3] = _mm_sub_epi16(kZero, s[2]); + in[4] = s[3]; + in[5] = _mm_sub_epi16(kZero, s[7]); + in[6] = s[5]; + in[7] = _mm_sub_epi16(kZero, s[1]); } static INLINE void idct16_load8x8(const tran_low_t *const input, @@ -561,7 +514,7 @@ void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, } } -static void iadst16_8col(__m128i *const in) { +void vpx_iadst16_8col_sse2(__m128i *const in) { // perform 16x16 1-D ADST 
for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -593,7 +546,6 @@ static void iadst16_8col(__m128i *const in) { const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i kZero = _mm_set1_epi16(0); u[0] = _mm_unpacklo_epi16(in[15], in[0]); @@ -679,71 +631,38 @@ static void iadst16_8col(__m128i *const in) { u[30] = _mm_sub_epi32(v[14], v[30]); u[31] = _mm_sub_epi32(v[15], v[31]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 
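With the numbered scalars replaced by arrays, the unrolled 32-lane rounding pass being converted here could equally be written as a loop and left to the compiler to unroll; the file keeps every lane explicit, which diffs more directly against the scalar reference code. A self-contained sketch of the loop form, reusing the helper shape outlined earlier:

#include <emmintrin.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

static __m128i dct_const_round_shift_sse2(const __m128i in) {
  return _mm_srai_epi32(
      _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING)), DCT_CONST_BITS);
}

/* Loop form of an n-lane rounding pass such as the one in this hunk. */
static void round_shift_lanes(__m128i *lanes, int n) {
  int i;
  for (i = 0; i < n; ++i) lanes[i] = dct_const_round_shift_sse2(lanes[i]);
}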
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); + u[16] = dct_const_round_shift_sse2(u[16]); + u[17] = dct_const_round_shift_sse2(u[17]); + u[18] = dct_const_round_shift_sse2(u[18]); + u[19] = dct_const_round_shift_sse2(u[19]); + u[20] = dct_const_round_shift_sse2(u[20]); + u[21] = dct_const_round_shift_sse2(u[21]); + u[22] = dct_const_round_shift_sse2(u[22]); + u[23] = dct_const_round_shift_sse2(u[23]); + u[24] = dct_const_round_shift_sse2(u[24]); + u[25] = dct_const_round_shift_sse2(u[25]); + u[26] = dct_const_round_shift_sse2(u[26]); + u[27] = dct_const_round_shift_sse2(u[27]); + u[28] = dct_const_round_shift_sse2(u[28]); + u[29] = dct_const_round_shift_sse2(u[29]); + u[30] = dct_const_round_shift_sse2(u[30]); + u[31] = dct_const_round_shift_sse2(u[31]); s[0] = _mm_packs_epi32(u[0], u[1]); s[1] = _mm_packs_epi32(u[2], u[3]); @@ -806,39 +725,22 @@ static void iadst16_8col(__m128i *const in) { u[14] = _mm_sub_epi32(v[6], v[14]); u[15] = _mm_sub_epi32(v[7], v[15]); - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - 
u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); + u[0] = dct_const_round_shift_sse2(u[0]); + u[1] = dct_const_round_shift_sse2(u[1]); + u[2] = dct_const_round_shift_sse2(u[2]); + u[3] = dct_const_round_shift_sse2(u[3]); + u[4] = dct_const_round_shift_sse2(u[4]); + u[5] = dct_const_round_shift_sse2(u[5]); + u[6] = dct_const_round_shift_sse2(u[6]); + u[7] = dct_const_round_shift_sse2(u[7]); + u[8] = dct_const_round_shift_sse2(u[8]); + u[9] = dct_const_round_shift_sse2(u[9]); + u[10] = dct_const_round_shift_sse2(u[10]); + u[11] = dct_const_round_shift_sse2(u[11]); + u[12] = dct_const_round_shift_sse2(u[12]); + u[13] = dct_const_round_shift_sse2(u[13]); + u[14] = dct_const_round_shift_sse2(u[14]); + u[15] = dct_const_round_shift_sse2(u[15]); x[0] = _mm_add_epi16(s[0], s[4]); x[1] = _mm_add_epi16(s[1], s[5]); @@ -901,39 +803,22 @@ static void iadst16_8col(__m128i *const in) { u[14] = _mm_sub_epi32(v[10], v[14]); u[15] = _mm_sub_epi32(v[11], v[15]); - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + v[0] = dct_const_round_shift_sse2(u[0]); + v[1] = dct_const_round_shift_sse2(u[1]); + v[2] = dct_const_round_shift_sse2(u[2]); + v[3] = dct_const_round_shift_sse2(u[3]); + v[4] = dct_const_round_shift_sse2(u[4]); + v[5] = dct_const_round_shift_sse2(u[5]); + v[6] = dct_const_round_shift_sse2(u[6]); + v[7] = dct_const_round_shift_sse2(u[7]); + v[8] = dct_const_round_shift_sse2(u[8]); + v[9] = dct_const_round_shift_sse2(u[9]); + v[10] = dct_const_round_shift_sse2(u[10]); + v[11] = dct_const_round_shift_sse2(u[11]); + v[12] = dct_const_round_shift_sse2(u[12]); + v[13] = dct_const_round_shift_sse2(u[13]); + v[14] = dct_const_round_shift_sse2(u[14]); + v[15] = dct_const_round_shift_sse2(u[15]); s[0] = _mm_add_epi16(x[0], x[2]); s[1] = _mm_add_epi16(x[1], x[3]); @@ -989,8 +874,8 @@ void idct16_sse2(__m128i *const in0, __m128i *const in1) { void iadst16_sse2(__m128i 
*const in0, __m128i *const in1) { transpose_16bit_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); + vpx_iadst16_8col_sse2(in0); + vpx_iadst16_8col_sse2(in1); } // Group the coefficient calculation into smaller functions to prevent stack diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index 5cd5098f149..b4bbd186d2b 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_INV_TXFM_SSE2_H_ -#define VPX_DSP_X86_INV_TXFM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -697,13 +697,14 @@ static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( } void idct4_sse2(__m128i *const in); -void idct8_sse2(__m128i *const in); +void vpx_idct8_sse2(__m128i *const in); void idct16_sse2(__m128i *const in0, __m128i *const in1); void iadst4_sse2(__m128i *const in); void iadst8_sse2(__m128i *const in); +void vpx_iadst16_8col_sse2(__m128i *const in); void iadst16_sse2(__m128i *const in0, __m128i *const in1); void idct32_1024_8x32(const __m128i *const in, __m128i *const out); void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); -#endif  // VPX_DSP_X86_INV_TXFM_SSE2_H_ +#endif  // VPX_VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h index e785c8eda13..e9f0f690334 100644 --- a/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h +++ b/libs/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_ -#define VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#ifndef VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ #include <tmmintrin.h> @@ -107,4 +107,4 @@ static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); -#endif  // VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#endif  // VPX_VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c index 6652a62dcfc..be391992af8 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_avx2.c @@ -13,38 +13,38 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); - q4p4 = _mm_loadl_epi64((__m128i
*)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { @@ -52,19 +52,19 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((int8_t)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -76,7 +76,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -84,7 +84,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); @@ -136,21 +136,21 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = 
_mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), @@ -321,44 +321,44 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + _mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -367,10 +367,10 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = { 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128 }; -void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, - const unsigned char 
*_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); @@ -380,32 +380,32 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0; - const __m128i thresh = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); - const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); - const __m128i blimit = - _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); - - p256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p))); - p256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p))); - p256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p))); - p256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p))); - p256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p))); - q256_0 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p))); - q256_1 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p))); - q256_2 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p))); - q256_3 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p))); - q256_4 = - _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p))); + const __m128i thresh_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)thresh[0])); + const __m128i limit_v = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)limit[0])); + const __m128i blimit_v = + _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)blimit[0])); + + p256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 5 * pitch))); + p256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 4 * pitch))); + p256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 3 * pitch))); + p256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 2 * pitch))); + p256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 1 * pitch))); + q256_0 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s - 0 * pitch))); + q256_1 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 1 * pitch))); + q256_2 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 2 * pitch))); + q256_3 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 3 * pitch))); + q256_4 = _mm256_castpd_si256( + _mm256_broadcast_pd((__m128d const *)(s + 4 * pitch))); p4 = _mm256_castsi256_si128(p256_4); p3 = _mm256_castsi256_si128(p256_3); @@ -423,7 +423,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -431,12 +431,12 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = 
_mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(flat, mask); @@ -450,7 +450,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)), _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3))); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -458,8 +458,8 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); @@ -532,9 +532,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat = _mm_and_si128(flat, mask); p256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 6 * pitch))); q256_5 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 5 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 5 * pitch))); p5 = _mm256_castsi256_si128(p256_5); q5 = _mm256_castsi256_si128(q256_5); flat2 = _mm_max_epu8( @@ -543,9 +543,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 7 * pitch))); q256_6 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 6 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 6 * pitch))); p6 = _mm256_castsi256_si128(p256_6); q6 = _mm256_castsi256_si128(q256_6); work = _mm_max_epu8( @@ -555,9 +555,9 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, flat2 = _mm_max_epu8(work, flat2); p256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s - 8 * p))); + _mm256_broadcast_pd((__m128d const *)(s - 8 * pitch))); q256_7 = _mm256_castpd_si256( - _mm256_broadcast_pd((__m128d const *)(s + 7 * p))); + _mm256_broadcast_pd((__m128d const *)(s + 7 * pitch))); p7 = _mm256_castsi256_si128(p256_7); q7 = _mm256_castsi256_si128(q256_7); work = _mm_max_epu8( @@ -843,71 +843,71 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p, p6 = _mm_andnot_si128(flat2, p6); flat2_p6 = _mm_and_si128(flat2, flat2_p6); p6 = _mm_or_si128(flat2_p6, p6); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); p5 = _mm_andnot_si128(flat2, p5); flat2_p5 = _mm_and_si128(flat2, flat2_p5); p5 = _mm_or_si128(flat2_p5, p5); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); p4 = _mm_andnot_si128(flat2, p4); flat2_p4 = _mm_and_si128(flat2, flat2_p4); p4 = _mm_or_si128(flat2_p4, p4); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); p3 = 
_mm_andnot_si128(flat2, p3); flat2_p3 = _mm_and_si128(flat2, flat2_p3); p3 = _mm_or_si128(flat2_p3, p3); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); p2 = _mm_andnot_si128(flat2, p2); flat2_p2 = _mm_and_si128(flat2, flat2_p2); p2 = _mm_or_si128(flat2_p2, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); p1 = _mm_andnot_si128(flat2, p1); flat2_p1 = _mm_and_si128(flat2, flat2_p1); p1 = _mm_or_si128(flat2_p1, p1); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); p0 = _mm_andnot_si128(flat2, p0); flat2_p0 = _mm_and_si128(flat2, flat2_p0); p0 = _mm_or_si128(flat2_p0, p0); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); q0 = _mm_andnot_si128(flat2, q0); flat2_q0 = _mm_and_si128(flat2, flat2_q0); q0 = _mm_or_si128(flat2_q0, q0); - _mm_storeu_si128((__m128i *)(s - 0 * p), q0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), q0); q1 = _mm_andnot_si128(flat2, q1); flat2_q1 = _mm_and_si128(flat2, flat2_q1); q1 = _mm_or_si128(flat2_q1, q1); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); q2 = _mm_andnot_si128(flat2, q2); flat2_q2 = _mm_and_si128(flat2, flat2_q2); q2 = _mm_or_si128(flat2_q2, q2); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); q3 = _mm_andnot_si128(flat2, q3); flat2_q3 = _mm_and_si128(flat2, flat2_q3); q3 = _mm_or_si128(flat2_q3, q3); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); q4 = _mm_andnot_si128(flat2, q4); flat2_q4 = _mm_and_si128(flat2, flat2_q4); q4 = _mm_or_si128(flat2_q4, q4); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); q5 = _mm_andnot_si128(flat2, q5); flat2_q5 = _mm_and_si128(flat2, flat2_q5); q5 = _mm_or_si128(flat2_q5, q5); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); q6 = _mm_andnot_si128(flat2, q6); flat2_q6 = _mm_and_si128(flat2, flat2_q6); q6 = _mm_or_si128(flat2_q6, q6); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } } diff --git a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c index 28e6fd65f92..f90522cd7d0 100644 --- a/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/loopfilter_sse2.c @@ -13,6 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" #include "vpx_ports/emmintrin_compat.h" +#include "vpx_dsp/x86/mem_sse2.h" static INLINE __m128i abs_diff(__m128i a, __m128i b) { return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a)); @@ -30,7 +31,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \ hev = \ _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \ - hev = _mm_cmpgt_epi16(hev, thresh); \ + hev = _mm_cmpgt_epi16(hev, thresh_v); \ hev = _mm_packs_epi16(hev, hev); \ \ /* const int8_t mask = filter_mask(*limit, *blimit, */ \ @@ -51,7 +52,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { flat = _mm_max_epu8(work, flat); \ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \ mask = _mm_unpacklo_epi64(mask, flat); \ - mask = _mm_subs_epu8(mask, limit); \ + mask = _mm_subs_epu8(mask, limit_v); \ mask = _mm_cmpeq_epi8(mask, zero); \ mask = _mm_and_si128(mask, 
_mm_srli_si128(mask, 8)); \ } while (0) @@ -60,7 +61,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { do { \ const __m128i t3t4 = \ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4); \ - const __m128i t80 = _mm_set1_epi8(0x80); \ + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); \ __m128i filter, filter2filter1, work; \ \ ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */ \ @@ -103,27 +104,26 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) { ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */ \ } while (0) -void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { +void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - _mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; - p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s - 4 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s + 0 * p))); - q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); + p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 4 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 0 * pitch))); + q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); p1p0 = _mm_unpacklo_epi64(q0p0, q1p1); p2p1 = _mm_unpacklo_epi64(q1p1, p3p2); q1q0 = _mm_unpackhi_epi64(q0p0, q1p1); @@ -132,41 +132,40 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */, FILTER_HEV_MASK; FILTER4; - _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1 - _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0); // *op0 - _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0); // *oq0 - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0)); // *oq1 + _mm_storeh_pi((__m64 *)(s - 2 * pitch), _mm_castsi128_ps(ps1ps0)); // *op1 + _mm_storel_epi64((__m128i *)(s - 1 * pitch), ps1ps0); // *op0 + _mm_storel_epi64((__m128i *)(s + 0 * pitch), qs1qs0); // *oq0 + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(qs1qs0)); // *oq1 } -void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, - const uint8_t *_blimit, const uint8_t *_limit, - const uint8_t *_thresh) { +void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh) { const __m128i zero = _mm_set1_epi16(0); - const __m128i limit = - _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit), - 
_mm_loadl_epi64((const __m128i *)_limit)); - const __m128i thresh = - _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero); + const __m128i limit_v = + _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit), + _mm_loadl_epi64((const __m128i *)limit)); + const __m128i thresh_v = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)thresh), zero); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i x0, x1, x2, x3; __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0; __m128i mask, hev; // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 - q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 1 * p - 4))); + q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch - 4))); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 - x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 3 * p - 4))); + x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch - 4))); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 - x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 5 * p - 4))); + x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 5 * pitch - 4))); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 - x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)), - _mm_loadl_epi64((__m128i *)(s + 7 * p - 4))); + x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * pitch - 4)), + _mm_loadl_epi64((__m128i *)(s + 7 * pitch - 4))); // Transpose 8x8 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 @@ -212,69 +211,69 @@ void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */, // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0); - *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); ps1ps0 = _mm_srli_si128(ps1ps0, 4); - *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0); + storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0)); - *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); qs1qs0 = _mm_srli_si128(qs1qs0, 4); - *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0); + storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0)); } -void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const 
__m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * pitch)); q4p4 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); - q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * pitch))); + q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * pitch)); q3p3 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); - q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * pitch))); + q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * pitch)); q2p2 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); - q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * pitch))); + q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * pitch)); q1p1 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); - q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * pitch)); q0p0 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * pitch))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = abs_diff(q1p1, q0p0); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); - fe = _mm_set1_epi8(0xfe); + fe = _mm_set1_epi8((int8_t)0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -284,7 +283,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -292,7 +291,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); @@ -342,18 +341,18 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, flat = _mm_cmpeq_epi8(flat, zero); flat = 
_mm_and_si128(flat, mask); - q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * pitch)); q5p5 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * pitch))); - q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * pitch)); q6p6 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * pitch))); flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0)); - q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * pitch)); q7p7 = _mm_castps_si128( - _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); + _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * pitch))); work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0)); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); @@ -520,44 +519,44 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p, q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); - _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); - _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); + _mm_storel_epi64((__m128i *)(s - 7 * pitch), q6p6); + _mm_storeh_pi((__m64 *)(s + 6 * pitch), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); - _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); - _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); + _mm_storel_epi64((__m128i *)(s - 6 * pitch), q5p5); + _mm_storeh_pi((__m64 *)(s + 5 * pitch), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); - _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); - _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); + _mm_storel_epi64((__m128i *)(s - 5 * pitch), q4p4); + _mm_storeh_pi((__m64 *)(s + 4 * pitch), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); - _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); - _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); + _mm_storel_epi64((__m128i *)(s - 4 * pitch), q3p3); + _mm_storeh_pi((__m64 *)(s + 3 * pitch), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); - _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), q2p2); + _mm_storeh_pi((__m64 *)(s + 2 * pitch), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); - _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); - _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), q1p1); + _mm_storeh_pi((__m64 *)(s + 1 * pitch), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); - _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); - _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), q0p0); + 
_mm_storeh_pi((__m64 *)(s - 0 * pitch), _mm_castsi128_ps(q0p0)); } } @@ -591,15 +590,15 @@ static INLINE __m128i filter16_mask(const __m128i *const flat, return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result); } -void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; @@ -609,27 +608,27 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, __m128i max_abs_p1p0q1q0; - p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); - p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); - p5 = _mm_loadu_si128((__m128i *)(s - 6 * p)); - p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); - q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - q5 = _mm_loadu_si128((__m128i *)(s + 5 * p)); - q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); - q7 = _mm_loadu_si128((__m128i *)(s + 7 * p)); + p7 = _mm_loadu_si128((__m128i *)(s - 8 * pitch)); + p6 = _mm_loadu_si128((__m128i *)(s - 7 * pitch)); + p5 = _mm_loadu_si128((__m128i *)(s - 6 * pitch)); + p4 = _mm_loadu_si128((__m128i *)(s - 5 * pitch)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); + q4 = _mm_loadu_si128((__m128i *)(s + 4 * pitch)); + q5 = _mm_loadu_si128((__m128i *)(s + 5 * pitch)); + q6 = _mm_loadu_si128((__m128i *)(s + 6 * pitch)); + q7 = _mm_loadu_si128((__m128i *)(s + 7 * pitch)); { const __m128i abs_p1p0 = abs_diff(p1, p0); const __m128i abs_q1q0 = abs_diff(q1, q0); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(zero, zero); __m128i abs_p0q0 = abs_diff(p0, q0); __m128i abs_p1q1 = abs_diff(p1, q1); @@ -638,7 +637,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) 
/ 2 > blimit) * -1; mask = _mm_max_epu8(max_abs_p1p0q1q0, mask); @@ -648,7 +647,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, mask = _mm_max_epu8(work, mask); work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2)); mask = _mm_max_epu8(work, mask); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); } @@ -678,8 +677,8 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); @@ -694,7 +693,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, oq0 = _mm_xor_si128(q0, t80); oq1 = _mm_xor_si128(q1, t80); - hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh); + hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev); @@ -851,82 +850,82 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p, f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi); p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 7 * p), p6); + _mm_storeu_si128((__m128i *)(s - 7 * pitch), p6); f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi); p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 6 * p), p5); + _mm_storeu_si128((__m128i *)(s - 6 * pitch), p5); f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi); p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 5 * p), p4); + _mm_storeu_si128((__m128i *)(s - 5 * pitch), p4); f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi); p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 4 * p), p3); + _mm_storeu_si128((__m128i *)(s - 4 * pitch), p3); f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi); op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 3 * p), op2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), op2); f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi); op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 2 * p), op1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), op1); f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi); op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 1 * p), op0); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), op0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi); oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s - 0 * p), oq0); + _mm_storeu_si128((__m128i *)(s - 0 * pitch), oq0); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi); 
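/* [Editor's sketch -- not part of the upstream patch.] The repeated
 * filter_add2_sub2() / filter16_mask() pairs above implement the wide
 * (flat2) filter as a sliding sum: each step adds the two samples that
 * enter the averaging window and subtracts the two that leave it, so each
 * output costs four 16-bit adds instead of a full re-summation. Assuming
 * the helper has the obvious shape, a scalar equivalent would be:
 *
 *   static int filter_step(int sum, int a1, int a2, int s1, int s2) {
 *     return sum + a1 + a2 - s1 - s2;  // slide the window by one tap
 *   }
 *
 * filter16_mask() then emits (sum >> 4) for each pixel -- the rounding
 * term `eight` was folded into f_lo/f_hi once, up front -- and keeps the
 * narrower filter's result wherever the flat2 mask is not set. */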
oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 1 * p), oq1); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), oq1); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi); oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 2 * p), oq2); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), oq2); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi); q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 3 * p), q3); + _mm_storeu_si128((__m128i *)(s + 3 * pitch), q3); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi); q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 4 * p), q4); + _mm_storeu_si128((__m128i *)(s + 4 * pitch), q4); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi); q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 5 * p), q5); + _mm_storeu_si128((__m128i *)(s + 5 * pitch), q5); f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo); f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi); q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi); - _mm_storeu_si128((__m128i *)(s + 6 * p), q6); + _mm_storeu_si128((__m128i *)(s + 6 * pitch), q6); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ } } -void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, - const unsigned char *_blimit, - const unsigned char *_limit, - const unsigned char *_thresh) { +void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -934,28 +933,28 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); - const __m128i limit = _mm_load_si128((const __m128i *)_limit); - const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit); + const __m128i limit_v = _mm_load_si128((const __m128i *)limit); + const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; - q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), - _mm_loadl_epi64((__m128i *)(s + 3 * p))); - q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), - _mm_loadl_epi64((__m128i *)(s + 2 * p))); - q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), - _mm_loadl_epi64((__m128i *)(s + 1 * p))); - q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), - _mm_loadl_epi64((__m128i *)(s - 0 * p))); + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 3 * pitch))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * pitch)), + _mm_loadl_epi64((__m128i *)(s + 2 * pitch))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * 
pitch)), + _mm_loadl_epi64((__m128i *)(s + 1 * pitch))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), + _mm_loadl_epi64((__m128i *)(s - 0 * pitch))); p1q1 = _mm_shuffle_epi32(q1p1, 78); p0q0 = _mm_shuffle_epi32(q0p0, 78); { // filter_mask and hev_mask const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(fe, fe); __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; abs_p1p0 = abs_diff(q1p1, q0p0); @@ -964,12 +963,12 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, abs_p0q0 = abs_diff(q0p0, p0q0); abs_p1q1 = abs_diff(q1p1, p1q1); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); - hev = _mm_subs_epu8(flat, thresh); + hev = _mm_subs_epu8(flat, thresh_v); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); - mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); @@ -979,7 +978,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2)); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); - mask = _mm_subs_epu8(mask, limit); + mask = _mm_subs_epu8(mask, limit_v); mask = _mm_cmpeq_epi8(mask, zero); // flat_mask4 @@ -997,14 +996,22 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, unsigned char *src = s; { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1047,16 +1054,16 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i ps1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * pitch)), 
t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1102,7 +1109,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_loadl_epi64((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1120,27 +1127,25 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_loadl_epi64((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storel_epi64((__m128i *)(s - 3 * pitch), p2); + _mm_storel_epi64((__m128i *)(s - 2 * pitch), p1); + _mm_storel_epi64((__m128i *)(s - 1 * pitch), p0); + _mm_storel_epi64((__m128i *)(s + 0 * pitch), q0); + _mm_storel_epi64((__m128i *)(s + 1 * pitch), q1); + _mm_storel_epi64((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, - const uint8_t *_limit0, - const uint8_t *_thresh0, - const uint8_t *_blimit1, - const uint8_t *_limit1, - const uint8_t *_thresh1) { +void vpx_lpf_horizontal_8_dual_sse2( + uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1) { DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); @@ -1149,33 +1154,33 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); const __m128i zero = _mm_set1_epi16(0); const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i 
*)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); const __m128i one = _mm_set1_epi8(1); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -1227,14 +1232,22 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, do { __m128i workp_a, workp_b, workp_shft; - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * pitch)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * pitch)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * pitch)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * pitch)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * pitch)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * pitch)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * pitch)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * pitch)), + zero); workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); @@ -1279,20 +1292,20 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 
* pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1344,7 +1357,7 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + work_a = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); @@ -1362,49 +1375,49 @@ void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0, p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + work_a = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s - 3 * pitch), p2); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); + _mm_storeu_si128((__m128i *)(s + 2 * pitch), q2); } } -void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, - const unsigned char *_blimit0, - const unsigned char *_limit0, - const unsigned char *_thresh0, - const unsigned char *_blimit1, - const unsigned char *_limit1, - const unsigned char *_thresh1) { +void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch, + const unsigned char *blimit0, + const unsigned char *limit0, + const unsigned char *thresh0, + const unsigned char *blimit1, + const unsigned char *limit1, + const unsigned char *thresh1) { const __m128i blimit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), - _mm_load_si128((const __m128i *)_blimit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0), + _mm_load_si128((const __m128i *)blimit1)); const __m128i limit = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), - _mm_load_si128((const __m128i *)_limit1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)limit0), + _mm_load_si128((const __m128i *)limit1)); const __m128i thresh = - _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), - _mm_load_si128((const __m128i *)_thresh1)); + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0), + _mm_load_si128((const __m128i *)thresh1)); const __m128i zero = _mm_set1_epi16(0); __m128i p3, p2, p1, p0, q0, q1, q2, q3; __m128i mask, hev, flat; - p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); - p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); - p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); - q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); - q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); - q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * pitch)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * pitch)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * 
pitch)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * pitch)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * pitch)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * pitch)); // filter_mask and hev_mask { @@ -1412,7 +1425,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1)); - const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i fe = _mm_set1_epi8((int8_t)0xfe); const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0)); @@ -1448,20 +1461,20 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); - const __m128i t80 = _mm_set1_epi8(0x80); - const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t80 = _mm_set1_epi8((int8_t)0x80); + const __m128i te0 = _mm_set1_epi8((int8_t)0xe0); const __m128i t1f = _mm_set1_epi8(0x1f); const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); const __m128i ps1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * pitch)), t80); const __m128i ps0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * pitch)), t80); const __m128i qs0 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * pitch)), t80); const __m128i qs1 = - _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * pitch)), t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; @@ -1506,10 +1519,10 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p, p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - _mm_storeu_si128((__m128i *)(s + 0 * p), q0); - _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s - 2 * pitch), p1); + _mm_storeu_si128((__m128i *)(s - 1 * pitch), p0); + _mm_storeu_si128((__m128i *)(s + 0 * pitch), q0); + _mm_storeu_si128((__m128i *)(s + 1 * pitch), q1); } } @@ -1626,16 +1639,12 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0 * out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1 * out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 + mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2 * out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3 * out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 + mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 x4 = _mm_unpackhi_epi16(x0, x1); @@ -1643,21 +1652,17 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 
24 34 44 54 64 74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4 * out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5 * out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 + mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 6 * out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7 * out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 + mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } -void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1666,7 +1671,7 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, @@ -1674,13 +1679,13 @@ void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, src[0] = t_dst; src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_8_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1692,7 +1697,7 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, src[0] = s - 4; dst[0] = t_dst; - transpose(src, p, dst, 8, 1); + transpose(src, pitch, dst, 8, 1); // Loop filtering vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh); @@ -1701,10 +1706,10 @@ void vpx_lpf_vertical_8_sse2(unsigned char *s, int p, dst[0] = s - 4; // Transpose back - transpose(src, 8, dst, p, 1); + transpose(src, 8, dst, pitch, 1); } -void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, +void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { @@ -1713,7 +1718,7 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, unsigned char *dst[2]; // Transpose 8x16 - transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16); + transpose8x16(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16); // Loop filtering vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0, @@ -1722,13 +1727,13 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0, src[1] = t_dst + 8; dst[0] = s - 4; - dst[1] = s - 4 + p * 8; + dst[1] = s - 4 + pitch * 8; // Transpose back - transpose(src, 16, dst, p, 2); + transpose(src, 16, dst, pitch, 2); } -void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_sse2(unsigned char *s, int pitch, const unsigned char *blimit, const unsigned char *limit, const unsigned char *thresh) { @@ -1742,7 +1747,7 @@ 
void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = t_dst + 8 * 8; // Transpose 16x8 - transpose(src, p, dst, 8, 2); + transpose(src, pitch, dst, 8, 2); // Loop filtering vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh); @@ -1753,22 +1758,22 @@ void vpx_lpf_vertical_16_sse2(unsigned char *s, int p, dst[1] = s; // Transpose back - transpose(src, 8, dst, p, 2); + transpose(src, 8, dst, pitch, 2); } -void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p, +void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { DECLARE_ALIGNED(16, unsigned char, t_dst[256]); // Transpose 16x16 - transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16); - transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16); + transpose8x16(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16); + transpose8x16(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16); // Loop filtering vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh); // Transpose back - transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p); - transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p); + transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, pitch); + transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch); } diff --git a/libs/libvpx/vpx_dsp/x86/mem_sse2.h b/libs/libvpx/vpx_dsp/x86/mem_sse2.h index 2ce738fb770..258ab38e606 100644 --- a/libs/libvpx/vpx_dsp/x86/mem_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/mem_sse2.h @@ -8,13 +8,43 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_MEM_SSE2_H_ -#define VPX_DSP_X86_MEM_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_VPX_DSP_X86_MEM_SSE2_H_ #include <emmintrin.h> // SSE2 +#include <string.h> #include "./vpx_config.h" +static INLINE void storeu_uint32(void *dst, uint32_t v) { + memcpy(dst, &v, sizeof(v)); +} + +static INLINE uint32_t loadu_uint32(const void *src) { + uint32_t v; + memcpy(&v, src, sizeof(v)); + return v; +} + +static INLINE __m128i load_unaligned_u32(const void *a) { + uint32_t val; + memcpy(&val, a, sizeof(val)); + return _mm_cvtsi32_si128(val); +} + +static INLINE void store_unaligned_u32(void *const a, const __m128i v) { + const uint32_t val = _mm_cvtsi128_si32(v); + memcpy(a, &val, sizeof(val)); +} + +#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) +#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) + +static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); @@ -121,4 +151,4 @@ static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); } -#endif // VPX_DSP_X86_MEM_SSE2_H_ +#endif // VPX_VPX_DSP_X86_MEM_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/post_proc_sse2.c b/libs/libvpx/vpx_dsp/x86/post_proc_sse2.c new file mode 100644 index 00000000000..d1029afc4fe --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/post_proc_sse2.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <stdio.h> + +#include <emmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +extern const int16_t vpx_rv[]; + +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, + int cols, int flimit) { + int col; + const __m128i zero = _mm_setzero_si128(); + const __m128i f = _mm_set1_epi32(flimit); + DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + for (col = 0; col < cols; col += 8) { + int row, i; + __m128i s = _mm_loadl_epi64((__m128i *)dst); + __m128i sum, sumsq_0, sumsq_1; + __m128i tmp_0, tmp_1; + __m128i below_context; + + s = _mm_unpacklo_epi8(s, zero); + + for (i = 0; i < 8; ++i) { + _mm_store_si128((__m128i *)above_context + i, s); + } + + // sum *= 9 + sum = _mm_slli_epi16(s, 3); + sum = _mm_add_epi16(s, sum); + + // sum^2 * 9 == (sum * 9) * sum + tmp_0 = _mm_mullo_epi16(sum, s); + tmp_1 = _mm_mulhi_epi16(sum, s); + + sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1); + sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1); + + // Prime sum/sumsq + for (i = 1; i <= 6; ++i) { + __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch)); + a = _mm_unpacklo_epi8(a, zero); + sum = _mm_add_epi16(sum, a); + a = _mm_mullo_epi16(a, a); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero)); + } + + for (row = 0; row < rows + 8; row++) { + const __m128i above = + _mm_load_si128((__m128i *)above_context + (row & 7)); + __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch)); + __m128i above_sq, below_sq; + __m128i mask_0, mask_1; + __m128i multmp_0, multmp_1; + __m128i rv; + __m128i out; + + this_row = _mm_unpacklo_epi8(this_row, zero); + + if (row + 7 < rows) { + // Instead of copying the end context we just stop loading when we get + // to the last one. + below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch)); + below_context = _mm_unpacklo_epi8(below_context, zero); + } + + sum = _mm_sub_epi16(sum, above); + sum = _mm_add_epi16(sum, below_context); + + // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero + // extend. Unfortunately we can't do below_sq - above_sq in 16 bits + // because x86 does not have unpack with sign extension.
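/*
 * For orientation, a simplified scalar model of the per-column recursion the
 * SIMD code above implements (an illustrative sketch, not part of the patch:
 * the helper name is ours, border extension and the extra rows + 8 iterations
 * of the real loop are reduced to simple clamping):
 */
#include <stdint.h>

static void mbpost_proc_down_col_sketch(unsigned char *col, int pitch,
                                        int rows, int flimit,
                                        const int16_t *rv) {
  unsigned char ctx8[8]; /* originals of the last 8 rows, like above_context */
  int sum, sumsq, r;
  for (r = 0; r < 8; r++) ctx8[r] = col[0];
  /* Prime a 15-sample window: row 0 counted 9 times plus rows 1..6,
     mirroring the shift-and-add "sum *= 9" priming above. */
  sum = 9 * col[0];
  sumsq = 9 * col[0] * col[0];
  for (r = 1; r <= 6; r++) {
    sum += col[r * pitch];
    sumsq += col[r * pitch] * col[r * pitch];
  }
  for (r = 0; r < rows; r++) {
    const int above = ctx8[r & 7]; /* original pixel from row r - 8 */
    const int below = col[(r + 7 < rows ? r + 7 : rows - 1) * pitch];
    const int x = col[r * pitch];
    sum += below - above; /* slide the window down one row */
    sumsq += below * below - above * above;
    ctx8[r & 7] = (unsigned char)x; /* save the original before writing */
    /* 15 * sumsq - sum * sum == 15^2 * window variance; only smooth
       sufficiently flat pixels (the subtract-and-sign-shift mask below). */
    if (15 * sumsq - sum * sum < flimit)
      col[r * pitch] = (unsigned char)((rv[r & 127] + sum + x) >> 4);
  }
}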
+ above_sq = _mm_mullo_epi16(above, above); + sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero)); + sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero)); + + below_sq = _mm_mullo_epi16(below_context, below_context); + sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero)); + sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero)); + + // sumsq * 16 - sumsq == sumsq * 15 + mask_0 = _mm_slli_epi32(sumsq_0, 4); + mask_0 = _mm_sub_epi32(mask_0, sumsq_0); + mask_1 = _mm_slli_epi32(sumsq_1, 4); + mask_1 = _mm_sub_epi32(mask_1, sumsq_1); + + multmp_0 = _mm_mullo_epi16(sum, sum); + multmp_1 = _mm_mulhi_epi16(sum, sum); + + mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1)); + mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1)); + + // mask - f gives a negative value when mask < f + mask_0 = _mm_sub_epi32(mask_0, f); + mask_1 = _mm_sub_epi32(mask_1, f); + + // Shift the sign bit down to create a mask + mask_0 = _mm_srai_epi32(mask_0, 31); + mask_1 = _mm_srai_epi32(mask_1, 31); + + mask_0 = _mm_packs_epi32(mask_0, mask_1); + + rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127))); + + mask_1 = _mm_add_epi16(rv, sum); + mask_1 = _mm_add_epi16(mask_1, this_row); + mask_1 = _mm_srai_epi16(mask_1, 4); + + mask_1 = _mm_and_si128(mask_0, mask_1); + mask_0 = _mm_andnot_si128(mask_0, this_row); + out = _mm_or_si128(mask_1, mask_0); + + _mm_storel_epi64((__m128i *)(dst + row * pitch), + _mm_packus_epi16(out, zero)); + + _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row); + } + + dst += 8; + } +} diff --git a/libs/libvpx/vpx_dsp/x86/quantize_avx.c b/libs/libvpx/vpx_dsp/x86/quantize_avx.c index 6f4489004dc..0a91d36eaf9 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_avx.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_avx.c @@ -17,15 +17,16 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/quantize_x86.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; @@ -37,7 +38,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -90,15 +91,12 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
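/*
 * The calculate_dqcoeff_and_store() helper used above (its definition is in
 * quantize_sse2.h, further down in this patch) exists because SSE2 has no
 * direct 16-to-32-bit multiply. A standalone illustrative sketch of the
 * pattern (helper name is ours):
 */
#include <emmintrin.h>
#include <stdint.h>

static void widen_mul_store_sketch(__m128i q, __m128i dq, int32_t *out) {
  const __m128i lo = _mm_mullo_epi16(q, dq); /* low 16 bits of each product */
  const __m128i hi = _mm_mulhi_epi16(q, dq); /* signed high 16 bits */
  /* Interleaving the low and high halves yields eight 32-bit products. */
  _mm_storeu_si128((__m128i *)(out + 0), _mm_unpacklo_epi16(lo, hi));
  _mm_storeu_si128((__m128i *)(out + 4), _mm_unpackhi_epi16(lo, hi));
}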
@@ -135,26 +133,25 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); @@ -167,7 +164,7 @@ void vpx_quantize_b_32x32_avx( __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)n_coeffs; (void)skip_block; assert(!skip_block); @@ -233,28 +230,12 @@ void vpx_quantize_b_32x32_avx( store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
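/*
 * Per coefficient, the calculate_dqcoeff_and_store_32x32() call used in the
 * 32x32 DC block above computes the value sketched below (illustrative
 * scalar form, assuming the positive dequant values VP9 uses; working on the
 * absolute value makes the shift round toward zero for either sign, which is
 * what the "un-sign to bias rounding like C" comment refers to):
 */
#include <stdlib.h>

static int dqcoeff_32x32_sketch(int qcoeff, int dequant) {
  const int magnitude = (abs(qcoeff) * dequant) >> 1; /* "divide" by 2 */
  return qcoeff < 0 ? -magnitude : magnitude;         /* restore the sign */
}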
@@ -291,23 +272,13 @@ void vpx_quantize_b_32x32_avx( store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c index c020b398c3b..e38a4059ab7 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -15,15 +15,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/quantize_x86.h" +#include "vpx_dsp/x86/quantize_sse2.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index = 16; @@ -33,7 +33,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -74,15 +74,11 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { @@ -109,14 +105,11 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); index += 16; diff --git a/libs/libvpx/vpx_dsp/x86/quantize_x86.h b/libs/libvpx/vpx_dsp/x86/quantize_sse2.h similarity index 70% rename from libs/libvpx/vpx_dsp/x86/quantize_x86.h rename to libs/libvpx/vpx_dsp/x86/quantize_sse2.h index 34928fbb56d..afe2f924b34 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_x86.h +++ b/libs/libvpx/vpx_dsp/x86/quantize_sse2.h @@ -8,11 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ + #include <emmintrin.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, const int16_t *round_ptr, __m128i *round, @@ -42,21 +44,35 @@ static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, *coeff = _mm_mulhi_epi16(qcoeff, shift); } -static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { - return _mm_mullo_epi16(qcoeff, dequant); +static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant, + tran_low_t *dqcoeff) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i low = _mm_mullo_epi16(qcoeff, dequant); + const __m128i high = _mm_mulhi_epi16(qcoeff, dequant); + + const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + const __m128i dqcoeff16 = _mm_mullo_epi16(qcoeff, dequant); + + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff16); +#endif // CONFIG_VP9_HIGHBITDEPTH } -// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing -// to zbin to add 1 to the index in 'scan'. +// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to +// zbin to add 1 to the index in 'scan'.
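/*
 * In scalar terms, the eob bookkeeping implemented by scan_for_eob() below
 * and finished by accumulate_eob() works out to the following (illustrative
 * sketch; the helper name is ours):
 */
static int16_t eob_sketch(const int16_t *qcoeff, const int16_t *iscan, int n) {
  int16_t eob = 0;
  int i;
  for (i = 0; i < n; i++) {
    if (qcoeff[i]) { /* zeroed lanes drop out; the zbin mask adds the +1 */
      const int16_t candidate = (int16_t)(iscan[i] + 1);
      if (candidate > eob) eob = candidate;
    }
  }
  return eob;
}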
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, const __m128i zbin_mask0, const __m128i zbin_mask1, - const int16_t *scan_ptr, const int index, + const int16_t *scan, const int index, const __m128i zero) { const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); - __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); - __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8)); __m128i eob0, eob1; // Add one to convert from indices to counts scan0 = _mm_sub_epi16(scan0, zbin_mask0); @@ -76,3 +92,5 @@ static INLINE int16_t accumulate_eob(__m128i eob) { eob = _mm_max_epi16(eob, eob_shuffled); return _mm_extract_epi16(eob, 1); } + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c index 3f528e1a978..fc1d91959f1 100644 --- a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c +++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -14,7 +14,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" -#include "vpx_dsp/x86/quantize_x86.h" +#include "vpx_dsp/x86/quantize_sse2.h" +#include "vpx_dsp/x86/quantize_ssse3.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -22,7 +23,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); int index = 16; @@ -32,7 +33,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -67,15 +68,11 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = - scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); // AC only loop. 
while (index < n_coeffs) { @@ -100,14 +97,11 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = calculate_dqcoeff(qcoeff0, dequant); - coeff1 = calculate_dqcoeff(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index); + calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); index += 16; @@ -116,12 +110,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, - const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); int index; @@ -133,7 +129,7 @@ void vpx_quantize_b_32x32_ssse3( __m128i all_zero; __m128i eob = zero, eob0; - (void)scan_ptr; + (void)scan; (void)n_coeffs; (void)skip_block; assert(!skip_block); @@ -206,28 +202,12 @@ void vpx_quantize_b_32x32_ssse3( store_tran_low(qcoeff0, qcoeff_ptr); store_tran_low(qcoeff1, qcoeff_ptr + 8); - // Un-sign to bias rounding like C. - // dequant is almost always negative, so this is probably the backwards way - // to handle the sign. However, it matches the previous assembly. - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr); dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - // "Divide" by 2. - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8); - store_tran_low(coeff0, dqcoeff_ptr); - store_tran_low(coeff1, dqcoeff_ptr + 8); - - eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, - zero); + eob = + scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero); } // AC only loop. 
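/*
 * The 32x32 helper used above relies on _mm_sign_epi32. Interleaving the
 * 16-bit qcoeff into the high half of each 32-bit lane (the
 * _mm_unpacklo_epi16(zero, qcoeff) step in quantize_ssse3.h, added just
 * below) gives a lane that is negative exactly when the coefficient is, and
 * zero exactly when it is. In scalar terms (illustrative sketch):
 */
static int sign_select_sketch(int magnitude, int sign_source) {
  if (sign_source == 0) return 0; /* _mm_sign_* zeroes the lane */
  return sign_source < 0 ? -magnitude : magnitude;
}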
@@ -268,23 +248,13 @@ void vpx_quantize_b_32x32_ssse3( store_tran_low(qcoeff0, qcoeff_ptr + index); store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - coeff0 = _mm_abs_epi16(qcoeff0); - coeff1 = _mm_abs_epi16(qcoeff1); - - coeff0 = calculate_dqcoeff(coeff0, dequant); - coeff1 = calculate_dqcoeff(coeff1, dequant); - - coeff0 = _mm_srli_epi16(coeff0, 1); - coeff1 = _mm_srli_epi16(coeff1, 1); - - coeff0 = _mm_sign_epi16(coeff0, qcoeff0); - coeff1 = _mm_sign_epi16(coeff1, qcoeff1); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); + calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, + dqcoeff_ptr + index); + calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, + dqcoeff_ptr + 8 + index); - eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, - index, zero); + eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index, + zero); eob = _mm_max_epi16(eob, eob0); } diff --git a/libs/libvpx/vpx_dsp/x86/quantize_ssse3.h b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.h new file mode 100644 index 00000000000..e8d2a057710 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/quantize_ssse3.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ +#define VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +static INLINE void calculate_dqcoeff_and_store_32x32(const __m128i qcoeff, + const __m128i dequant, + const __m128i zero, + tran_low_t *dqcoeff) { + // Un-sign to bias rounding like C. + const __m128i coeff = _mm_abs_epi16(qcoeff); + + const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff); + const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff); + + const __m128i low = _mm_mullo_epi16(coeff, dequant); + const __m128i high = _mm_mulhi_epi16(coeff, dequant); + __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high); + __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high); + + // "Divide" by 2.
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 1); + dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 1); + + dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0); + dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1); + +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0); + _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1); +#else + _mm_store_si128((__m128i *)(dqcoeff), + _mm_packs_epi32(dqcoeff32_0, dqcoeff32_1)); +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +#endif // VPX_VPX_DSP_X86_QUANTIZE_SSSE3_H_ diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c index 962b8fb11a4..b18fecf709c 100644 --- a/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx2.c @@ -11,154 +11,120 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } +static INLINE void calc_final(const __m256i *const sums /*[4]*/, + uint32_t sad_array[4]) { + const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); + const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); + const __m256i t2 = _mm256_hadd_epi32(t0, t1); + const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2), + _mm256_extractf128_si256(t2, 1)); + _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, - uint32_t res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { int i; - const uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((const __m256i *)src); - srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((const __m256i *)ref0); - ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32)); - ref1_reg = _mm256_loadu_si256((const __m256i *)ref1); - ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32)); - ref2_reg = _mm256_loadu_si256((const __m256i *)ref2); - ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32)); - ref3_reg = _mm256_loadu_si256((const __m256i *)ref3); - ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); - - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, 
ref3next_reg); - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } - { - __m128i sum; - - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. - // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); + const uint8_t *refs[4]; + __m256i sums[4]; + + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); + for (i = 0; i < 32; i++) { + __m256i r[4]; + + // load src and all ref[] + const __m256i s = _mm256_load_si256((const __m256i *)src_ptr); + r[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + + // sum of the absolute differences between every ref[] to src + r[0] = _mm256_sad_epu8(r[0], s); + r[1] = _mm256_sad_epu8(r[1], s); + r[2] = _mm256_sad_epu8(r[2], s); + r[3] = _mm256_sad_epu8(r[3], s); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r[0]); + sums[1] = _mm256_add_epi32(sums[1], r[1]); + sums[2] = _mm256_add_epi32(sums[2], r[2]); + sums[3] = _mm256_add_epi32(sums[3], r[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; + } - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); + calc_final(sums, sad_array); +} - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { + __m256i sums[4]; + int i; + const uint8_t *refs[4]; - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); + refs[0] = ref_array[0]; + refs[1] = ref_array[1]; + refs[2] = ref_array[2]; + refs[3] = ref_array[3]; + sums[0] = _mm256_setzero_si256(); + sums[1] = _mm256_setzero_si256(); + sums[2] = _mm256_setzero_si256(); + sums[3] = _mm256_setzero_si256(); - _mm_storeu_si128((__m128i *)(res), sum); + for (i = 0; i < 64; i++) { + __m256i r_lo[4], r_hi[4]; + // load 64 bytes from src and all ref[] + const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); + const __m256i s_hi = _mm256_load_si256((const __m256i *)(src_ptr + 32)); + r_lo[0] = _mm256_loadu_si256((const __m256i *)refs[0]); + r_hi[0] = _mm256_loadu_si256((const __m256i *)(refs[0] + 32)); + r_lo[1] = _mm256_loadu_si256((const __m256i *)refs[1]); + r_hi[1] = _mm256_loadu_si256((const __m256i *)(refs[1] + 32)); + r_lo[2] = _mm256_loadu_si256((const __m256i *)refs[2]); + r_hi[2] = _mm256_loadu_si256((const __m256i *)(refs[2] + 32)); + r_lo[3] = _mm256_loadu_si256((const __m256i *)refs[3]); + r_hi[3] = _mm256_loadu_si256((const __m256i *)(refs[3] + 32)); + + // sum of the absolute differences between every ref[] to src + r_lo[0] = _mm256_sad_epu8(r_lo[0], s_lo); + r_lo[1] = _mm256_sad_epu8(r_lo[1], s_lo); + 
r_lo[2] = _mm256_sad_epu8(r_lo[2], s_lo); + r_lo[3] = _mm256_sad_epu8(r_lo[3], s_lo); + r_hi[0] = _mm256_sad_epu8(r_hi[0], s_hi); + r_hi[1] = _mm256_sad_epu8(r_hi[1], s_hi); + r_hi[2] = _mm256_sad_epu8(r_hi[2], s_hi); + r_hi[3] = _mm256_sad_epu8(r_hi[3], s_hi); + + // sum every ref[] + sums[0] = _mm256_add_epi32(sums[0], r_lo[0]); + sums[1] = _mm256_add_epi32(sums[1], r_lo[1]); + sums[2] = _mm256_add_epi32(sums[2], r_lo[2]); + sums[3] = _mm256_add_epi32(sums[3], r_lo[3]); + sums[0] = _mm256_add_epi32(sums[0], r_hi[0]); + sums[1] = _mm256_add_epi32(sums[1], r_hi[1]); + sums[2] = _mm256_add_epi32(sums[2], r_hi[2]); + sums[3] = _mm256_add_epi32(sums[3], r_hi[3]); + + src_ptr += src_stride; + refs[0] += ref_stride; + refs[1] += ref_stride; + refs[2] += ref_stride; + refs[3] += ref_stride; } + + calc_final(sums, sad_array); } diff --git a/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c index 5f2ab6ea71b..4c5d70464de 100644 --- a/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c +++ b/libs/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -11,8 +11,8 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, - const uint8_t *const ref[4], int ref_stride, +void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], int ref_stride, uint32_t res[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; @@ -20,33 +20,33 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, int i; const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; + ref0 = ref_array[0]; + ref1 = ref_array[1]; + ref2 = ref_array[2]; + ref3 = ref_array[3]; sum_ref0 = _mm512_set1_epi16(0); sum_ref1 = _mm512_set1_epi16(0); sum_ref2 = _mm512_set1_epi16(0); sum_ref3 = _mm512_set1_epi16(0); for (i = 0; i < 64; i++) { - // load src and all refs - src_reg = _mm512_loadu_si512((const __m512i *)src); + // load src and all ref[] + src_reg = _mm512_loadu_si512((const __m512i *)src_ptr); ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); - // sum of the absolute differences between every ref-i to src + // sum of the absolute differences between every ref[] to src ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); - // sum every ref-i + // sum every ref[] sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); - src += src_stride; + src_ptr += src_stride; ref0 += ref_stride; ref1 += ref_stride; ref2 += ref_stride; @@ -55,7 +55,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, { __m256i sum256; __m128i sum128; - // in sum_ref-i the result is saved in the first 4 bytes + // in sum_ref[] the result is saved in the first 4 bytes // the other 4 bytes are zeroed. 
// sum_ref1 and sum_ref3 are shifted left by 4 bytes sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); @@ -65,7 +65,7 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); - // merge every 64 bit from each sum_ref-i + // merge every 64 bit from each sum_ref[] sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); diff --git a/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm b/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm index cee4468c1f0..5adb9b8c3da 100644 --- a/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/subpel_variance_sse2.asm @@ -41,12 +41,12 @@ SECTION .text ; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, ; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *ref, ptrdiff_t ref_stride, ; int height, unsigned int *sse); ; ; This function returns the SE and stores SSE in the given pointer. -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse +%macro SUM_SSE 6 ; src1, ref1, src2, ref2, sum, sse psubw %3, %4 psubw %1, %2 paddw %5, %3 @@ -114,84 +114,65 @@ SECTION .text ; 11, not 13, if the registers are ordered correctly. May make a minor speed ; difference on Win64 -%ifdef PIC ; 64bit PIC +%if ARCH_X86_64 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse + %define second_str second_strideq %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %endif %define block_height heightd %define bilin_filter sseq %else - %if ARCH_X86=1 && CONFIG_PIC=1 + %if CONFIG_PIC=1 %if %2 == 1 ; avg cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse, g_bilin_filter, g_pw_8 + x_offset, y_offset, ref, ref_stride, \ + second_pred, second_stride, height, sse %define block_height dword heightm - %define sec_str sec_stridemp - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back + %define second_str second_stridemp %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse, \ - g_bilin_filter, g_pw_8 + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %define block_height heightd + %endif - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx - LOAD_IF_USED 0, 1 ; 
load eax, ecx back - %endif + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back %else %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ - 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, \ - sec, sec_stride, \ - height, sse - %if ARCH_X86_64 - %define block_height heightd - %define sec_str sec_strideq - %else + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + ref, ref_stride, second_pred, second_stride, \ + height, sse %define block_height dword heightm - %define sec_str sec_stridemp - %endif + %define second_str second_stridemp %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \ - y_offset, dst, dst_stride, height, sse + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, ref, ref_stride, \ + height, sse %define block_height heightd %endif - %define bilin_filter bilin_filter_m %endif %endif @@ -211,7 +192,7 @@ SECTION .text %if %1 < 16 sar block_height, 1 %if %2 == 1 ; avg - shl sec_str, 1 + shl second_str, 1 %endif %endif @@ -226,9 +207,9 @@ SECTION .text .x_zero_y_zero_loop: %if %1 == 16 movu m0, [srcq] - mova m1, [dstq] + mova m1, [refq] %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m3, m1, m5 punpcklbw m1, m5 %endif @@ -242,7 +223,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] %if %2 == 1 ; avg @@ -256,14 +237,14 @@ SECTION .text movx m2, [srcq+src_strideq] %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] %if %2 == 1 ; avg %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -284,10 +265,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_zero_loop @@ -302,11 +283,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -314,7 +295,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] @@ -325,22 +306,22 @@ SECTION .text movx m1, [srcq+src_strideq*2] punpckldq m2, m1 %endif - movx m1, [dstq] + movx m1, [refq] %if %1 > 4 movlhps m0, m2 %else ; 4xh punpckldq m0, m2 %endif - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m0, m2 punpcklbw m1, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m3, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m4, [secq] + movh m4, [second_predq] pavgb m0, m4 punpcklbw m3, m5 punpcklbw m0, m5 @@ -348,9 +329,9 @@ SECTION .text %endif %else ; !avg movx m4, [srcq+src_strideq*2] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -360,10 +341,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 
; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_half_loop @@ -371,8 +352,8 @@ SECTION .text .x_zero_y_nonhalf: ; x_offset == 0 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -380,7 +361,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -397,7 +378,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -405,7 +386,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+src_strideq] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -437,7 +418,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -446,14 +427,14 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -473,7 +454,7 @@ SECTION .text pmullw m4, filter_y_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -485,11 +466,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -499,10 +480,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_zero_y_other_loop @@ -523,11 +504,11 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m0, m4 punpckhbw m3, m1, m5 %if %2 == 1 ; avg - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpcklbw m1, m5 punpckhbw m2, m0, m5 @@ -535,7 +516,7 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m4, [srcq+1] @@ -549,17 +530,17 @@ SECTION .text movx m2, [srcq+src_strideq+1] punpckldq m4, m2 %endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] pavgb m0, m4 punpcklbw m3, m5 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpcklbw m1, m5 punpckhbw m2, m0, m5 punpcklbw m0, m5 %else ; 4xh - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m1, m5 punpcklbw m0, m5 @@ -567,10 +548,10 @@ SECTION .text %endif %else ; !avg movx m2, [srcq+src_strideq] - movx m1, [dstq] + movx m1, [refq] pavgb m0, m4 movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] pavgb m2, m4 punpcklbw m0, m5 punpcklbw m2, m5 @@ -580,10 +561,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, 
[srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_zero_loop @@ -602,13 +583,13 @@ SECTION .text .x_half_y_half_loop: movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m3 punpckhbw m3, m1, m5 pavgb m0, m4 %if %2 == 1 ; avg punpcklbw m1, m5 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else @@ -620,7 +601,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -647,13 +628,13 @@ SECTION .text punpckldq m0, m2 pshuflw m4, m2, 0xe %endif - movx m1, [dstq] + movx m1, [refq] pavgb m0, m2 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 %endif punpcklbw m3, m5 @@ -672,8 +653,8 @@ SECTION .text pavgb m4, m1 pavgb m0, m2 pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] punpcklbw m0, m5 punpcklbw m2, m5 punpcklbw m3, m5 @@ -683,10 +664,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_half_loop @@ -694,8 +675,8 @@ SECTION .text .x_half_y_nonhalf: ; x_offset == 0.5 && y_offset == bilin interpolation -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl y_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -703,7 +684,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+y_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_y_a m8 %define filter_y_b m9 %define filter_rnd m10 @@ -720,7 +701,7 @@ SECTION .text add y_offsetq, bilin_filter %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -732,7 +713,7 @@ SECTION .text .x_half_y_other_loop: movu m4, [srcq] movu m2, [srcq+1] - mova m1, [dstq] + mova m1, [refq] pavgb m4, m2 %if cpuflag(ssse3) punpckhbw m2, m0, m4 @@ -762,7 +743,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -771,7 +752,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m3, [srcq+1] @@ -787,9 +768,9 @@ SECTION .text movx m3, [srcq+src_strideq+1] pavgb m2, m1 pavgb m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) - movx m1, [dstq] + movx m1, [refq] punpcklbw m0, m2 punpcklbw m2, m4 pmaddubsw m0, filter_y_a @@ -809,7 +790,7 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] %endif psraw m0, 4 psraw m2, 4 @@ -820,11 +801,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -835,10 +816,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, 
[dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_half_y_other_loop @@ -852,8 +833,8 @@ SECTION .text jnz .x_nonhalf_y_nonzero ; x_offset == bilin interpolation && y_offset == 0 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -861,7 +842,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -878,7 +859,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -886,7 +867,7 @@ SECTION .text %if %1 == 16 movu m0, [srcq] movu m4, [srcq+1] - mova m1, [dstq] + mova m1, [refq] %if cpuflag(ssse3) punpckhbw m2, m0, m4 punpcklbw m0, m4 @@ -913,7 +894,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -922,16 +903,16 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] movx m2, [srcq+src_strideq] movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %if cpuflag(ssse3) punpcklbw m0, m1 - movx m1, [dstq] + movx m1, [refq] punpcklbw m2, m4 pmaddubsw m0, filter_x_a pmaddubsw m2, filter_x_a @@ -951,7 +932,7 @@ SECTION .text pmullw m4, filter_x_b paddw m0, m1 paddw m2, filter_rnd - movx m1, [dstq] + movx m1, [refq] paddw m2, m4 %endif psraw m0, 4 @@ -963,11 +944,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -977,10 +958,10 @@ SECTION .text SUM_SSE m0, m1, m2, m3, m6, m7 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_zero_loop @@ -994,8 +975,8 @@ SECTION .text jne .x_nonhalf_y_nonhalf ; x_offset == bilin interpolation && y_offset == 0.5 -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift %if ARCH_X86_64 && %1 > 4 @@ -1003,7 +984,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m9, [bilin_filter+x_offsetq+16] %endif - mova m10, [pw_8] + mova m10, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_rnd m10 @@ -1020,7 +1001,7 @@ SECTION .text add x_offsetq, bilin_filter %define filter_x_a [x_offsetq] %define filter_x_b [x_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1056,7 +1037,7 @@ SECTION .text movu m4, [srcq] movu m3, [srcq+1] %if cpuflag(ssse3) - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1082,7 +1063,7 @@ SECTION .text paddw m2, filter_rnd paddw m4, m3 paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] psraw m4, 4 psraw m2, 4 punpckhbw m3, m1, m5 @@ -1096,7 
+1077,7 @@ SECTION .text %endif %if %2 == 1 ; avg ; FIXME(rbultje) pipeline - pavgb m0, [secq] + pavgb m0, [second_predq] %endif punpckhbw m2, m0, m5 punpcklbw m0, m5 @@ -1104,7 +1085,7 @@ SECTION .text mova m0, m4 add srcq, src_strideq - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1132,8 +1113,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] + movx m1, [refq] + movx m3, [refq+ref_strideq] paddw m2, filter_rnd paddw m4, filter_rnd %else @@ -1148,9 +1129,9 @@ SECTION .text pmullw m3, filter_x_b paddw m4, filter_rnd paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] paddw m4, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] %endif psraw m2, 4 psraw m4, 4 @@ -1163,11 +1144,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1179,10 +1160,10 @@ SECTION .text mova m0, m4 lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_half_loop @@ -1192,8 +1173,8 @@ SECTION .text STORE_AND_RET %1 .x_nonhalf_y_nonhalf: -%ifdef PIC - lea bilin_filter, [bilin_filter_m] +%if ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] %endif shl x_offsetd, filter_idx_shift shl y_offsetd, filter_idx_shift @@ -1206,7 +1187,7 @@ SECTION .text %if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 mova m11, [bilin_filter+y_offsetq+16] %endif - mova m12, [pw_8] + mova m12, [GLOBAL(pw_8)] %define filter_x_a m8 %define filter_x_b m9 %define filter_y_a m10 @@ -1234,7 +1215,7 @@ SECTION .text %define filter_x_b [x_offsetq+16] %define filter_y_a [y_offsetq] %define filter_y_b [y_offsetq+16] -%define filter_rnd [pw_8] +%define filter_rnd [GLOBAL(pw_8)] %endif %endif @@ -1273,7 +1254,7 @@ SECTION .text %if cpuflag(ssse3) movu m4, [srcq] movu m3, [srcq+1] - mova m1, [dstq] + mova m1, [refq] punpckhbw m2, m4, m3 punpcklbw m4, m3 pmaddubsw m2, filter_x_a @@ -1319,7 +1300,7 @@ SECTION .text pmullw m0, filter_y_a pmullw m3, filter_y_b paddw m2, m1 - mova m1, [dstq] + mova m1, [refq] paddw m0, filter_rnd psraw m2, 4 paddw m0, m3 @@ -1330,7 +1311,7 @@ SECTION .text %if %2 == 1 ; avg ; FIXME(rbultje) pipeline packuswb m0, m2 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %endif @@ -1338,7 +1319,7 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq + add refq, ref_strideq %else ; %1 < 16 movx m0, [srcq] movx m1, [srcq+1] @@ -1374,8 +1355,8 @@ SECTION .text punpcklbw m4, m3 pmaddubsw m2, filter_x_a pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] + movx m3, [refq+ref_strideq] + movx m1, [refq] paddw m2, filter_rnd paddw m4, filter_rnd psraw m2, 4 @@ -1414,9 +1395,9 @@ SECTION .text pmullw m1, m4, filter_y_b paddw m2, filter_rnd paddw m0, m3 - movx m3, [dstq+dst_strideq] + movx m3, [refq+ref_strideq] paddw m2, m1 - movx m1, [dstq] + movx m1, [refq] psraw m0, 4 psraw m2, 4 punpcklbw m3, m5 @@ -1429,11 +1410,11 @@ SECTION .text %endif packuswb m0, m2 %if %1 > 4 - pavgb m0, [secq] + pavgb m0, [second_predq] punpckhbw m2, m0, m5 punpcklbw m0, m5 %else - movh m2, [secq] + movh m2, [second_predq] pavgb m0, m2 punpcklbw m0, m5 movhlps m2, m0 @@ -1443,10 
+1424,10 @@ SECTION .text mova m0, m4 INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] + lea refq, [refq+ref_strideq*2] %endif %if %2 == 1 ; avg - add secq, sec_str + add second_predq, second_str %endif dec block_height jg .x_other_y_other_loop diff --git a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c index 026d0ca2f27..9eaf6ee1b8f 100644 --- a/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/sum_squares_sse2.c @@ -10,120 +10,96 @@ #include <assert.h> #include <emmintrin.h> -#include <stdio.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/mem_sse2.h" -static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = - _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - - return (uint64_t)_mm_cvtsi128_si32(v_sum_d); -} - -// TODO(jingning): Evaluate the performance impact here. -#ifdef __GNUC__ -// This prevents GCC/Clang from inlining this function into -// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack -// maintenance instructions in the common case of 4x4.
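(Aside on the attribute removed just below: __attribute__((noinline)) kept the large n-by-n helper from being inlined into the dispatcher, so the dominant size == 4 call paid no stack-frame setup for the cold path. For context only, the usual shape of that pattern, with hypothetical names:)

#if defined(__GNUC__)
#define COLD_PATH __attribute__((noinline)) /* keep the cold helper out of line */
#else
#define COLD_PATH
#endif
COLD_PATH static uint64_t sum_squares_nxn(const int16_t *src, int stride, int size);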
-__attribute__((noinline)) -#endif -static uint64_t -vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { - int r, c; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); - __m128i v_acc_q = _mm_setzero_si128(); - - for (r = 0; r < size; r += 8) { - __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < size; c += 8) { - const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); - - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { + // Over 75% of all calls are with size == 4. 
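For orientation, every branch of the rewritten routine must agree with the plain-C definition of a 2-D sum of squares; a minimal scalar model (matching, to the best of my knowledge, the vpx_sum_squares_2d_i16_c fallback) looks like this:

/* Scalar model: accumulate src[r][c]^2 over a size x size block. */
static uint64_t sum_squares_2d_i16_model(const int16_t *src, int stride, int size) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < size; ++r) {
    for (c = 0; c < size; ++c) {
      const int v = src[r * stride + c];
      ss += (uint64_t)(v * v); /* at most 32767^2 per term, no overflow risk */
    }
  }
  return ss;
}

The SIMD paths get the square-and-add step in a single instruction: _mm_madd_epi16(v, v) multiplies each signed 16-bit lane by itself and sums adjacent lane pairs into 32-bit results.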
+ if (size == 4) { + __m128i s[2], sq[2], ss; + + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + s[0] = loadh_epi64(s[0], src + 1 * stride); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + s[1] = loadh_epi64(s[1], src + 3 * stride); + sq[0] = _mm_madd_epi16(s[0], s[0]); + sq[1] = _mm_madd_epi16(s[1], s[1]); + sq[0] = _mm_add_epi32(sq[0], sq[1]); + ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8)); + ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32)); + + return (uint64_t)_mm_cvtsi128_si32(ss); + } else { + // Generic case + int r = size; + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); - src += 8 * stride; - } + assert(size % 8 == 0); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + do { + int c = 0; + __m128i v_acc_d = _mm_setzero_si128(); + + do { + const int16_t *const b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + c += 8; + } while (c < size); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + r -= 8; + } while (r); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); #else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } #endif -} - -uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { - // 4 elements per row only requires half an XMM register, so this - // must be a special case, but also note that over 75% of all calls - // are with size == 4, so it is also the common case. 
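The new 4x4 fast path above packs two 64-bit rows into one XMM register via loadh_epi64() from the freshly included vpx_dsp/x86/mem_sse2.h before squaring. If memory serves, that helper boils down to the following sketch (an approximation, not a quotation of the header):

#include <emmintrin.h>
#include <xmmintrin.h>

/* Sketch: load 64 bits from src into the high half of s, keeping the low half. */
static __m128i loadh_epi64_sketch(const __m128i s, const void *const src) {
  return _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
}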
- if (size == 4) { - return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); - } else { - // Generic case - assert(size % 8 == 0); - return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } diff --git a/libs/libvpx/vpx_dsp/x86/transpose_sse2.h b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h index 8a0119ca7e5..6e07871b181 100644 --- a/libs/libvpx/vpx_dsp/x86/transpose_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ -#define VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#define VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ #include <emmintrin.h> // SSE2 @@ -364,4 +364,4 @@ static INLINE void transpose_32bit_8x4(const __m128i *const in, out[7] = _mm_unpackhi_epi64(a6, a7); } -#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ +#endif // VPX_VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h index 0a9542c85b4..de5ce43b00a 100644 --- a/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/libs/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_DSP_X86_TXFM_COMMON_SSE2_H_ -#define VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#ifndef VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#define VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ #include <emmintrin.h> #include "vpx/vpx_integer.h" @@ -29,4 +29,4 @@ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h)) -#endif // VPX_DSP_X86_TXFM_COMMON_SSE2_H_ +#endif // VPX_VPX_DSP_X86_TXFM_COMMON_SSE2_H_ diff --git a/libs/libvpx/vpx_dsp/x86/variance_avx2.c b/libs/libvpx/vpx_dsp/x86/variance_avx2.c index d15a89c746b..9232acbfbb3 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_avx2.c @@ -38,130 +38,140 @@ DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { }; /* clang-format on */ -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *sse, int *sum) { - unsigned int i, src_2strides, ref_2strides; - __m256i sum_reg = _mm256_setzero_si256(); - __m256i sse_reg = _mm256_setzero_si256(); - // process two 16 byte locations in a 256 bit register - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; ++i) { - // convert up values in 128 bit registers across lanes - const __m256i src0 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr))); - const __m256i src1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i const *)(src_ptr + source_stride))); - const __m256i ref0 = - _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr))); - const __m256i ref1 = _mm256_cvtepu8_epi16( - _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride))); - const __m256i diff0 = _mm256_sub_epi16(src0, ref0); - const __m256i diff1 = _mm256_sub_epi16(src1, ref1); - const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); - const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); - - // add to the running totals - sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); - sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - { - // extract the low lane and add it to the high lane - const __m128i sum_reg_128 = _mm_add_epi16( - _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); - const
__m128i sse_reg_128 = _mm_add_epi32( - _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); - - // sum upper and lower 64 bits together and convert up to 32 bit values - const __m128i sum_reg_64 = - _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); - const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); - - // unpack sse and sum registers and add - const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); - - // perform the final summation and extract the results - const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); - *((int *)sse) = _mm_cvtsi128_si32(res); - *((int *)sum) = _mm_extract_epi32(res, 1); +static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1)); + *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1)); +} + +static INLINE void variance_final_from_32bit_sum_avx2(__m256i vsse, + __m128i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sse_reg_128 = _mm_add_epi32(_mm256_castsi256_si128(vsse), + _mm256_extractf128_si256(vsse, 1)); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); +} + +static INLINE void variance_final_from_16bit_sum_avx2(__m256i vsse, + __m256i vsum, + unsigned int *const sse, + int *const sum) { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse, sum); +} + +static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { + const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum)); + const __m256i sum_hi = + _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1)); + return _mm256_add_epi32(sum_lo, sum_hi); +} + +static INLINE void variance16_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride)); + const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * 
src_stride)); + const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride)); + const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride)); + const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1); + const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance32_kernel_avx2(const uint8_t *const src, + const uint8_t *const ref, + __m256i *const sse, + __m256i *const sum) { + const __m256i s = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i r = _mm256_loadu_si256((__m256i const *)(ref)); + variance_kernel_avx2(s, r, sse, sum); +} + +static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; } } -static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *sse, int *sum) { - unsigned int i, src_2strides, ref_2strides; - const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); - __m256i sum_reg = _mm256_setzero_si256(); - __m256i sse_reg = _mm256_setzero_si256(); +static INLINE void variance32_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); - // process 64 elements in an iteration - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr)); - const __m256i src1 = - _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride)); - const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - const __m256i ref1 = - _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride)); - - // unpack into pairs of source and reference values - const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0); - const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0); - const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1); - const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1); - - // subtract adjacent elements using src*1 + ref*-1 - const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); - const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); - const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub); - const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub); - const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); - const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); - const __m256i madd2 = _mm256_madd_epi16(diff2, diff2); - const __m256i madd3 = _mm256_madd_epi16(diff3, diff3); - - // add to the running totals - sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); - sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3)); - sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); - sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src, ref, vsse, vsum); + src += src_stride; + ref += ref_stride; } +} + 
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); - { - // extract the low lane and add it to the high lane - const __m128i sum_reg_128 = _mm_add_epi16( - _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); - const __m128i sse_reg_128 = _mm_add_epi32( - _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); - - // sum upper and lower 64 bits together and convert up to 32 bit values - const __m128i sum_reg_64 = - _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); - const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); - - // unpack sse and sum registers and add - const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); - const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); - - // perform the final summation and extract the results - const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); - *((int *)sse) = _mm_cvtsi128_si32(res); - *((int *)sum) = _mm_extract_epi32(res, 1); + for (i = 0; i < h; i++) { + variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum); + variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum); + src += src_stride; + ref += ref_stride; } } +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, sum); +} + #define FILTER_SRC(filter) \ /* filter the source */ \ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ @@ -214,8 +224,9 @@ static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; int i; @@ -223,11 +234,11 @@ static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg); @@ -241,9 +252,10 @@ static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride, // (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction. 
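Offsets in these helpers are in eighth-pel units, so an offset of 4 means exactly half a pixel; there the bilinear filter collapses to the rounded two-tap average that pavgb/_mm256_avg_epu8 computes directly, which is why spv32_half_zero below needs no multiplies. The per-byte scalar equivalent, for orientation:

/* Scalar model of _mm256_avg_epu8 for one byte: rounded half-pel average. */
static uint8_t half_pel_avg(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}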
static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, - int do_sec, int height, __m256i *sum_reg, - __m256i *sse_reg, int sstep) { + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int sstep) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; int i; @@ -253,11 +265,11 @@ static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); @@ -270,24 +282,27 @@ static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { - spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, src_stride); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, src_stride); } static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { - spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, 1); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, 1); } static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); @@ -304,11 +319,11 @@ static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, prev_src_avg = src_avg; if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); @@ -323,9 +338,10 @@ static INLINE void 
spv32_x4_y4(const uint8_t *src, int src_stride, // (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, - int do_sec, int height, __m256i *sum_reg, - __m256i *sse_reg, int offset, int sstep) { + const uint8_t *second_pred, + int second_stride, int do_sec, int height, + __m256i *sum_reg, __m256i *sse_reg, + int offset, int sstep) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -341,10 +357,10 @@ static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, FILTER_SRC(filter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); - sec += sec_stride; + second_pred += second_stride; exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); } @@ -356,27 +372,27 @@ static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int y_offset) { - spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, y_offset, src_stride); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, y_offset, src_stride); } static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset) { - spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, sum_reg, sse_reg, x_offset, 1); + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, sum_reg, sse_reg, x_offset, 1); } static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int y_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int y_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -398,12 +414,12 @@ static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, FILTER_SRC(filter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += 
second_stride; } CALC_SUM_SSE_INSIDE_LOOP dst += dst_stride; @@ -413,9 +429,9 @@ static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i filter = _mm256_load_si256( @@ -446,11 +462,11 @@ static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, src_pack = _mm256_avg_epu8(src_pack, src_reg); if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); - sec += sec_stride; + second_pred += second_stride; } else { exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); @@ -464,9 +480,9 @@ static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, int do_sec, - int height, __m256i *sum_reg, __m256i *sse_reg, - int x_offset, int y_offset) { + const uint8_t *second_pred, int second_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int x_offset, int y_offset) { const __m256i zero_reg = _mm256_setzero_si256(); const __m256i pw8 = _mm256_set1_epi16(8); const __m256i xfilter = _mm256_load_si256( @@ -501,12 +517,12 @@ static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, FILTER_SRC(yfilter) if (do_sec) { - const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)second_pred); const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); - sec += sec_stride; + second_pred += second_stride; } prev_src_pack = src_pack; @@ -520,7 +536,7 @@ static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, - const uint8_t *sec, int sec_stride, + const uint8_t *second_pred, int second_stride, int do_sec, int height, unsigned int *sse) { const __m256i zero_reg = _mm256_setzero_si256(); __m256i sum_reg = _mm256_setzero_si256(); @@ -530,44 +546,44 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, // x_offset = 0 and y_offset = 0 if (x_offset == 0) { if (y_offset == 0) { - spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x0_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 0 and y_offset = 4 } else if (y_offset == 4) { - spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x0_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, 
&sse_reg); // x_offset = 0 and y_offset = bilin interpolation } else { - spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, y_offset); + spv32_x0_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); } // x_offset = 4 and y_offset = 0 } else if (x_offset == 4) { if (y_offset == 0) { - spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x4_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 4 and y_offset = 4 } else if (y_offset == 4) { - spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg); + spv32_x4_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg); // x_offset = 4 and y_offset = bilin interpolation } else { - spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, y_offset); + spv32_x4_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, y_offset); } // x_offset = bilin interpolation and y_offset = 0 } else { if (y_offset == 0) { - spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset); + spv32_xb_y0(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); // x_offset = bilin interpolation and y_offset = 4 } else if (y_offset == 4) { - spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset); + spv32_xb_y4(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset); // x_offset = bilin interpolation and y_offset = bilin interpolation } else { - spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, - height, &sum_reg, &sse_reg, x_offset, y_offset); + spv32_xb_yb(src, src_stride, dst, dst_stride, second_pred, second_stride, + do_sec, height, &sum_reg, &sse_reg, x_offset, y_offset); } } CALC_SUM_AND_SSE @@ -583,127 +599,177 @@ static unsigned int sub_pixel_variance32xh_avx2( static unsigned int sub_pixel_avg_variance32xh_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { + const uint8_t *dst, int dst_stride, const uint8_t *second_pred, + int second_stride, int height, unsigned int *sse) { return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, - sec, sec_stride, 1, height, sse); + second_pred, second_stride, 1, height, sse); } -typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -static void variance_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int w, int h, - unsigned int *sse, int *sum, get_var_avx2 var_fn, - int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += 16) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j], - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } - } +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int 
src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 7); } -unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, - vpx_get16x16var_avx2, 16); + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse) { +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { int sum; - vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - get32x16var_avx2, 32); + __m256i vsse, vsum; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - get32x16var_avx2, 32); + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum_128 = _mm_add_epi16(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + vsum_128 = _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128), + _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8))); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - get32x16var_avx2, 32); - return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); + __m256i vsse, vsum; + __m128i vsum_128; + variance32_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, 
&sum); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; int sum; - variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - get32x16var_avx2, 32); + variance64_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + vsum = sum_to_32bit_avx2(vsum); + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse = _mm256_setzero_si256(); + __m256i vsum = _mm256_setzero_si256(); + __m128i vsum_128; + int sum; + int i = 0; + + for (i = 0; i < 2; i++) { + __m256i vsum16; + variance64_avx2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); + } + vsum_128 = _mm_add_epi32(_mm256_castsi256_si128(vsum), + _mm256_extractf128_si256(vsum, 1)); + variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); +} + +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + int sum; + __m256i vsse, vsum; + variance16_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse; +} + +unsigned int vpx_sub_pixel_variance64x64_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { unsigned int sse1; const int se1 = sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 64, &sse1); unsigned int sse2; const int se2 = - sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src_ptr + 32, src_stride, x_offset, y_offset, + ref_ptr + 32, ref_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); } -unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, - int src_stride, int x_offset, - int y_offset, const uint8_t *dst, - int dst_stride, - unsigned int *sse) { +unsigned int vpx_sub_pixel_variance32x32_avx2( + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { const int se = 
sub_pixel_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } unsigned int vpx_sub_pixel_avg_variance64x64_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { unsigned int sse1; - const int se1 = sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); + const int se1 = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 64, 64, &sse1); unsigned int sse2; const int se2 = sub_pixel_avg_variance32xh_avx2( - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, - 64, 64, &sse2); + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, ref_stride, + second_pred + 32, 64, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; @@ -712,10 +778,12 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( } unsigned int vpx_sub_pixel_avg_variance32x32_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, + const uint8_t *second_pred) { // Process 32 elements in parallel. - const int se = sub_pixel_avg_variance32xh_avx2( - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); + const int se = sub_pixel_avg_variance32xh_avx2(src_ptr, src_stride, x_offset, + y_offset, ref_ptr, ref_stride, + second_pred, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/libs/libvpx/vpx_dsp/x86/variance_sse2.c b/libs/libvpx/vpx_dsp/x86/variance_sse2.c index 8d8bf183b28..37ef64ecaa0 100644 --- a/libs/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libs/libvpx/vpx_dsp/x86/variance_sse2.c @@ -8,312 +8,426 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <emmintrin.h> // SSE2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" - #include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" -typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, - unsigned int *sse, int *sum); +static INLINE unsigned int add32x4_sse2(__m128i val) { + val = _mm_add_epi32(val, _mm_srli_si128(val, 8)); + val = _mm_add_epi32(val, _mm_srli_si128(val, 4)); + return _mm_cvtsi128_si32(val); +} -unsigned int vpx_get_mb_ss_sse2(const int16_t *src) { +unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) { __m128i vsum = _mm_setzero_si128(); int i; for (i = 0; i < 32; ++i) { - const __m128i v = _mm_loadu_si128((const __m128i *)src); + const __m128i v = _mm_loadu_si128((const __m128i *)src_ptr); vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v)); - src += 8; + src_ptr += 8; } - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - return _mm_cvtsi128_si32(vsum); + return add32x4_sse2(vsum); } -#define READ64(p, stride, i) \ - _mm_unpacklo_epi8( \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \ - _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride))) +static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) { + const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride)); + const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride)); + const __m128i p01 = _mm_unpacklo_epi32(p0, p1); + return _mm_unpacklo_epi8(p01, _mm_setzero_si128()); +} + +static INLINE void variance_kernel_sse2(const __m128i src_ptr, + const __m128i ref_ptr, + __m128i *const sse, + __m128i *const sum) { + const __m128i diff = _mm_sub_epi16(src_ptr, ref_ptr); + *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff)); + *sum = _mm_add_epi16(*sum, diff); +} + +// Can handle 128 pixels' diff sum (such as 8x16 or 16x8) +// Slightly faster than variance_final_256_pel_sse2() +static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); -static void get4x4var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, - unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero); - const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero); - const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero); - const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - // sum - __m128i vsum = _mm_add_epi16(diff0, diff1); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); *sum = (int16_t)_mm_extract_epi16(vsum, 0); +} + +// Can handle 256 pixels' diff sum (such as 16x16) +static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); - // sse - vsum = - _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); - *sse = _mm_cvtsi128_si32(vsum); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); + *sum =
(int16_t)_mm_extract_epi16(vsum, 0); + *sum += (int16_t)_mm_extract_epi16(vsum, 1); } -void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref, - int ref_stride, unsigned int *sse, int *sum) { - const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); +// Can handle 512 pixels' diff sum (such as 16x32 or 32x16) +static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum, + unsigned int *const sse, + int *const sum) { + *sse = add32x4_sse2(vsse); + + vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); + vsum = _mm_unpacklo_epi16(vsum, vsum); + vsum = _mm_srai_epi32(vsum, 16); + *sum = add32x4_sse2(vsum); +} + +static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) { + const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16); + const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16); + return _mm_add_epi32(sum_lo, sum_hi); +} + +// Can handle 1024 pixels' diff sum (such as 32x32) +static INLINE int sum_final_sse2(const __m128i sum) { + const __m128i t = sum_to_32bit_sse2(sum); + return add32x4_sse2(t); +} + +static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { int i; - for (i = 0; i < 8; i += 2) { - const __m128i src0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero); - const __m128i ref0 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); - - const __m128i src1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero); - const __m128i ref1 = _mm_unpacklo_epi8( - _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); - - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + assert(h <= 256); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; i += 2) { + const __m128i s = load4x2_sse2(src_ptr, src_stride); + const __m128i r = load4x2_sse2(ref_ptr, ref_stride); + + variance_kernel_sse2(s, r, sse, sum); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } +} - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2)); - *sum = (int16_t)_mm_extract_epi16(vsum, 0); +static INLINE void variance8_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + const __m128i zero = _mm_setzero_si128(); + int i; + + assert(h <= 128); // May overflow for larger height. 
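The height caps in these asserts fall out of worst-case arithmetic on the 16-bit sum lanes: every diff lies in [-255, 255], so a lane can safely absorb at most 32767 / 255 = 128 of them. A small model of that budget (my own illustration, not code from the patch):

/* A 16-bit lane overflows once it has accumulated more than 128 diffs. */
static int lane_budget_ok(int diffs_per_lane) {
  return 255 * diffs_per_lane <= 32767; /* holds up to diffs_per_lane == 128 */
}
/* variance4 packs two rows per load -> h/2 diffs per lane -> h <= 256;
 * variance8 adds one diff per row   -> h diffs per lane   -> h <= 128;
 * variance16 (below) adds two       -> 2*h per lane       -> h <= 64. */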
+ *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); + for (i = 0; i < h; i++) { + const __m128i s = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src_ptr), zero); + const __m128i r = + _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ref_ptr), zero); + + variance_kernel_sse2(s, r, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } } -void vpx_get16x16var_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum) { +static INLINE void variance16_kernel_sse2(const uint8_t *const src_ptr, + const uint8_t *const ref_ptr, + __m128i *const sse, + __m128i *const sum) { const __m128i zero = _mm_setzero_si128(); - __m128i vsum = _mm_setzero_si128(); - __m128i vsse = _mm_setzero_si128(); + const __m128i s = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i r = _mm_loadu_si128((const __m128i *)ref_ptr); + const __m128i src0 = _mm_unpacklo_epi8(s, zero); + const __m128i ref0 = _mm_unpacklo_epi8(r, zero); + const __m128i src1 = _mm_unpackhi_epi8(s, zero); + const __m128i ref1 = _mm_unpackhi_epi8(r, zero); + + variance_kernel_sse2(src0, ref0, sse, sum); + variance_kernel_sse2(src1, ref1, sse, sum); +} + +static INLINE void variance16_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { int i; - for (i = 0; i < 16; ++i) { - const __m128i s = _mm_loadu_si128((const __m128i *)src); - const __m128i r = _mm_loadu_si128((const __m128i *)ref); + assert(h <= 64); // May overflow for larger height. + *sse = _mm_setzero_si128(); + *sum = _mm_setzero_si128(); - const __m128i src0 = _mm_unpacklo_epi8(s, zero); - const __m128i ref0 = _mm_unpacklo_epi8(r, zero); - const __m128i diff0 = _mm_sub_epi16(src0, ref0); + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr, ref_ptr, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; + } +} - const __m128i src1 = _mm_unpackhi_epi8(s, zero); - const __m128i ref1 = _mm_unpackhi_epi8(r, zero); - const __m128i diff1 = _mm_sub_epi16(src1, ref1); +static INLINE void variance32_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; - vsum = _mm_add_epi16(vsum, diff0); - vsum = _mm_add_epi16(vsum, diff1); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0)); - vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1)); + assert(h <= 32); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. 
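Given that accumulation convention, a caller that needs more rows than one call's bound allows is expected to zero vsse once, invoke the helper per slice, and widen each slice's 16-bit sum before adding, mirroring the AVX2 64x64 routine earlier in this patch. A sketch of how a 32x64 caller can stay inside the h <= 32 bound (the slicing layout here is my assumption):

__m128i vsse = _mm_setzero_si128(); /* zeroed once, accumulated across slices */
__m128i vsum32 = _mm_setzero_si128();
int k;
for (k = 0; k < 2; ++k) { /* two 32-row slices of the 32x64 block */
  __m128i vsum16;
  variance32_sse2(src_ptr + 32 * k * src_stride, src_stride,
                  ref_ptr + 32 * k * ref_stride, ref_stride, 32, &vsse, &vsum16);
  vsum32 = _mm_add_epi32(vsum32, sum_to_32bit_sse2(vsum16));
}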
+ *sum = _mm_setzero_si128(); - src += src_stride; - ref += ref_stride; + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; } +} - // sum - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8)); - vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4)); - *sum = - (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1); - - // sse - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8)); - vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4)); - *sse = _mm_cvtsi128_si32(vsse); -} - -static void variance_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, int w, - int h, unsigned int *sse, int *sum, - getNxMvar_fn_t var_fn, int block_size) { - int i, j; - - *sse = 0; - *sum = 0; - - for (i = 0; i < h; i += block_size) { - for (j = 0; j < w; j += block_size) { - unsigned int sse0; - int sum0; - var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, - ref_stride, &sse0, &sum0); - *sse += sse0; - *sum += sum0; - } +static INLINE void variance64_sse2(const uint8_t *src_ptr, const int src_stride, + const uint8_t *ref_ptr, const int ref_stride, + const int h, __m128i *const sse, + __m128i *const sum) { + int i; + + assert(h <= 16); // May overflow for larger height. + // Don't initialize sse here since it's an accumulation. + *sum = _mm_setzero_si128(); + + for (i = 0; i < h; ++i) { + variance16_kernel_sse2(src_ptr + 0, ref_ptr + 0, sse, sum); + variance16_kernel_sse2(src_ptr + 16, ref_ptr + 16, sse, sum); + variance16_kernel_sse2(src_ptr + 32, ref_ptr + 32, sse, sum); + variance16_kernel_sse2(src_ptr + 48, ref_ptr + 48, sse, sum); + src_ptr += src_stride; + ref_ptr += ref_stride; } } -unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, sum); +} + +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + __m128i vsse, vsum; + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, sum); +} + +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 4); } -unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum, - get4x4var_sse2, 4); + variance4_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride, - const 
uint8_t *ref, int ref_stride, +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum, - get4x4var_sse2, 4); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 5); } -unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 6); } -unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum, - vpx_get8x8var_sse2, 8); + variance8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum, - vpx_get8x8var_sse2, 8); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_128_pel_sse2(vsse, vsum, sse, &sum); return *sse - ((sum * sum) >> 7); } -unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, - const unsigned char *ref, int ref_stride, +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_256_pel_sse2(vsse, vsum, sse, &sum); return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } -unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse, vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); + variance16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = 
_mm_setzero_si128(); + __m128i vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get16x16var_sse2, 16); + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + variance_final_512_pel_sse2(vsse, vsum, sse, &sum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); } -unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum; int sum; - variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 9); + variance32_sse2(src_ptr, src_stride, ref_ptr, ref_stride, 32, &vsse, &vsum); + *sse = add32x4_sse2(vsse); + sum = sum_final_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 10); } -unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance32_sse2(src_ptr + 32 * i * src_stride, src_stride, + ref_ptr + 32 * i * ref_stride, ref_stride, 32, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get16x16var_sse2, 16); + int i = 0; + + for (i = 0; i < 2; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = add32x4_sse2(vsum); return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { + __m128i vsse = _mm_setzero_si128(); + __m128i vsum = _mm_setzero_si128(); int sum; - variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum, - vpx_get16x16var_sse2, 16); - return *sse - (unsigned int)(((int64_t)sum * sum) >> 11); + int i = 0; + + for (i = 0; i < 4; i++) { + __m128i vsum16; + variance64_sse2(src_ptr + 16 * i * src_stride, src_stride, + ref_ptr + 16 * i * ref_stride, ref_stride, 16, &vsse, + &vsum16); + vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); + } + *sse = add32x4_sse2(vsse); + sum = add32x4_sse2(vsum); + return *sse - (unsigned int)(((int64_t)sum * sum) >> 12); } -unsigned int 
vpx_mse8x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance8x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x8_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } -unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { - vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse); + vpx_variance16x16_sse2(src_ptr, src_stride, ref_ptr, ref_stride, sse); return *sse; } // The 2 unused parameters are place holders for PIC enabled build. // These definitions are for functions defined in subpel_variance.asm -#define DECL(w, opt) \ - int vpx_sub_pixel_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \ - void *unused0, void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -324,36 +438,37 @@ DECLS(ssse3, ssse3); #undef DECLS #undef DECL -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ - y_offset, dst, dst_stride, \ - h, &sse, NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \ - &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sse_ptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_##opt( \ + 
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, y_offset, ref_ptr, ref_stride, h, \ + &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #define FNS(opt1, opt2) \ @@ -378,12 +493,12 @@ FNS(ssse3, ssse3); #undef FN // The 2 unused parameters are place holders for PIC enabled build. -#define DECL(w, opt) \ - int vpx_sub_pixel_avg_variance##w##xh_##opt( \ - const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \ - ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \ - void *unused) +#define DECL(w, opt) \ + int vpx_sub_pixel_avg_variance##w##xh_##opt( \ + const uint8_t *src_ptr, ptrdiff_t src_stride, int x_offset, \ + int y_offset, const uint8_t *ref_ptr, ptrdiff_t ref_stride, \ + const uint8_t *second_pred, ptrdiff_t second_stride, int height, \ + unsigned int *sse, void *unused0, void *unused) #define DECLS(opt1, opt2) \ DECL(4, opt1); \ DECL(8, opt1); \ @@ -394,37 +509,38 @@ DECLS(ssse3, ssse3); #undef DECL #undef DECLS -#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ - unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ - const uint8_t *src, int src_stride, int x_offset, int y_offset, \ - const uint8_t *dst, int dst_stride, unsigned int *sseptr, \ - const uint8_t *sec) { \ - unsigned int sse; \ - int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \ - NULL, NULL); \ - if (w > wf) { \ - unsigned int sse2; \ - int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \ - sec + 16, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - if (w > wf * 2) { \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \ - sec + 32, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ - src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \ - sec + 48, w, h, &sse2, NULL, NULL); \ - se += se2; \ - sse += sse2; \ - } \ - } \ - *sseptr = sse; \ - return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ +#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \ + unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt( \ + const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ + const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + unsigned int sse_tmp; \ + int se = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr, src_stride, x_offset, 
y_offset, ref_ptr, ref_stride, \ + second_pred, w, h, &sse_tmp, NULL, NULL); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 16, src_stride, x_offset, y_offset, ref_ptr + 16, \ + ref_stride, second_pred + 16, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + if (w > wf * 2) { \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 32, src_stride, x_offset, y_offset, ref_ptr + 32, \ + ref_stride, second_pred + 32, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt( \ + src_ptr + 48, src_stride, x_offset, y_offset, ref_ptr + 48, \ + ref_stride, second_pred + 48, w, h, &sse2, NULL, NULL); \ + se += se2; \ + sse_tmp += sse2; \ + } \ + } \ + *sse = sse_tmp; \ + return sse_tmp - \ + (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \ } #define FNS(opt1, opt2) \ diff --git a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c deleted file mode 100644 index 4f164afeb4e..00000000000 --- a/libs/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/convolve.h" - -#if HAVE_SSE2 -filter8_1dfunction vpx_filter_block1d16_v8_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_sse2; -filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2; - -filter8_1dfunction vpx_filter_block1d16_v2_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_sse2; -filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; -filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; - -// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, 
int y0_q4, -// int y_step_q4, int w, int h); -// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); - -// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h); -FUN_CONV_2D(, sse2); -FUN_CONV_2D(avg_, sse2); - -#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; - -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; -highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; - -// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, -// ptrdiff_t src_stride, -// uint8_t *dst, -// ptrdiff_t 
dst_stride, -// const int16_t *filter_x, -// int x_step_q4, -// const int16_t *filter_y, -// int y_step_q4, -// int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, - sse2); - -// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, int y_step_q4, -// int w, int h, int bd); -// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const InterpKernel *filter, int x0_q4, -// int32_t x_step_q4, int y0_q4, -// int y_step_q4, int w, int h, int bd); -HIGH_FUN_CONV_2D(, sse2); -HIGH_FUN_CONV_2D(avg_, sse2); -#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 -#endif // HAVE_SSE2 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index d83507dc995..c57149657ab 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -45,7 +45,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -121,7 +121,7 @@ ;Compute max and min values of a pixel mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm0, rdx movq xmm1, rcx pshufd xmm0, xmm0, 0b @@ -199,7 +199,7 @@ SECTION .text -;void vpx_filter_block1d4_v8_sse2 +;void vpx_highbd_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -269,7 +269,7 @@ sym(vpx_highbd_filter_block1d4_v8_sse2): pop rbp ret -;void vpx_filter_block1d8_v8_sse2 +;void vpx_highbd_filter_block1d8_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -328,7 +328,7 @@ sym(vpx_highbd_filter_block1d8_v8_sse2): pop rbp ret -;void vpx_filter_block1d16_v8_sse2 +;void vpx_highbd_filter_block1d16_v8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pitch, @@ -554,7 +554,7 @@ sym(vpx_highbd_filter_block1d16_v8_avg_sse2): pop rbp ret -;void vpx_filter_block1d4_h8_sse2 +;void vpx_highbd_filter_block1d4_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -629,7 +629,7 @@ sym(vpx_highbd_filter_block1d4_h8_sse2): pop rbp ret -;void vpx_filter_block1d8_h8_sse2 +;void vpx_highbd_filter_block1d8_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, @@ -695,7 +695,7 @@ sym(vpx_highbd_filter_block1d8_h8_sse2): pop rbp ret -;void vpx_filter_block1d16_h8_sse2 +;void vpx_highbd_filter_block1d16_h8_sse2 ;( ; unsigned char *src_ptr, ; unsigned int src_pixels_per_line, diff --git a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 9bffe504b1f..87bf75ebb8e 100644 --- a/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/libs/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -26,7 +26,7 @@ pshufd xmm3, xmm3, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR arg(6) ;bd movq xmm5, rdx movq xmm2, rcx pshufd xmm5, xmm5, 0b @@ -82,7 +82,7 @@ pshufd xmm4, xmm4, 0 mov rdx, 0x00010001 - movsxd rcx, DWORD PTR arg(6) ;bps + movsxd rcx, DWORD PTR 
arg(6) ;bd movq xmm8, rdx movq xmm5, rcx pshufd xmm8, xmm8, 0b diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c new file mode 100644 index 00000000000..e0e8b8f9010 --- /dev/null +++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -0,0 +1,1161 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_ports/mem.h" + +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + +static void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first, dst_second; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4] + // The two results are then added together for the first half of even + // output.
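+ // (With the kernel segments k[2],k[3] and k[4],k[5], even output 0 works out to s[-1]k[2] + s[0]k[3] + s[1]k[4] + s[2]k[5].)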
+ // Repeat multiple times to get the whole output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 6 4 2 0 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + dst_first = mm_zip_epi32_sse2(&even, &odd); + + // Do again to get the second half of dst + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 14 12 10 8 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 15 13 11 9 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the second half of the dst + dst_second = mm_zip_epi32_sse2(&even, &odd); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, &reg_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +/* The macro used to generate functions shifts the src_ptr up by 3 rows already + * */ + +static void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1, src_reg_m10_lo_2, src_reg_m10_hi_1, + src_reg_m10_hi_2; + __m128i src_reg_01_lo_1, src_reg_01_lo_2, src_reg_01_hi_1, src_reg_01_hi_2; + __m128i src_reg_12_lo_1, src_reg_12_lo_2, src_reg_12_hi_1, src_reg_12_hi_2; + __m128i src_reg_23_lo_1, src_reg_23_lo_2, src_reg_23_hi_1, src_reg_23_hi_2; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ...
s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_hi_1 = _mm_unpacklo_epi8(src_reg_m10_hi, _mm_setzero_si128()); + src_reg_m10_hi_2 = _mm_unpackhi_epi8(src_reg_m10_hi, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_hi_1 = _mm_unpacklo_epi8(src_reg_01_hi, _mm_setzero_si128()); + src_reg_01_hi_2 = _mm_unpackhi_epi8(src_reg_01_hi, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Now repeat everything again for the second half + // Partial output for second half + res_reg_m10_hi = mm_madd_packs_epi16_sse2( + &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23); + + res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2, + &kernel_reg_23); + + src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128()); + src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128()); + res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2, + &kernel_reg_45); + + src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128()); + src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128()); + res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, 
&src_reg_23_hi_2, + &kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6); + + // Combine to get the result + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi); + + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_m10_hi_1 = src_reg_12_hi_1; + src_reg_m10_hi_2 = src_reg_12_hi_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_01_hi_1 = src_reg_23_hi_1; + src_reg_01_hi_2 = src_reg_23_hi_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i even, odd; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ...
s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4] + // The two results are then added together to get the even output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Output 6 4 2 0 + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + dst_first = mm_zip_epi32_sse2(&even, &odd); + dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6); + + // Saturate and convert to 8-bit words + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo; + __m128i src_reg_12_lo, src_reg_23_lo; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1, src_reg_m10_lo_2; + __m128i src_reg_01_lo_1, src_reg_01_lo_2; + __m128i src_reg_12_lo_1, src_reg_12_lo_2; + __m128i src_reg_23_lo_1, src_reg_23_lo_2; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output.
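+ // (For example, output row 0 is k[2]*s[-1,x] + k[3]*s[0,x] + k[4]*s[1,x] + k[5]*s[2,x]: the k[2],k[3] madd of rows -1,0 plus the k[4],k[5] madd of rows 1,2.)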
+ + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + src_reg_m10_lo_2 = _mm_unpackhi_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + src_reg_01_lo_2 = _mm_unpackhi_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = mm_madd_packs_epi16_sse2( + &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); + + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, _mm_setzero_si128()); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_m10_lo_2 = src_reg_12_lo_2; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_01_lo_2 = src_reg_23_lo_2; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i dst_first; + __m128i tmp_0, tmp_1; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will load multiple
shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Then we call multiply and add to get partial results + // s[1]k[3]+s[0]k[2] s[0]k[3]+s[-1]k[2] + // s[3]k[5]+s[2]k[4] s[2]k[5]+s[1]k[4] + // The two results are then added together to get the output + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 1); + src_reg_shift_2 = _mm_srli_si128(src_reg, 2); + src_reg_shift_3 = _mm_srli_si128(src_reg, 3); + + // Convert to 16-bit words + src_reg = _mm_unpacklo_epi8(src_reg, _mm_setzero_si128()); + src_reg_shift_1 = _mm_unpacklo_epi8(src_reg_shift_1, _mm_setzero_si128()); + src_reg_shift_2 = _mm_unpacklo_epi8(src_reg_shift_2, _mm_setzero_si128()); + src_reg_shift_3 = _mm_unpacklo_epi8(src_reg_shift_3, _mm_setzero_si128()); + + // Shuffle into the right format + tmp_0 = _mm_unpacklo_epi32(src_reg, src_reg_shift_1); + tmp_1 = _mm_unpacklo_epi32(src_reg_shift_2, src_reg_shift_3); + + // Partial output + tmp_0 = _mm_madd_epi16(tmp_0, kernel_reg_23); + tmp_1 = _mm_madd_epi16(tmp_1, kernel_reg_45); + + // Output + dst_first = _mm_add_epi32(tmp_0, tmp_1); + dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128()); + + dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6); + + // Saturate and convert to 8-bit words + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo; + __m128i src_reg_12_lo, src_reg_23_lo; + // Half of half of the interleaved rows + __m128i src_reg_m10_lo_1; + __m128i src_reg_01_lo_1; + __m128i src_reg_12_lo_1; + __m128i src_reg_23_lo_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // We will load two rows of pixels as 8-bit words, rearrange them as 16-bit + // words, + // shuffle the data into the form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output.
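+ // (Unlike the 8-wide version above, only 4 output pixels are needed per row, so just the low interleaved half of each row pair is processed.)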
+ + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_lo_1 = _mm_unpacklo_epi8(src_reg_m10_lo, _mm_setzero_si128()); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_lo_1 = _mm_unpacklo_epi8(src_reg_01_lo, _mm_setzero_si128()); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10_lo = + mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, &reg_zero, &kernel_reg_23); + + res_reg_01_lo = + mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &reg_zero, &kernel_reg_23); + + src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); + res_reg_12_lo = + mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &reg_zero, &kernel_reg_45); + + src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); + res_reg_23_lo = + mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &reg_zero, &kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6); + + // Convert to 8-bit words + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero); + res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero); + + // Save only half of the register (8 words) + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012); + *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo_1 = src_reg_12_lo_1; + src_reg_01_lo_1 = src_reg_23_lo_1; + src_reg_1 = src_reg_3; + } +} + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +static void vpx_highbd_filter_block1d4_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ...
s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4] + // The two results are then added together to get the even output + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 2); + src_reg_shift_2 = _mm_srli_si128(src_reg, 4); + src_reg_shift_3 = _mm_srli_si128(src_reg, 6); + + // Output 2 0 + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 3 1 + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + res_reg = _mm_unpacklo_epi32(even, odd); + res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS); + res_reg = _mm_packs_epi32(res_reg, reg_zero); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + _mm_storel_epi64((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_highbd_filter_block1d4_v4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source.
lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23); + res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45); + + // Add to get results + res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = + mm_round_epi32_sse2(&res_reg_m1012, &reg_round, CONV8_ROUNDING_BITS); + res_reg_0123 = + mm_round_epi32_sse2(&res_reg_0123, &reg_round, CONV8_ROUNDING_BITS); + + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_highbd_filter_block1d8_h4_sse2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4] + // The two results are then added together for the first half of even + // output.
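+ // (The pixels are already 16-bit here, so the shifted rows are stitched together from two overlapping loads with _mm_srli_si128 and _mm_unpacklo_epi64 rather than byte shifts.)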
+  // Repeat multiple times to get the whole output
+
+  __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2,
+      src_reg_shift_3;
+  __m128i res_reg;
+  __m128i even, odd;
+  __m128i tmp_0, tmp_1;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+  int h;
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  for (h = height; h > 0; --h) {
+    // We will put first half in the first half of the reg, and second half in
+    // second half
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+
+    // Output 6 4 2 0
+    tmp_0 = _mm_srli_si128(src_reg, 4);
+    tmp_1 = _mm_srli_si128(src_reg_next, 2);
+    src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
+                                  &kernel_reg_45);
+
+    // Output 7 5 3 1
+    tmp_0 = _mm_srli_si128(src_reg, 2);
+    tmp_1 = src_reg_next;
+    src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    tmp_0 = _mm_srli_si128(src_reg, 6);
+    tmp_1 = _mm_srli_si128(src_reg_next, 4);
+    src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1);
+
+    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
+                                 &kernel_reg_23, &kernel_reg_45);
+
+    // Combine to get the dst
+    even = mm_round_epi32_sse2(&even, &reg_round, CONV8_ROUNDING_BITS);
+    odd = mm_round_epi32_sse2(&odd, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg = mm_zip_epi32_sse2(&even, &odd);
+
+    // Saturate the result and save
+    res_reg = _mm_min_epi16(res_reg, reg_max);
+    res_reg = _mm_max_epi16(res_reg, reg_zero);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_highbd_filter_block1d8_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  // We will load two rows of pixels as 16-bit words, and shuffle them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // ... s[0,7] s[-1,7] s[0,6] s[-1,6]
+  // ... s[0,9] s[-1,9] s[0,8] s[-1,8]
+  // ... s[0,13] s[-1,13] s[0,12] s[-1,12]
+  // so that we can call multiply and add with the kernel to get 32-bit words of
+  // the form
+  // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2]
+  // Finally, we can add multiple rows together to get the desired output.
+
+  // Register for source s[-1:3, :]
+  __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3;
+  // Interleaved rows of the source.
+  // lo is first half, hi second
+  __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi;
+  __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi;
+
+  // Result after multiply and add
+  __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo;
+  __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi;
+  __m128i res_reg_m1012, res_reg_0123;
+  __m128i res_reg_m1012_lo, res_reg_0123_lo;
+  __m128i res_reg_m1012_hi, res_reg_0123_hi;
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+
+  const __m128i reg_round =
+      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
+  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
+  const __m128i reg_zero = _mm_setzero_si128();
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
+  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0);
+  src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1);
+  src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3));
+
+    src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2);
+    src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4));
+
+    src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3);
+    src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3);
+
+    // Partial output for first half
+    res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23);
+    res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23);
+    res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45);
+    res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo);
+    res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo);
+
+    // Round the words
+    res_reg_m1012_lo =
+        mm_round_epi32_sse2(&res_reg_m1012_lo, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_lo =
+        mm_round_epi32_sse2(&res_reg_0123_lo, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Partial output for second half
+    res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23);
+    res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23);
+    res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45);
+    res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45);
+
+    // Add to get results
+    res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi);
+    res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi);
+
+    // Round the words
+    res_reg_m1012_hi =
+        mm_round_epi32_sse2(&res_reg_m1012_hi, &reg_round, CONV8_ROUNDING_BITS);
+    res_reg_0123_hi =
+        mm_round_epi32_sse2(&res_reg_0123_hi, &reg_round, CONV8_ROUNDING_BITS);
+
+    // Combine the two halves
+    res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 =
_mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi);
+
+    // Saturate according to bit depth
+    res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max);
+    res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max);
+    res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero);
+    res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero);
+
+    // Save the whole register (8 words per row)
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo = src_reg_12_lo;
+    src_reg_m10_hi = src_reg_12_hi;
+    src_reg_01_lo = src_reg_23_lo;
+    src_reg_01_hi = src_reg_23_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_highbd_filter_block1d16_h4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+
+static void vpx_highbd_filter_block1d16_v4_sse2(
+    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
+    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
+  vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride,
+                                     height, kernel, bd);
+  vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8,
+                                     dst_stride, height, kernel, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64
+
+// From vpx_subpixel_8t_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v8_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_sse2;
+filter8_1dfunction vpx_filter_block1d16_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d16_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d8_h8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_v8_avg_sse2;
+filter8_1dfunction vpx_filter_block1d4_h8_avg_sse2;
+
+// Use the [vh]8 version because there is no [vh]4 implementation.
+#define vpx_filter_block1d16_v4_avg_sse2 vpx_filter_block1d16_v8_avg_sse2
+#define vpx_filter_block1d16_h4_avg_sse2 vpx_filter_block1d16_h8_avg_sse2
+#define vpx_filter_block1d8_v4_avg_sse2 vpx_filter_block1d8_v8_avg_sse2
+#define vpx_filter_block1d8_h4_avg_sse2 vpx_filter_block1d8_h8_avg_sse2
+#define vpx_filter_block1d4_v4_avg_sse2 vpx_filter_block1d4_v8_avg_sse2
+#define vpx_filter_block1d4_h4_avg_sse2 vpx_filter_block1d4_h8_avg_sse2
+
+// From vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm.
+filter8_1dfunction vpx_filter_block1d16_v2_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_sse2; +filter8_1dfunction vpx_filter_block1d16_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d16_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d8_h2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_v2_avg_sse2; +filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; + +// void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, , + sse2, 0); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1); + +// void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +// void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); +FUN_CONV_2D(, sse2, 0); +FUN_CONV_2D(avg_, sse2, 1); + +#if CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 +// From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; + +// Use the [vh]8 version because there is no [vh]4 implementation. 
+#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + +// From vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm. +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; + +// void vpx_highbd_convolve8_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_horiz_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_vert_sse2(const uint8_t *src, +// ptrdiff_t src_stride, +// uint8_t *dst, +// ptrdiff_t dst_stride, +// const int16_t *filter_x, +// int x_step_q4, +// const int16_t *filter_y, +// int y_step_q4, +// int w, int h, int bd); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), , sse2, 0); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1); + +// void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h, int bd); +// void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); +HIGH_FUN_CONV_2D(, sse2, 0); +HIGH_FUN_CONV_2D(avg_, sse2, 1); +#endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index d0919695ce9..55919f9a0ce 100644 --- 
a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -9,22 +9,24 @@
 */
#include <immintrin.h>
+#include
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
#include "vpx_dsp/x86/convolve_avx2.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_ports/mem.h"
// filters for 16_h8
-DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+                                           6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+                                           3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
+DECLARE_ALIGNED(32, static const uint8_t,
+                filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+                                           8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+                                           5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
@@ -326,6 +328,570 @@ static void vpx_filter_block1d16_v8_avg_avx2(
    height, filter, 1);
}
+static void vpx_filter_block1d16_h4_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+  // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a
+  // time.
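The scheme sketched in the comment above is easiest to sanity-check against a scalar model. Below is a minimal sketch with illustrative names that are not part of libvpx or of this patch: it assumes the usual 8-tap Q7 kernel with only taps 2..5 non-zero, and rounds at full precision with (sum + 64) >> 7, which the pre-shifted kernel plus the (x + 32) >> 6 rounding used by the SIMD path matches up to the 1-bit truncation of the odd taps.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of one row of the 4-tap horizontal pass. clamp_u8 and
     * filter_row_4tap_ref are illustrative names, not libvpx functions. */
    static uint8_t clamp_u8(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    static void filter_row_4tap_ref(const uint8_t *src, /* needs src[-1..w+1] */
                                    uint8_t *dst, int w, const int16_t *kernel) {
      for (int x = 0; x < w; ++x) {
        int sum = 0;
        /* taps 2..5, with one sample of history, mirroring src_ptr -= 1 */
        for (int k = 2; k <= 5; ++k) sum += kernel[k] * src[x + k - 3];
        dst[x] = clamp_u8((sum + 64) >> 7); /* round the Q7 sum to a pixel */
      }
    }

    int main(void) {
      const uint8_t row[12] = { 0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110 };
      const int16_t taps[8] = { 0, 0, 16, 48, 48, 16, 0, 0 }; /* sums to 128 */
      uint8_t out[8];
      filter_row_4tap_ref(row + 2, out, 8, taps); /* row + 2 so src[-1] exists */
      for (int i = 0; i < 8; ++i) printf("%u ", out[i]);
      printf("\n");
      return 0;
    }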
+ + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_first, dst_second; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 8, src_ptr + src_stride + 8); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for second half + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + dst_second = mm256_round_epi16(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm256_packus_epi16(dst_first, dst_second); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_first); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); + // Reorder into 2 1 1 2 + src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); + + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm256_adds_epi16(tmp_0, tmp_1); + + dst_first = mm256_round_epi16(&dst_first, ®_32, 6); + + dst_first = _mm256_packus_epi16(dst_first, dst_first); + dst_first = _mm256_permute4x64_epi64(dst_first, 0x8); + + _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(dst_first)); + } +} + +static void vpx_filter_block1d16_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them 
into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg; // Kernel + __m256i kernel_reg_256, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001_lo, res_reg_1223_lo, res_reg_m1001_hi, res_reg_1223_hi; + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_256 = _mm256_broadcastsi128_si256(kernel_reg); + kernel_reg_23 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = + _mm256_shuffle_epi8(kernel_reg_256, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi8(src_reg_12, src_reg_23); + + // Output from first half + res_reg_m1001_lo = _mm256_maddubs_epi16(src_reg_m1001_lo, kernel_reg_23); + res_reg_1223_lo = _mm256_maddubs_epi16(src_reg_1223_lo, kernel_reg_45); + res_reg_lo = _mm256_adds_epi16(res_reg_m1001_lo, res_reg_1223_lo); + + // Output from second half + res_reg_m1001_hi = _mm256_maddubs_epi16(src_reg_m1001_hi, kernel_reg_23); + res_reg_1223_hi = _mm256_maddubs_epi16(src_reg_1223_hi, kernel_reg_45); + res_reg_hi = _mm256_adds_epi16(res_reg_m1001_hi, res_reg_1223_hi); + + // Round the words + res_reg_lo = mm256_round_epi16(&res_reg_lo, ®_32, 6); + res_reg_hi = mm256_round_epi16(&res_reg_hi, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg_lo, res_reg_hi); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +static void 
vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + // Since avx2 allows us to use 256-bit buffer, we can do this two rows at a + // time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i dst_reg; + __m256i tmp_0, tmp_1; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, + 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m256i idx_shift_2 = + _mm256_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm256_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_reg = _mm256_adds_epi16(tmp_0, tmp_1); + + // Round the result + dst_reg = mm256_round_epi16(&dst_reg, ®_32, 6); + + // Finally combine to get the final dst + dst_reg = _mm256_packus_epi16(dst_reg, dst_reg); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &dst_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + __m128i dst_reg; + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + __m128i tmp_0, tmp_1; + + __m128i src_reg_shift_0 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_0)); + __m128i src_reg_shift_2 = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(idx_shift_2)); + + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, + _mm256_castsi256_si128(kernel_reg_23)); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, + _mm256_castsi256_si128(kernel_reg_45)); + dst_reg = _mm_adds_epi16(tmp_0, tmp_1); + + dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32, 6); + + dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i 
*)dst_ptr, dst_reg); + } +} + +static void vpx_filter_block1d8_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m256i res_reg_m1001, res_reg_1223; + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0302u)); + kernel_reg_45 = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi16(0x0504u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Output + res_reg_m1001 = _mm256_maddubs_epi16(src_reg_m1001, kernel_reg_23); + res_reg_1223 = _mm256_maddubs_epi16(src_reg_1223, kernel_reg_45); + res_reg = _mm256_adds_epi16(res_reg_m1001, res_reg_1223); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // 
Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. + // Since avx2 has 256-bit register, we can do 2 rows at a time. + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + int h; + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + + __m256i src_reg, src_reg_shuf; + __m256i dst; + __m256i shuf_idx = + _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, + 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + for (h = height; h > 1; h -= 2) { + // Load the source + src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); + + // Round result + dst = mm256_round_epi16(&dst, ®_32, 6); + + // Pack to 8-bits + dst = _mm256_packus_epi16(dst, _mm256_setzero_si256()); + + // Save + mm256_storeu2_epi32((__m128i *const)dst_ptr, + (__m128i *const)(dst_ptr + dst_stride), &dst); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + if (h > 0) { + // Load the source + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); + __m128i src_reg_shuf = + _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); + + // Get the result + __m128i dst = + _mm_maddubs_epi16(src_reg_shuf, _mm256_castsi256_si128(kernel_reg)); + dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); + + // Round result + dst = mm_round_epi16_sse2(&dst, ®_32, 6); + + // Pack to 8-bits + dst = _mm_packus_epi16(dst, _mm_setzero_si128()); + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst); + } +} + +static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[3,0] s[2,0] s[1,0] s[0,0] s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel to get partial output. + // Calling horizontal add then gives us the completely output + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. 
lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223, src_reg_m1012_1023; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg; + + // Result after multiply and add + __m256i res_reg; + + const __m256i reg_32 = _mm256_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_128 = _mm_srai_epi16(kernel_reg_128, 1); + kernel_reg_128 = _mm_packs_epi16(kernel_reg_128, kernel_reg_128); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg = _mm256_shuffle_epi8(kernel_reg, _mm256_set1_epi32(0x05040302u)); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi8(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi8(src_reg_12, src_reg_23); + + // Combine all the rows + src_reg_m1012_1023 = _mm256_unpacklo_epi16(src_reg_m1001, src_reg_1223); + + // Output + res_reg = _mm256_maddubs_epi16(src_reg_m1012_1023, kernel_reg); + res_reg = _mm256_hadds_epi16(res_reg, _mm256_setzero_si256()); + + // Round the words + res_reg = mm256_round_epi16(&res_reg, ®_32, 6); + + // Combine to get the result + res_reg = _mm256_packus_epi16(res_reg, res_reg); + + // Save the result + mm256_storeu2_epi32((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -376,6 +942,13 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; #define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 #define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 #define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 + +#define vpx_filter_block1d16_v4_avg_avx2 vpx_filter_block1d16_v8_avg_avx2 +#define vpx_filter_block1d16_h4_avg_avx2 vpx_filter_block1d16_h8_avg_avx2 +#define vpx_filter_block1d8_v4_avg_avx2 vpx_filter_block1d8_v8_avg_avx2 +#define vpx_filter_block1d8_h4_avg_avx2 vpx_filter_block1d8_h8_avg_avx2 +#define vpx_filter_block1d4_v4_avg_avx2 vpx_filter_block1d4_v8_avg_avx2 +#define vpx_filter_block1d4_h4_avg_avx2 vpx_filter_block1d4_h8_avg_avx2 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const InterpKernel *filter, int x0_q4, @@ -396,10 +969,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const 
InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4,
// int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
-FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
-FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
+            avx2, 0);
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
@@ -411,6 +986,6 @@ FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2);
// const InterpKernel *filter, int x0_q4,
// int32_t x_step_q4, int y0_q4, int y_step_q4,
// int w, int h);
-FUN_CONV_2D(, avx2);
-FUN_CONV_2D(avg_, avx2);
+FUN_CONV_2D(, avx2, 0);
+FUN_CONV_2D(avg_, avx2, 1);
#endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index e4f992780ff..63049c9342e 100644
--- a/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libs/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -12,20 +12,17 @@
#include <tmmintrin.h>
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_dsp/x86/convolve.h"
+#include "vpx_dsp/x86/convolve_sse2.h"
#include "vpx_dsp/x86/convolve_ssse3.h"
#include "vpx_dsp/x86/mem_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
-// These are reused by the avx2 intrinsics.
-// vpx_filter_block1d8_v8_intrin_ssse3()
-// vpx_filter_block1d8_h8_intrin_ssse3()
-// vpx_filter_block1d4_h8_intrin_ssse3()
-
static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
    const __m128i *const s, const int16_t *const filter) {
  __m128i f[4];
@@ -33,6 +30,23 @@ static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
  return convolve8_8_ssse3(s, f);
}
+// Used by the avx2 implementation.
+#if ARCH_X86_64
+// Use the intrinsics below
+filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
+#define vpx_filter_block1d4_h8_ssse3 vpx_filter_block1d4_h8_intrin_ssse3
+#define vpx_filter_block1d8_h8_ssse3 vpx_filter_block1d8_h8_intrin_ssse3
+#define vpx_filter_block1d8_v8_ssse3 vpx_filter_block1d8_v8_intrin_ssse3
+#else  // ARCH_X86
+// Use the assembly in vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm.
+filter8_1dfunction vpx_filter_block1d4_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_ssse3; +filter8_1dfunction vpx_filter_block1d8_v8_ssse3; +#endif + +#if ARCH_X86_64 void vpx_filter_block1d4_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { @@ -184,13 +198,490 @@ void vpx_filter_block1d8_v8_intrin_ssse3( output_ptr += out_pitch; } } +#endif // ARCH_X86_64 + +static void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into two registers in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add gives us + // first half of the output. Repeat again to get the second half of the + // output. Finally we shuffle again to combine the two outputs. + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shift_0, src_reg_shift_2; + __m128i dst_first, dst_second; + __m128i tmp_0, tmp_1; + __m128i idx_shift_0 = + _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + __m128i idx_shift_2 = + _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_first = _mm_adds_epi16(tmp_0, tmp_1); + + // Do again to get the second half of dst + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); + src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2); + + // Partial result for first half + tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23); + tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45); + dst_second = _mm_adds_epi16(tmp_0, tmp_1); + + // Round each result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); + + // Finally combine to get the final dst + dst_first = _mm_packus_epi16(dst_first, dst_second); + _mm_store_si128((__m128i *)dst_ptr, dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +static void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, + uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them 
into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // so that we can call multiply and add with the kernel to get 16-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_m10_hi, src_reg_01_lo, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_12_hi, src_reg_23_lo, src_reg_23_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo, res_reg_m1012_hi, res_reg_0123_hi; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); + + // Partial output from first half + res_reg_m10_lo = _mm_maddubs_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_maddubs_epi16(src_reg_01_lo, kernel_reg_23); + + res_reg_12_lo = _mm_maddubs_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_maddubs_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get first half of the results + res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); + + // Partial output for second half + res_reg_m10_hi = _mm_maddubs_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_maddubs_epi16(src_reg_01_hi, kernel_reg_23); + + res_reg_12_hi = _mm_maddubs_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_maddubs_epi16(src_reg_23_hi, kernel_reg_45); + + // Second half of the results + res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_lo = 
mm_round_epi16_sse2(&res_reg_m1012_lo, &reg_32, 6);
+    res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, &reg_32, 6);
+    res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, &reg_32, 6);
+    res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, &reg_32, 6);
+
+    // Combine to get the result
+    res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi);
+    res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, res_reg_0123_hi);
+
+    _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012);
+    _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m10_lo = src_reg_12_lo;
+    src_reg_m10_hi = src_reg_12_hi;
+    src_reg_01_lo = src_reg_23_lo;
+    src_reg_01_hi = src_reg_23_hi;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+static void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will cast the kernel from 16-bit words to 8-bit words, and then extract
+  // the middle four elements of the kernel into two registers in the form
+  // ... k[3] k[2] k[3] k[2]
+  // ... k[5] k[4] k[5] k[4]
+  // Then we shuffle the source into
+  // ... s[1] s[0] s[0] s[-1]
+  // ... s[3] s[2] s[2] s[1]
+  // Calling multiply and add gives us half of the sum. Calling add gives us
+  // first half of the output. Repeat again to get the second half of the
+  // output. Finally we shuffle again to combine the two outputs.
+
+  __m128i kernel_reg;                    // Kernel
+  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+  int h;
+
+  __m128i src_reg, src_reg_shift_0, src_reg_shift_2;
+  __m128i dst_first;
+  __m128i tmp_0, tmp_1;
+  __m128i idx_shift_0 =
+      _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+  __m128i idx_shift_2 =
+      _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10);
+
+  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
+  src_ptr -= 1;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u));
+  kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u));
+
+  for (h = height; h > 0; --h) {
+    // Load the source
+    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
+    src_reg_shift_0 = _mm_shuffle_epi8(src_reg, idx_shift_0);
+    src_reg_shift_2 = _mm_shuffle_epi8(src_reg, idx_shift_2);
+
+    // Get the result
+    tmp_0 = _mm_maddubs_epi16(src_reg_shift_0, kernel_reg_23);
+    tmp_1 = _mm_maddubs_epi16(src_reg_shift_2, kernel_reg_45);
+    dst_first = _mm_adds_epi16(tmp_0, tmp_1);
+
+    // Round the result
+    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);
+
+    // Pack to 8-bits
+    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
+    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+  }
+}
+
+static void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_stride, uint8_t *dst_ptr,
+                                         ptrdiff_t dst_stride, uint32_t height,
+                                         const int16_t *kernel) {
+  // We will load two rows of pixels as 8-bit words, rearrange them into the
+  // form
+  // ... s[0,1] s[-1,1] s[0,0] s[-1,0]
+  // so that we can call multiply and add with the kernel to get 16-bit words of
+  // the form
+  // ...
s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg_23 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0302u)); + kernel_reg_45 = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi16(0x0504u)); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi8(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi8(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi8(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi8(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_maddubs_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_maddubs_epi16(src_reg_01, kernel_reg_23); + + res_reg_12 = _mm_maddubs_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_maddubs_epi16(src_reg_23, kernel_reg_45); + + // Add to get entire output + res_reg_m1012 = _mm_adds_epi16(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, ®_32, 6); + res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, ®_32, 6); + + // Pack from 16-bit to 8-bit + res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128()); + res_reg_0123 = _mm_packus_epi16(res_reg_0123, _mm_setzero_si128()); + + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will cast the kernel from 16-bit words to 8-bit words, and then extract + // the middle four elements of the kernel into a single register in the form + // k[5:2] k[5:2] k[5:2] k[5:2] + // Then we shuffle the source into + // s[5:2] s[4:1] s[3:0] s[2:-1] + // Calling multiply and add gives us half of the sum next to each other. + // Calling horizontal add then gives us the output. 
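The maddubs-plus-hadds dataflow just described can be exercised in isolation. A self-contained SSSE3 sketch with toy pixels and taps follows; all names are ours, not libvpx's. With the symmetric taps {1, 2, 2, 1} it prints the four 4-tap sums 150 210 270 330.

    #include <stdint.h>
    #include <stdio.h>
    #include <tmmintrin.h>

    int main(void) {
      /* s[-1..6]: one sample of history, as in the functions above */
      const uint8_t src[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
      __m128i src_reg = _mm_loadl_epi64((const __m128i *)src);

      /* Gather s[2:-1], s[3:0], s[4:1], s[5:2] into one register */
      const __m128i shuf_idx =
          _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6);
      /* Taps {1, 2, 2, 1} replicated as bytes (little-endian dwords) */
      const __m128i kernel_reg = _mm_set1_epi32(0x01020201);

      __m128i pix = _mm_shuffle_epi8(src_reg, shuf_idx);
      __m128i madd = _mm_maddubs_epi16(pix, kernel_reg); /* pairwise u8*s8 */
      __m128i sums = _mm_hadds_epi16(madd, _mm_setzero_si128()); /* 4-tap sums */

      int16_t out[8];
      _mm_storeu_si128((__m128i *)out, sums);
      /* Expect 1*10+2*20+2*30+1*40 = 150, then 210, 270, 330 */
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
      return 0;
    }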
+ + __m128i kernel_reg; // Kernel + const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + int h; + + __m128i src_reg, src_reg_shuf; + __m128i dst_first; + __m128i shuf_idx = + _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm_srai_epi16(kernel_reg, 1); + kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg); + kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u)); + + for (h = height; h > 0; --h) { + // Load the source + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shuf = _mm_shuffle_epi8(src_reg, shuf_idx); + + // Get the result + dst_first = _mm_maddubs_epi16(src_reg_shuf, kernel_reg); + dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128()); + + // Round result + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + + // Pack to 8-bits + dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); + *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} +static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel) { + // We will load two rows of pixels as 8-bit words, rearrange them into the + // form + // ... s[2,0] s[1,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call horizontal add to get the output. + // Finally, we can add multiple rows together to get the desired output. + // This is done two rows at a time + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. 
+  __m128i src_reg_m10, src_reg_01;
+  __m128i src_reg_12, src_reg_23;
+  __m128i src_reg_m1001, src_reg_1223;
+  __m128i src_reg_m1012_1023_lo, src_reg_m1012_1023_hi;
+
+  __m128i kernel_reg;  // Kernel
+
+  // Result after multiply and add
+  __m128i reg_0, reg_1;
+
+  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
+
+  // We will compute the result two rows at a time
+  const ptrdiff_t src_stride_unrolled = src_stride << 1;
+  const ptrdiff_t dst_stride_unrolled = dst_stride << 1;
+  int h;
+
+  // Load Kernel
+  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
+  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
+  kernel_reg = _mm_packs_epi16(kernel_reg, kernel_reg);
+  kernel_reg = _mm_shuffle_epi8(kernel_reg, _mm_set1_epi32(0x05040302u));
+
+  // First shuffle the data
+  src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+  src_reg_m10 = _mm_unpacklo_epi32(src_reg_m1, src_reg_0);
+
+  // More shuffling
+  src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2));
+  src_reg_01 = _mm_unpacklo_epi32(src_reg_0, src_reg_1);
+
+  // Put three rows next to each other
+  src_reg_m1001 = _mm_unpacklo_epi8(src_reg_m10, src_reg_01);
+
+  for (h = height; h > 1; h -= 2) {
+    src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3));
+    src_reg_12 = _mm_unpacklo_epi32(src_reg_1, src_reg_2);
+
+    src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4));
+    src_reg_23 = _mm_unpacklo_epi32(src_reg_2, src_reg_3);
+
+    // Put three rows next to each other
+    src_reg_1223 = _mm_unpacklo_epi8(src_reg_12, src_reg_23);
+
+    // Put all four rows next to each other
+    src_reg_m1012_1023_lo = _mm_unpacklo_epi16(src_reg_m1001, src_reg_1223);
+    src_reg_m1012_1023_hi = _mm_unpackhi_epi16(src_reg_m1001, src_reg_1223);
+
+    // Get the results
+    reg_0 = _mm_maddubs_epi16(src_reg_m1012_1023_lo, kernel_reg);
+    reg_1 = _mm_maddubs_epi16(src_reg_m1012_1023_hi, kernel_reg);
+    reg_0 = _mm_hadds_epi16(reg_0, _mm_setzero_si128());
+    reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128());
+
+    // Round the words
+    reg_0 = mm_round_epi16_sse2(&reg_0, &reg_32, 6);
+    reg_1 = mm_round_epi16_sse2(&reg_1, &reg_32, 6);
+
+    // Pack from 16-bit to 8-bit and put them in the right order
+    reg_0 = _mm_packus_epi16(reg_0, reg_0);
+    reg_1 = _mm_packus_epi16(reg_1, reg_1);
+
+    // Save the result
+    *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+    *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+
+    // Update the source by two rows
+    src_ptr += src_stride_unrolled;
+    dst_ptr += dst_stride_unrolled;
+
+    src_reg_m1001 = src_reg_1223;
+    src_reg_1 = src_reg_3;
+  }
+}
+
+// From vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm
 filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
-filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
 filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
@@ -198,6 +689,15 @@ filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
 
+// Use the [vh]8 version because there is no [vh]4 implementation.
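+// (Presumably safe because a 4-tap kernel carries zeros in its outer taps,
+// so the 8-tap routine computes the same sums, merely reading a wider
+// source window.)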
+#define vpx_filter_block1d16_v4_avg_ssse3 vpx_filter_block1d16_v8_avg_ssse3 +#define vpx_filter_block1d16_h4_avg_ssse3 vpx_filter_block1d16_h8_avg_ssse3 +#define vpx_filter_block1d8_v4_avg_ssse3 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h4_avg_ssse3 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v4_avg_ssse3 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h4_avg_ssse3 vpx_filter_block1d4_h8_avg_ssse3 + +// From vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -231,10 +731,12 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, // int y_step_q4, int w, int h); -FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); -FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), , + ssse3, 0); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, + src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1); static void filter_horiz_w8_ssse3(const uint8_t *const src, const ptrdiff_t src_stride, @@ -571,7 +1073,7 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } } -// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, +// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, @@ -581,5 +1083,5 @@ void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // const InterpKernel *filter, int x0_q4, // int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_2D(, ssse3); -FUN_CONV_2D(avg_, ssse3); +FUN_CONV_2D(, ssse3, 0); +FUN_CONV_2D(avg_, ssse3, 1); diff --git a/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h b/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h index 2c259d322e6..5631130243b 100644 --- a/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h +++ b/libs/libvpx/vpx_mem/include/vpx_mem_intrnl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ -#define VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#ifndef VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#define VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ #include "./vpx_config.h" #define ADDRESS_STORAGE_SIZE sizeof(size_t) @@ -28,4 +28,4 @@ #define align_addr(addr, align) \ (void *)(((size_t)(addr) + ((align)-1)) & ~(size_t)((align)-1)) -#endif // VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ +#endif // VPX_VPX_MEM_INCLUDE_VPX_MEM_INTRNL_H_ diff --git a/libs/libvpx/vpx_mem/vpx_mem.c b/libs/libvpx/vpx_mem/vpx_mem.c index eeba34c373b..18abf1158b5 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.c +++ b/libs/libvpx/vpx_mem/vpx_mem.c @@ -16,12 +16,14 @@ #include "include/vpx_mem_intrnl.h" #include "vpx/vpx_integer.h" +#if !defined(VPX_MAX_ALLOCABLE_MEMORY) #if SIZE_MAX > (1ULL << 40) #define VPX_MAX_ALLOCABLE_MEMORY (1ULL << 40) #else // For 32-bit targets keep this below INT_MAX to avoid valgrind warnings. 
#define VPX_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16)) #endif +#endif // Returns 0 in case of overflow of nmemb * size. static int check_size_argument_overflow(uint64_t nmemb, uint64_t size) { diff --git a/libs/libvpx/vpx_mem/vpx_mem.h b/libs/libvpx/vpx_mem/vpx_mem.h index a4274b8856c..7689a05e6ea 100644 --- a/libs/libvpx/vpx_mem/vpx_mem.h +++ b/libs/libvpx/vpx_mem/vpx_mem.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_MEM_VPX_MEM_H_ -#define VPX_MEM_VPX_MEM_H_ +#ifndef VPX_VPX_MEM_VPX_MEM_H_ +#define VPX_VPX_MEM_VPX_MEM_H_ #include "vpx_config.h" #if defined(__uClinux__) @@ -49,4 +49,4 @@ static INLINE void *vpx_memset16(void *dest, int val, size_t length) { } #endif -#endif // VPX_MEM_VPX_MEM_H_ +#endif // VPX_VPX_MEM_VPX_MEM_H_ diff --git a/libs/libvpx/vpx_ports/arm.h b/libs/libvpx/vpx_ports/arm.h index 7be6104a4f5..6458a2c5b00 100644 --- a/libs/libvpx/vpx_ports/arm.h +++ b/libs/libvpx/vpx_ports/arm.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_ARM_H_ -#define VPX_PORTS_ARM_H_ +#ifndef VPX_VPX_PORTS_ARM_H_ +#define VPX_VPX_PORTS_ARM_H_ #include #include "vpx_config.h" @@ -36,4 +36,4 @@ int arm_cpu_caps(void); } // extern "C" #endif -#endif // VPX_PORTS_ARM_H_ +#endif // VPX_VPX_PORTS_ARM_H_ diff --git a/libs/libvpx/vpx_ports/asmdefs_mmi.h b/libs/libvpx/vpx_ports/asmdefs_mmi.h index a9a49745afd..28355bf9fbf 100644 --- a/libs/libvpx/vpx_ports/asmdefs_mmi.h +++ b/libs/libvpx/vpx_ports/asmdefs_mmi.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_ASMDEFS_MMI_H_ -#define VPX_PORTS_ASMDEFS_MMI_H_ +#ifndef VPX_VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_VPX_PORTS_ASMDEFS_MMI_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -78,4 +78,4 @@ #endif /* HAVE_MMI */ -#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */ +#endif // VPX_VPX_PORTS_ASMDEFS_MMI_H_ diff --git a/libs/libvpx/vpx_ports/bitops.h b/libs/libvpx/vpx_ports/bitops.h index 0ed7189ff65..5b2f31cd11e 100644 --- a/libs/libvpx/vpx_ports/bitops.h +++ b/libs/libvpx/vpx_ports/bitops.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_BITOPS_H_ -#define VPX_PORTS_BITOPS_H_ +#ifndef VPX_VPX_PORTS_BITOPS_H_ +#define VPX_VPX_PORTS_BITOPS_H_ #include @@ -72,4 +72,4 @@ static INLINE int get_msb(unsigned int n) { } // extern "C" #endif -#endif // VPX_PORTS_BITOPS_H_ +#endif // VPX_VPX_PORTS_BITOPS_H_ diff --git a/libs/libvpx/vpx_ports/emmintrin_compat.h b/libs/libvpx/vpx_ports/emmintrin_compat.h index 903534e0c0f..d6cc68ee4d2 100644 --- a/libs/libvpx/vpx_ports/emmintrin_compat.h +++ b/libs/libvpx/vpx_ports/emmintrin_compat.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_EMMINTRIN_COMPAT_H_ -#define VPX_PORTS_EMMINTRIN_COMPAT_H_ +#ifndef VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ +#define VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ #if defined(__GNUC__) && __GNUC__ < 4 /* From emmintrin.h (gcc 4.5.3) */ @@ -52,4 +52,4 @@ extern __inline __m128d } #endif -#endif // VPX_PORTS_EMMINTRIN_COMPAT_H_ +#endif // VPX_VPX_PORTS_EMMINTRIN_COMPAT_H_ diff --git a/libs/libvpx/vpx_ports/emms_mmx.asm b/libs/libvpx/vpx_ports/emms_mmx.asm new file mode 100644 index 00000000000..9f33590a285 --- /dev/null +++ b/libs/libvpx/vpx_ports/emms_mmx.asm @@ -0,0 +1,18 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +section .text +global sym(vpx_clear_system_state) PRIVATE +sym(vpx_clear_system_state): + emms + ret diff --git a/libs/libvpx/vpx_ports/config.h b/libs/libvpx/vpx_ports/emms_mmx.c similarity index 66% rename from libs/libvpx/vpx_ports/config.h rename to libs/libvpx/vpx_ports/emms_mmx.c index 3c1ab99f4aa..f1036b98edb 100644 --- a/libs/libvpx/vpx_ports/config.h +++ b/libs/libvpx/vpx_ports/emms_mmx.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,9 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_CONFIG_H_ -#define VPX_PORTS_CONFIG_H_ +#include -#include "vpx_config.h" +#include "vpx_ports/system_state.h" -#endif // VPX_PORTS_CONFIG_H_ +void vpx_clear_system_state() { _mm_empty(); } diff --git a/libs/libvpx/vpx_ports/emms.asm b/libs/libvpx/vpx_ports/float_control_word.asm similarity index 90% rename from libs/libvpx/vpx_ports/emms.asm rename to libs/libvpx/vpx_ports/float_control_word.asm index db8da287375..256dae08447 100644 --- a/libs/libvpx/vpx_ports/emms.asm +++ b/libs/libvpx/vpx_ports/float_control_word.asm @@ -12,11 +12,6 @@ %include "vpx_ports/x86_abi_support.asm" section .text -global sym(vpx_reset_mmx_state) PRIVATE -sym(vpx_reset_mmx_state): - emms - ret - %if LIBVPX_YASM_WIN64 global sym(vpx_winx64_fldcw) PRIVATE diff --git a/libs/libvpx/vpx_ports/mem.h b/libs/libvpx/vpx_ports/mem.h index bfef783b133..317c6dc0613 100644 --- a/libs/libvpx/vpx_ports/mem.h +++ b/libs/libvpx/vpx_ports/mem.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_H_ -#define VPX_PORTS_MEM_H_ +#ifndef VPX_VPX_PORTS_MEM_H_ +#define VPX_VPX_PORTS_MEM_H_ #include "vpx_config.h" #include "vpx/vpx_integer.h" @@ -51,4 +51,4 @@ #define VPX_WITH_ASAN 0 #endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__) -#endif // VPX_PORTS_MEM_H_ +#endif // VPX_VPX_PORTS_MEM_H_ diff --git a/libs/libvpx/vpx_ports/mem_ops.h b/libs/libvpx/vpx_ports/mem_ops.h index 343f27577c0..b17015e7ecf 100644 --- a/libs/libvpx/vpx_ports/mem_ops.h +++ b/libs/libvpx/vpx_ports/mem_ops.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MEM_OPS_H_ -#define VPX_PORTS_MEM_OPS_H_ +#ifndef VPX_VPX_PORTS_MEM_OPS_H_ +#define VPX_VPX_PORTS_MEM_OPS_H_ /* \file * \brief Provides portable memory access primitives @@ -224,5 +224,4 @@ static VPX_INLINE void mem_put_le32(void *vmem, MEM_VALUE_T val) { mem[3] = (MAU_T)((val >> 24) & 0xff); } /* clang-format on */ - -#endif // VPX_PORTS_MEM_OPS_H_ +#endif // VPX_VPX_PORTS_MEM_OPS_H_ diff --git a/libs/libvpx/vpx_ports/mem_ops_aligned.h b/libs/libvpx/vpx_ports/mem_ops_aligned.h index ccac391ba00..8649b87623b 100644 --- a/libs/libvpx/vpx_ports/mem_ops_aligned.h +++ b/libs/libvpx/vpx_ports/mem_ops_aligned.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_PORTS_MEM_OPS_ALIGNED_H_ -#define VPX_PORTS_MEM_OPS_ALIGNED_H_ +#ifndef VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ +#define VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ #include "vpx/vpx_integer.h" @@ -168,4 +168,4 @@ mem_put_le_aligned_generic(32) #undef swap_endian_32_se /* clang-format on */ -#endif // VPX_PORTS_MEM_OPS_ALIGNED_H_ +#endif // VPX_VPX_PORTS_MEM_OPS_ALIGNED_H_ diff --git a/libs/libvpx/vpx_ports/msvc.h b/libs/libvpx/vpx_ports/msvc.h index 3ff71474b3b..d58de3535a0 100644 --- a/libs/libvpx/vpx_ports/msvc.h +++ b/libs/libvpx/vpx_ports/msvc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_MSVC_H_ -#define VPX_PORTS_MSVC_H_ +#ifndef VPX_VPX_PORTS_MSVC_H_ +#define VPX_VPX_PORTS_MSVC_H_ #ifdef _MSC_VER #include "./vpx_config.h" @@ -29,4 +29,4 @@ static INLINE double round(double x) { #endif // _MSC_VER < 1800 #endif // _MSC_VER -#endif // VPX_PORTS_MSVC_H_ +#endif // VPX_VPX_PORTS_MSVC_H_ diff --git a/libs/libvpx/vpx_ports/ppc.h b/libs/libvpx/vpx_ports/ppc.h index ed29ef25b45..a11f4e8732e 100644 --- a/libs/libvpx/vpx_ports/ppc.h +++ b/libs/libvpx/vpx_ports/ppc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_PPC_H_ -#define VPX_PORTS_PPC_H_ +#ifndef VPX_VPX_PORTS_PPC_H_ +#define VPX_VPX_PORTS_PPC_H_ #include #include "./vpx_config.h" @@ -26,4 +26,4 @@ int ppc_simd_caps(void); } // extern "C" #endif -#endif // VPX_PORTS_PPC_H_ +#endif // VPX_VPX_PORTS_PPC_H_ diff --git a/libs/libvpx/vpx_ports/system_state.h b/libs/libvpx/vpx_ports/system_state.h index 086c64681f5..452cb5739bf 100644 --- a/libs/libvpx/vpx_ports/system_state.h +++ b/libs/libvpx/vpx_ports/system_state.h @@ -8,15 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_SYSTEM_STATE_H_ -#define VPX_PORTS_SYSTEM_STATE_H_ +#ifndef VPX_VPX_PORTS_SYSTEM_STATE_H_ +#define VPX_VPX_PORTS_SYSTEM_STATE_H_ #include "./vpx_config.h" -#if ARCH_X86 || ARCH_X86_64 -void vpx_reset_mmx_state(void); -#define vpx_clear_system_state() vpx_reset_mmx_state() +#ifdef __cplusplus +extern "C" { +#endif + +#if (ARCH_X86 || ARCH_X86_64) && HAVE_MMX +extern void vpx_clear_system_state(void); #else #define vpx_clear_system_state() -#endif // ARCH_X86 || ARCH_X86_64 -#endif // VPX_PORTS_SYSTEM_STATE_H_ +#endif // (ARCH_X86 || ARCH_X86_64) && HAVE_MMX + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_PORTS_SYSTEM_STATE_H_ diff --git a/libs/libvpx/vpx_ports/vpx_once.h b/libs/libvpx/vpx_ports/vpx_once.h index 7d9fc3b4063..4eb592b87e5 100644 --- a/libs/libvpx/vpx_ports/vpx_once.h +++ b/libs/libvpx/vpx_ports/vpx_once.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_VPX_ONCE_H_ -#define VPX_PORTS_VPX_ONCE_H_ +#ifndef VPX_VPX_PORTS_VPX_ONCE_H_ +#define VPX_VPX_PORTS_VPX_ONCE_H_ #include "vpx_config.h" @@ -137,4 +137,4 @@ static void once(void (*func)(void)) { } #endif -#endif // VPX_PORTS_VPX_ONCE_H_ +#endif // VPX_VPX_PORTS_VPX_ONCE_H_ diff --git a/libs/libvpx/vpx_ports/vpx_ports.mk b/libs/libvpx/vpx_ports/vpx_ports.mk index e17145e6cb8..aa9faf15ecf 100644 --- a/libs/libvpx/vpx_ports/vpx_ports.mk +++ b/libs/libvpx/vpx_ports/vpx_ports.mk @@ -17,8 +17,19 @@ PORTS_SRCS-yes += msvc.h PORTS_SRCS-yes += system_state.h PORTS_SRCS-yes += vpx_timer.h +ifeq ($(ARCH_X86),yes) +PORTS_SRCS-$(HAVE_MMX) += emms_mmx.c +endif +ifeq ($(ARCH_X86_64),yes) +# Visual Studio x64 does not support the _mm_empty() intrinsic. 
+PORTS_SRCS-$(HAVE_MMX) += emms_mmx.asm +endif + +ifeq ($(ARCH_X86_64),yes) +PORTS_SRCS-$(CONFIG_MSVS) += float_control_word.asm +endif + ifeq ($(ARCH_X86)$(ARCH_X86_64),yes) -PORTS_SRCS-yes += emms.asm PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif diff --git a/libs/libvpx/vpx_ports/vpx_timer.h b/libs/libvpx/vpx_ports/vpx_timer.h index 2083b4ece44..4934d5296a0 100644 --- a/libs/libvpx/vpx_ports/vpx_timer.h +++ b/libs/libvpx/vpx_ports/vpx_timer.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_VPX_TIMER_H_ -#define VPX_PORTS_VPX_TIMER_H_ +#ifndef VPX_VPX_PORTS_VPX_TIMER_H_ +#define VPX_VPX_PORTS_VPX_TIMER_H_ #include "./vpx_config.h" @@ -106,4 +106,4 @@ static INLINE int vpx_usec_timer_elapsed(struct vpx_usec_timer *t) { return 0; } #endif /* CONFIG_OS_SUPPORT */ -#endif // VPX_PORTS_VPX_TIMER_H_ +#endif // VPX_VPX_PORTS_VPX_TIMER_H_ diff --git a/libs/libvpx/vpx_ports/x86.h b/libs/libvpx/vpx_ports/x86.h index ced65ac058f..9b48a1f4c3a 100644 --- a/libs/libvpx/vpx_ports/x86.h +++ b/libs/libvpx/vpx_ports/x86.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_PORTS_X86_H_ -#define VPX_PORTS_X86_H_ +#ifndef VPX_VPX_PORTS_X86_H_ +#define VPX_VPX_PORTS_X86_H_ #include #if defined(_MSC_VER) @@ -161,7 +161,7 @@ static INLINE uint64_t xgetbv(void) { #define HAS_AVX2 0x080 #define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1u << n) +#define BIT(n) (1u << (n)) #endif static INLINE int x86_simd_caps(void) { @@ -223,11 +223,26 @@ static INLINE int x86_simd_caps(void) { return flags & mask; } -// Note: -// 32-bit CPU cycle counter is light-weighted for most function performance -// measurement. For large function (CPU time > a couple of seconds), 64-bit -// counter should be used. -// 32-bit CPU cycle counter +// Fine-Grain Measurement Functions +// +// If you are timing a small region of code, access the timestamp counter +// (TSC) via: +// +// unsigned int start = x86_tsc_start(); +// ... +// unsigned int end = x86_tsc_end(); +// unsigned int diff = end - start; +// +// The start/end functions introduce a few more instructions than using +// x86_readtsc directly, but prevent the CPU's out-of-order execution from +// affecting the measurement (by having earlier/later instructions be evaluated +// in the time interval). See the white paper, "How to Benchmark Code +// Execution Times on Intel® IA-32 and IA-64 Instruction Set Architectures" by +// Gabriele Paoloni for more information. +// +// If you are timing a large function (CPU time > a couple of seconds), use +// x86_readtsc64 to read the timestamp counter in a 64-bit integer. The +// out-of-order leakage that can occur is minimal compared to total runtime. static INLINE unsigned int x86_readtsc(void) { #if defined(__GNUC__) && __GNUC__ unsigned int tsc; @@ -264,6 +279,41 @@ static INLINE uint64_t x86_readtsc64(void) { #endif } +// 32-bit CPU cycle counter with a partial fence against out-of-order execution. 
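+// Unlike plain rdtsc, rdtscp waits for all prior instructions to retire
+// before reading the counter; later instructions may still begin early,
+// which is why x86_tsc_end() pairs it with a trailing cpuid.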
+static INLINE unsigned int x86_readtscp(void) { +#if defined(__GNUC__) && __GNUC__ + unsigned int tscp; + __asm__ __volatile__("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) + unsigned int tscp; + asm volatile("rdtscp\n\t" : "=a"(tscp) :); + return tscp; +#elif defined(_MSC_VER) + unsigned int ui; + return (unsigned int)__rdtscp(&ui); +#else +#if ARCH_X86_64 + return (unsigned int)__rdtscp(); +#else + __asm rdtscp; +#endif +#endif +} + +static INLINE unsigned int x86_tsc_start(void) { + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return x86_readtsc(); +} + +static INLINE unsigned int x86_tsc_end(void) { + uint32_t v = x86_readtscp(); + unsigned int reg_eax, reg_ebx, reg_ecx, reg_edx; + cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); + return v; +} + #if defined(__GNUC__) && __GNUC__ #define x86_pause_hint() __asm__ __volatile__("pause \n\t") #elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) @@ -313,14 +363,23 @@ static unsigned short x87_get_control_word(void) { static INLINE unsigned int x87_set_double_precision(void) { unsigned int mode = x87_get_control_word(); + // Intel 64 and IA-32 Architectures Developer's Manual: Vol. 1 + // https://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-1-manual.pdf + // 8.1.5.2 Precision Control Field + // Bits 8 and 9 (0x300) of the x87 FPU Control Word ("Precision Control") + // determine the number of bits used in floating point calculations. To match + // later SSE instructions restrict x87 operations to Double Precision (0x200). + // Precision PC Field + // Single Precision (24-Bits) 00B + // Reserved 01B + // Double Precision (53-Bits) 10B + // Extended Precision (64-Bits) 11B x87_set_control_word((mode & ~0x300) | 0x200); return mode; } -extern void vpx_reset_mmx_state(void); - #ifdef __cplusplus } // extern "C" #endif -#endif // VPX_PORTS_X86_H_ +#endif // VPX_VPX_PORTS_X86_H_ diff --git a/libs/libvpx/vpx_scale/generic/gen_scalers.c b/libs/libvpx/vpx_scale/generic/gen_scalers.c index b554a56e832..d8db4b35475 100644 --- a/libs/libvpx/vpx_scale/generic/gen_scalers.c +++ b/libs/libvpx/vpx_scale/generic/gen_scalers.c @@ -12,8 +12,8 @@ #include "vpx_scale/vpx_scale.h" #include "vpx_mem/vpx_mem.h" /**************************************************************************** -* Imports -****************************************************************************/ + * Imports + ****************************************************************************/ /**************************************************************************** * diff --git a/libs/libvpx/vpx_scale/generic/vpx_scale.c b/libs/libvpx/vpx_scale/generic/vpx_scale.c index 20e1ff90fd5..958bb320fc4 100644 --- a/libs/libvpx/vpx_scale/generic/vpx_scale.c +++ b/libs/libvpx/vpx_scale/generic/vpx_scale.c @@ -17,8 +17,8 @@ ***************************************************************************/ /**************************************************************************** -* Header Files -****************************************************************************/ + * Header Files + ****************************************************************************/ #include "./vpx_scale_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" diff --git a/libs/libvpx/vpx_scale/generic/yv12config.c b/libs/libvpx/vpx_scale/generic/yv12config.c index 9c7ca42c78b..eee291c30d8 100644 --- 
a/libs/libvpx/vpx_scale/generic/yv12config.c +++ b/libs/libvpx/vpx_scale/generic/yv12config.c @@ -15,9 +15,12 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#if defined(VPX_MAX_ALLOCABLE_MEMORY) +#include "vp9/common/vp9_onyxc_int.h" +#endif // VPX_MAX_ALLOCABLE_MEMORY /**************************************************************************** -* Exports -****************************************************************************/ + * Exports + ****************************************************************************/ /**************************************************************************** * @@ -54,13 +57,21 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int uv_width = aligned_width >> 1; int uv_height = aligned_height >> 1; /** There is currently a bunch of code which assumes - * uv_stride == y_stride/2, so enforce this here. */ + * uv_stride == y_stride/2, so enforce this here. */ int uv_stride = y_stride >> 1; int uvplane_size = (uv_height + border) * uv_stride; - const int frame_size = yplane_size + 2 * uvplane_size; + const size_t frame_size = yplane_size + 2 * uvplane_size; if (!ybf->buffer_alloc) { ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size); +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) + // This memset is needed for fixing the issue of using uninitialized + // value in msan test. It will cause a perf loss, so only do this for + // msan test. + memset(ybf->buffer_alloc, 0, frame_size); +#endif +#endif ybf->buffer_alloc_sz = frame_size; } @@ -142,6 +153,17 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int border, int byte_alignment, vpx_codec_frame_buffer_t *fb, vpx_get_frame_buffer_cb_fn_t cb, void *cb_priv) { +#if CONFIG_SIZE_LIMIT + if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return -1; +#endif + + /* Only support allocating buffers that have a border that's a multiple + * of 32. The border restriction is required to get 16-byte alignment of + * the start of the chroma rows without introducing an arbitrary gap + * between planes, which would break the semantics of things like + * vpx_img_set_rect(). */ + if (border & 0x1f) return -3; + if (ybf) { const int vp9_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const int aligned_width = (width + 7) & ~7; @@ -166,9 +188,16 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; - // frame_size is stored in buffer_alloc_sz, which is an int. If it won't +#if defined(VPX_MAX_ALLOCABLE_MEMORY) + // The decoder may allocate REF_FRAMES frame buffers in the frame buffer + // pool. Bound the total amount of allocated memory as if these REF_FRAMES + // frame buffers were allocated in a single allocation. + if (frame_size > VPX_MAX_ALLOCABLE_MEMORY / REF_FRAMES) return -1; +#endif // VPX_MAX_ALLOCABLE_MEMORY + + // frame_size is stored in buffer_alloc_sz, which is a size_t. If it won't // fit, fail early. - if (frame_size > INT_MAX) { + if (frame_size > SIZE_MAX) { return -1; } @@ -192,18 +221,19 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, // This memset is needed for fixing the issue of using uninitialized // value in msan test. It will cause a perf loss, so only do this for // msan test. 
- memset(ybf->buffer_alloc, 0, (int)frame_size); + memset(ybf->buffer_alloc, 0, (size_t)frame_size); #endif #endif - } else if (frame_size > (size_t)ybf->buffer_alloc_sz) { + } else if (frame_size > ybf->buffer_alloc_sz) { // Allocation to hold larger frame, or first allocation. vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; + ybf->buffer_alloc_sz = 0; ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; - ybf->buffer_alloc_sz = (int)frame_size; + ybf->buffer_alloc_sz = (size_t)frame_size; // This memset is needed for fixing valgrind error from C loop filter // due to access uninitialized memory in frame border. It could be @@ -211,13 +241,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, memset(ybf->buffer_alloc, 0, ybf->buffer_alloc_sz); } - /* Only support allocating buffers that have a border that's a multiple - * of 32. The border restriction is required to get 16-byte alignment of - * the start of the chroma rows without introducing an arbitrary gap - * between planes, which would break the semantics of things like - * vpx_img_set_rect(). */ - if (border & 0x1f) return -3; - ybf->y_crop_width = width; ybf->y_crop_height = height; ybf->y_width = aligned_width; @@ -231,7 +254,7 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, ybf->uv_stride = uv_stride; ybf->border = border; - ybf->frame_size = (int)frame_size; + ybf->frame_size = (size_t)frame_size; ybf->subsampling_x = ss_x; ybf->subsampling_y = ss_y; diff --git a/libs/libvpx/vpx_scale/vpx_scale.h b/libs/libvpx/vpx_scale/vpx_scale.h index 478a4834610..fd5ba7ccdc1 100644 --- a/libs/libvpx/vpx_scale/vpx_scale.h +++ b/libs/libvpx/vpx_scale/vpx_scale.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_SCALE_VPX_SCALE_H_ -#define VPX_SCALE_VPX_SCALE_H_ +#ifndef VPX_VPX_SCALE_VPX_SCALE_H_ +#define VPX_VPX_SCALE_VPX_SCALE_H_ #include "vpx_scale/yv12config.h" @@ -19,4 +19,4 @@ extern void vpx_scale_frame(YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, unsigned int vscale, unsigned int vratio, unsigned int interlaced); -#endif // VPX_SCALE_VPX_SCALE_H_ +#endif // VPX_VPX_SCALE_VPX_SCALE_H_ diff --git a/libs/libvpx/vpx_scale/yv12config.h b/libs/libvpx/vpx_scale/yv12config.h index b9b33621449..2cf18217f60 100644 --- a/libs/libvpx/vpx_scale/yv12config.h +++ b/libs/libvpx/vpx_scale/yv12config.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_SCALE_YV12CONFIG_H_ -#define VPX_SCALE_YV12CONFIG_H_ +#ifndef VPX_VPX_SCALE_YV12CONFIG_H_ +#define VPX_VPX_SCALE_YV12CONFIG_H_ #ifdef __cplusplus extern "C" { @@ -49,9 +49,9 @@ typedef struct yv12_buffer_config { uint8_t *alpha_buffer; uint8_t *buffer_alloc; - int buffer_alloc_sz; + size_t buffer_alloc_sz; int border; - int frame_size; + size_t frame_size; int subsampling_x; int subsampling_y; unsigned int bit_depth; @@ -100,4 +100,4 @@ int vpx_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); } #endif -#endif // VPX_SCALE_YV12CONFIG_H_ +#endif // VPX_VPX_SCALE_YV12CONFIG_H_ diff --git a/libs/libvpx/vpx_util/endian_inl.h b/libs/libvpx/vpx_util/endian_inl.h index dc387740958..1b6ef56c695 100644 --- a/libs/libvpx/vpx_util/endian_inl.h +++ b/libs/libvpx/vpx_util/endian_inl.h @@ -9,8 +9,8 @@ // // Endian related functions. 
-#ifndef VPX_UTIL_ENDIAN_INL_H_ -#define VPX_UTIL_ENDIAN_INL_H_ +#ifndef VPX_VPX_UTIL_ENDIAN_INL_H_ +#define VPX_VPX_UTIL_ENDIAN_INL_H_ #include #include "./vpx_config.h" @@ -115,4 +115,4 @@ static INLINE uint64_t BSwap64(uint64_t x) { #endif // HAVE_BUILTIN_BSWAP64 } -#endif // VPX_UTIL_ENDIAN_INL_H_ +#endif // VPX_VPX_UTIL_ENDIAN_INL_H_ diff --git a/libs/libvpx/vpx_util/vpx_atomics.h b/libs/libvpx/vpx_util/vpx_atomics.h index b8cf80daeb5..b06a8dce347 100644 --- a/libs/libvpx/vpx_util/vpx_atomics.h +++ b/libs/libvpx/vpx_util/vpx_atomics.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_UTIL_VPX_ATOMICS_H_ -#define VPX_UTIL_VPX_ATOMICS_H_ +#ifndef VPX_VPX_UTIL_VPX_ATOMICS_H_ +#define VPX_VPX_UTIL_VPX_ATOMICS_H_ #include "./vpx_config.h" @@ -68,7 +68,9 @@ extern "C" { // on any platform (to discourage programmer errors by setting values directly). // This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT // (NOT memset) and accessed through vpx_atomic_ functions. -typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; +typedef struct vpx_atomic_int { + volatile int value; +} vpx_atomic_int; #define VPX_ATOMIC_INIT(num) \ { num } @@ -106,4 +108,4 @@ static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { } // extern "C" #endif // __cplusplus -#endif // VPX_UTIL_VPX_ATOMICS_H_ +#endif // VPX_VPX_UTIL_VPX_ATOMICS_H_ diff --git a/libs/libvpx/vpx_util/vpx_debug_util.c b/libs/libvpx/vpx_util/vpx_debug_util.c new file mode 100644 index 00000000000..3ce4065ba50 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_debug_util.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include +#include +#include "vpx_util/vpx_debug_util.h" + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +static int frame_idx_w = 0; +static int frame_idx_r = 0; + +void bitstream_queue_set_frame_write(int frame_idx) { frame_idx_w = frame_idx; } + +int bitstream_queue_get_frame_write(void) { return frame_idx_w; } + +void bitstream_queue_set_frame_read(int frame_idx) { frame_idx_r = frame_idx; } + +int bitstream_queue_get_frame_read(void) { return frame_idx_r; } +#endif + +#if CONFIG_BITSTREAM_DEBUG +#define QUEUE_MAX_SIZE 2000000 +static int result_queue[QUEUE_MAX_SIZE]; +static int prob_queue[QUEUE_MAX_SIZE]; + +static int queue_r = 0; +static int queue_w = 0; +static int queue_prev_w = -1; +static int skip_r = 0; +static int skip_w = 0; +void bitstream_queue_set_skip_write(int skip) { skip_w = skip; } + +void bitstream_queue_set_skip_read(int skip) { skip_r = skip; } + +void bitstream_queue_record_write(void) { queue_prev_w = queue_w; } + +void bitstream_queue_reset_write(void) { queue_w = queue_prev_w; } + +int bitstream_queue_get_write(void) { return queue_w; } + +int bitstream_queue_get_read(void) { return queue_r; } + +void bitstream_queue_pop(int *result, int *prob) { + if (!skip_r) { + if (queue_w == queue_r) { + printf("buffer underflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + *result = result_queue[queue_r]; + *prob = prob_queue[queue_r]; + queue_r = (queue_r + 1) % QUEUE_MAX_SIZE; + } +} + +void bitstream_queue_push(int result, const int prob) { + if (!skip_w) { + result_queue[queue_w] = result; + prob_queue[queue_w] = prob; + queue_w = (queue_w + 1) % QUEUE_MAX_SIZE; + if (queue_w == queue_r) { + printf("buffer overflow queue_w %d queue_r %d\n", queue_w, queue_r); + assert(0); + } + } +} +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +static int frame_buf_idx_r = 0; +static int frame_buf_idx_w = 0; +#define MAX_FRAME_BUF_NUM 20 +#define MAX_FRAME_STRIDE 1920 +#define MAX_FRAME_HEIGHT 1080 +static uint16_t + frame_pre[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction only +static uint16_t + frame_tx[MAX_FRAME_BUF_NUM][3] + [MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT]; // prediction + txfm +static int frame_stride = MAX_FRAME_STRIDE; +static int frame_height = MAX_FRAME_HEIGHT; +static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; +void mismatch_move_frame_idx_w(void) { + frame_buf_idx_w = (frame_buf_idx_w + 1) % MAX_FRAME_BUF_NUM; + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf overflow\n"); + assert(0); + } +} + +void mismatch_reset_frame(int num_planes) { + int plane; + for (plane = 0; plane < num_planes; ++plane) { + memset(frame_pre[frame_buf_idx_w][plane], 0, + sizeof(frame_pre[frame_buf_idx_w][plane][0]) * frame_size); + memset(frame_tx[frame_buf_idx_w][plane], 0, + sizeof(frame_tx[frame_buf_idx_w][plane][0]) * frame_size); + } +} + +void mismatch_move_frame_idx_r(void) { + if (frame_buf_idx_w == frame_buf_idx_r) { + printf("frame_buf underflow\n"); + assert(0); + } + frame_buf_idx_r = (frame_buf_idx_r + 1) % MAX_FRAME_BUF_NUM; +} + +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? 
CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_pre[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_pre frame_idx %d plane %d pixel_c %d pixel_r %d blk_w" + " %d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + frame_tx[frame_buf_idx_w][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] = + src16 ? src16[r * src_stride + c] : src[r * src_stride + c]; + } + } +#if 0 + { + int ref_frame_idx = 3; + int ref_plane = 1; + int ref_pixel_c = 162; + int ref_pixel_r = 16; + if (frame_idx_w == ref_frame_idx && plane == ref_plane && + ref_pixel_c >= pixel_c && ref_pixel_c < pixel_c + blk_w && + ref_pixel_r >= pixel_r && ref_pixel_r < pixel_r + blk_h) { + printf( + "\nrecord_block_tx frame_idx %d plane %d pixel_c %d pixel_r %d blk_w " + "%d blk_h %d\n", + frame_idx_w, plane, pixel_c, pixel_r, blk_w, blk_h); + } + } +#endif +} +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_pre[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_pre failed frame_idx %d plane %d " + "pixel_c %d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_pre[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd) { + const uint16_t *src16 = highbd ? 
CONVERT_TO_SHORTPTR(src) : NULL; + int mismatch = 0; + int r, c; + if (pixel_c + blk_w >= frame_stride || pixel_r + blk_h >= frame_height) { + printf("frame_buf undersized\n"); + assert(0); + } + + for (r = 0; r < blk_h; ++r) { + for (c = 0; c < blk_w; ++c) { + if (frame_tx[frame_buf_idx_r][plane] + [(r + pixel_r) * frame_stride + c + pixel_c] != + (uint16_t)(src16 ? src16[r * src_stride + c] + : src[r * src_stride + c])) { + mismatch = 1; + } + } + } + if (mismatch) { + int rr, cc; + printf( + "\ncheck_block_tx failed frame_idx %d plane %d pixel_c " + "%d pixel_r " + "%d blk_w %d blk_h %d\n", + frame_idx_r, plane, pixel_c, pixel_r, blk_w, blk_h); + printf("enc\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", frame_tx[frame_buf_idx_r][plane] + [(rr + pixel_r) * frame_stride + cc + pixel_c]); + } + printf("\n"); + } + + printf("dec\n"); + for (rr = 0; rr < blk_h; ++rr) { + for (cc = 0; cc < blk_w; ++cc) { + printf("%d ", + src16 ? src16[rr * src_stride + cc] : src[rr * src_stride + cc]); + } + printf("\n"); + } + assert(0); + } +} +#endif // CONFIG_MISMATCH_DEBUG diff --git a/libs/libvpx/vpx_util/vpx_debug_util.h b/libs/libvpx/vpx_util/vpx_debug_util.h new file mode 100644 index 00000000000..df1a1aab2cb --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_debug_util.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ +#define VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ + +#include "./vpx_config.h" + +#include "vpx_dsp/prob.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG +void bitstream_queue_set_frame_write(int frame_idx); +int bitstream_queue_get_frame_write(void); +void bitstream_queue_set_frame_read(int frame_idx); +int bitstream_queue_get_frame_read(void); +#endif + +#if CONFIG_BITSTREAM_DEBUG +/* This is a debug tool used to detect bitstream error. On encoder side, it + * pushes each bit and probability into a queue before the bit is written into + * the Arithmetic coder. On decoder side, whenever a bit is read out from the + * Arithmetic coder, it pops out the reference bit and probability from the + * queue as well. If the two results do not match, this debug tool will report + * an error. This tool can be used to pin down the bitstream error precisely. + * By combining gdb's backtrace method, we can detect which module causes the + * bitstream error. 
*/ +int bitstream_queue_get_write(void); +int bitstream_queue_get_read(void); +void bitstream_queue_record_write(void); +void bitstream_queue_reset_write(void); +void bitstream_queue_pop(int *result, int *prob); +void bitstream_queue_push(int result, const int prob); +void bitstream_queue_set_skip_write(int skip); +void bitstream_queue_set_skip_read(int skip); +#endif // CONFIG_BITSTREAM_DEBUG + +#if CONFIG_MISMATCH_DEBUG +void mismatch_move_frame_idx_w(void); +void mismatch_move_frame_idx_r(void); +void mismatch_reset_frame(int num_planes); +void mismatch_record_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_record_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_pre(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +void mismatch_check_block_tx(const uint8_t *src, int src_stride, int plane, + int pixel_c, int pixel_r, int blk_w, int blk_h, + int highbd); +#endif // CONFIG_MISMATCH_DEBUG + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_UTIL_VPX_DEBUG_UTIL_H_ diff --git a/libs/libvpx/vpx_util/vpx_thread.h b/libs/libvpx/vpx_util/vpx_thread.h index 53a5f4966a4..6d308e949b1 100644 --- a/libs/libvpx/vpx_util/vpx_thread.h +++ b/libs/libvpx/vpx_util/vpx_thread.h @@ -12,8 +12,8 @@ // Original source: // https://chromium.googlesource.com/webm/libwebp -#ifndef VPX_THREAD_H_ -#define VPX_THREAD_H_ +#ifndef VPX_VPX_UTIL_VPX_THREAD_H_ +#define VPX_VPX_UTIL_VPX_THREAD_H_ #include "./vpx_config.h" @@ -159,6 +159,23 @@ static INLINE int pthread_cond_init(pthread_cond_t *const condition, return 0; } +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + int ok = 1; +#ifdef USE_WINDOWS_CONDITION_VARIABLE + WakeAllConditionVariable(condition); +#else + while (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok &= SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. + ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } +#endif + return !ok; +} + static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { int ok = 1; #ifdef USE_WINDOWS_CONDITION_VARIABLE @@ -194,6 +211,7 @@ static INLINE int pthread_cond_wait(pthread_cond_t *const condition, #endif return !ok; } + #elif defined(__OS2__) #define INCL_DOS #include // NOLINT @@ -202,6 +220,11 @@ static INLINE int pthread_cond_wait(pthread_cond_t *const condition, #include // NOLINT #include // NOLINT +#if defined(__STRICT_ANSI__) +// _beginthread() is not declared on __STRICT_ANSI__ mode. Declare here. +int _beginthread(void (*)(void *), void *, unsigned, void *); +#endif + #define pthread_t TID #define pthread_mutex_t HMTX @@ -412,4 +435,4 @@ const VPxWorkerInterface *vpx_get_worker_interface(void); } // extern "C" #endif -#endif // VPX_THREAD_H_ +#endif // VPX_VPX_UTIL_VPX_THREAD_H_ diff --git a/libs/libvpx/vpx_util/vpx_timestamp.h b/libs/libvpx/vpx_util/vpx_timestamp.h new file mode 100644 index 00000000000..c210de5e537 --- /dev/null +++ b/libs/libvpx/vpx_util/vpx_timestamp.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VPX_UTIL_VPX_TIMESTAMP_H_ +#define VPX_VPX_UTIL_VPX_TIMESTAMP_H_ + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// Rational Number with an int64 numerator +typedef struct vpx_rational64 { + int64_t num; // fraction numerator + int den; // fraction denominator +} vpx_rational64_t; // alias for struct vpx_rational64_t + +static INLINE int gcd(int64_t a, int b) { + int r; // remainder + while (b > 0) { + r = (int)(a % b); + a = b; + b = r; + } + + return (int)a; +} + +static INLINE void reduce_ratio(vpx_rational64_t *ratio) { + const int denom = gcd(ratio->num, ratio->den); + ratio->num /= denom; + ratio->den /= denom; +} + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_VPX_UTIL_VPX_TIMESTAMP_H_ diff --git a/libs/libvpx/vpx_util/vpx_util.mk b/libs/libvpx/vpx_util/vpx_util.mk index 86d3ece3c89..11627149569 100644 --- a/libs/libvpx/vpx_util/vpx_util.mk +++ b/libs/libvpx/vpx_util/vpx_util.mk @@ -15,3 +15,6 @@ UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h UTIL_SRCS-yes += vpx_write_yuv_frame.h UTIL_SRCS-yes += vpx_write_yuv_frame.c +UTIL_SRCS-yes += vpx_timestamp.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.h +UTIL_SRCS-$(or $(CONFIG_BITSTREAM_DEBUG),$(CONFIG_MISMATCH_DEBUG)) += vpx_debug_util.c diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.c b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c index ab685581157..4ef57a2fee0 100644 --- a/libs/libvpx/vpx_util/vpx_write_yuv_frame.c +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -13,7 +13,7 @@ void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { #if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ - defined(OUTPUT_YUV_SKINMAP) + defined(OUTPUT_YUV_SKINMAP) || defined(OUTPUT_YUV_SVC_SRC) unsigned char *src = s->y_buffer; int h = s->y_crop_height; diff --git a/libs/libvpx/vpx_util/vpx_write_yuv_frame.h b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h index 1cb7029817e..ce1102458ed 100644 --- a/libs/libvpx/vpx_util/vpx_write_yuv_frame.h +++ b/libs/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ -#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#ifndef VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ #include #include "vpx_scale/yv12config.h" @@ -24,4 +24,4 @@ void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); } // extern "C" #endif -#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#endif // VPX_VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/libs/libvpx/vpxdec.c b/libs/libvpx/vpxdec.c index ff20e6a3c9b..c60eb5c30ba 100644 --- a/libs/libvpx/vpxdec.c +++ b/libs/libvpx/vpxdec.c @@ -98,20 +98,41 @@ static const arg_def_t svcdecodingarg = ARG_DEF( NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); static const arg_def_t framestatsarg = ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); - -static const arg_def_t *all_args[] = { - &help, &codecarg, &use_yv12, - &use_i420, &flipuvarg, &rawvideo, - &noblitarg, &progressarg, &limitarg, - &skiparg, &postprocarg, &summaryarg, - &outputfile, &threadsarg, &frameparallelarg, - &verbosearg, &scalearg, &fb_arg, - &md5arg, &error_concealment, &continuearg, +static const arg_def_t rowmtarg = + ARG_DEF(NULL, "row-mt", 1, "Enable multi-threading to run row-wise in VP9"); +static const arg_def_t lpfoptarg = + ARG_DEF(NULL, "lpf-opt", 1, + "Do loopfilter without waiting for all threads to sync."); + +static const arg_def_t *all_args[] = { &help, + &codecarg, + &use_yv12, + &use_i420, + &flipuvarg, + &rawvideo, + &noblitarg, + &progressarg, + &limitarg, + &skiparg, + &postprocarg, + &summaryarg, + &outputfile, + &threadsarg, + &frameparallelarg, + &verbosearg, + &scalearg, + &fb_arg, + &md5arg, + &error_concealment, + &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - &svcdecodingarg, &framestatsarg, NULL -}; + &svcdecodingarg, + &framestatsarg, + &rowmtarg, + &lpfoptarg, + NULL }; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -154,7 +175,7 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif -void show_help(FILE *fout, int shorthelp) { +static void show_help(FILE *fout, int shorthelp) { int i; fprintf(fout, "Usage: %s filename\n\n", exec_name); @@ -238,13 +259,14 @@ static int raw_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, return 1; } *bytes_read = frame_size; + return 0; } - return 0; + return 1; } -static int read_frame(struct VpxDecInputContext *input, uint8_t **buf, - size_t *bytes_in_buffer, size_t *buffer_size) { +static int dec_read_frame(struct VpxDecInputContext *input, uint8_t **buf, + size_t *bytes_in_buffer, size_t *buffer_size) { switch (input->vpx_input_ctx->file_type) { #if CONFIG_WEBM_IO case FILE_TYPE_WEBM: @@ -506,6 +528,8 @@ static int main_loop(int argc, const char **argv_) { int arg_skip = 0; int ec_enabled = 0; int keep_going = 0; + int enable_row_mt = 0; + int enable_lpf_opt = 0; const VpxInterface *interface = NULL; const VpxInterface *fourcc_interface = NULL; uint64_t dx_time = 0; @@ -628,6 +652,10 @@ static int main_loop(int argc, const char **argv_) { die("Error: Could not open --framestats file (%s) for writing.\n", arg.val); } + } else if (arg_match(&arg, &rowmtarg, argi)) { + enable_row_mt = arg_parse_uint(&arg); + } else if (arg_match(&arg, &lpfoptarg, argi)) { + enable_lpf_opt = arg_parse_uint(&arg); } #if CONFIG_VP8_DECODER else if (arg_match(&arg, &addnoise_level, argi)) { @@ -753,6 +781,18 @@ static int main_loop(int argc, const char **argv_) { goto fail; } } + if 
(interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_ROW_MT, enable_row_mt)) { + fprintf(stderr, "Failed to set decoder in row multi-thread mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + if (interface->fourcc == VP9_FOURCC && + vpx_codec_control(&decoder, VP9D_SET_LOOP_FILTER_OPT, enable_lpf_opt)) { + fprintf(stderr, "Failed to set decoder in optimized loopfilter mode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER @@ -766,7 +806,7 @@ static int main_loop(int argc, const char **argv_) { if (arg_skip) fprintf(stderr, "Skipping first %d frames.\n", arg_skip); while (arg_skip) { - if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; + if (dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) break; arg_skip--; } @@ -797,7 +837,7 @@ static int main_loop(int argc, const char **argv_) { frame_avail = 0; if (!stop_after || frame_in < stop_after) { - if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { + if (!dec_read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) { frame_avail = 1; frame_in++; diff --git a/libs/libvpx/vpxenc.c b/libs/libvpx/vpxenc.c index 4db7eccc351..50c36bedd54 100644 --- a/libs/libvpx/vpxenc.c +++ b/libs/libvpx/vpxenc.c @@ -50,12 +50,6 @@ #endif #include "./y4minput.h" -/* Swallow warnings about unused results of fread/fwrite */ -static size_t wrap_fread(void *ptr, size_t size, size_t nmemb, FILE *stream) { - return fread(ptr, size, nmemb, stream); -} -#define fread wrap_fread - static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) { return fwrite(ptr, size, nmemb, stream); @@ -95,34 +89,6 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal, va_end(ap); } -static int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) { - FILE *f = input_ctx->file; - y4m_input *y4m = &input_ctx->y4m; - int shortread = 0; - - if (input_ctx->file_type == FILE_TYPE_Y4M) { - if (y4m_input_fetch_frame(y4m, f, img) < 1) return 0; - } else { - shortread = read_yuv_frame(input_ctx, img); - } - - return !shortread; -} - -static int file_is_y4m(const char detect[4]) { - if (memcmp(detect, "YUV4", 4) == 0) { - return 1; - } - return 0; -} - -static int fourcc_is_ivf(const char detect[4]) { - if (memcmp(detect, "DKIF", 4) == 0) { - return 1; - } - return 0; -} - static const arg_def_t help = ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = @@ -326,9 +292,9 @@ static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); static const arg_def_t corpus_complexity = ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); -static const arg_def_t *rc_twopass_args[] = { - &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL -}; +static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, + &maxsection_pct, + &corpus_complexity, NULL }; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -342,19 +308,19 @@ static const arg_def_t *kf_args[] = { &kf_min_dist, &kf_max_dist, &kf_disabled, static const arg_def_t noise_sens = ARG_DEF(NULL, "noise-sensitivity", 1, "Noise sensitivity (frames to blur)"); static const arg_def_t sharpness = - ARG_DEF(NULL, "sharpness", 1, "Loop filter sharpness (0..7)"); + ARG_DEF(NULL, "sharpness", 1, + "Increase sharpness at the expense of lower PSNR. 
(0..7)"); static const arg_def_t static_thresh = ARG_DEF(NULL, "static-thresh", 1, "Motion detection threshold"); -static const arg_def_t auto_altref = - ARG_DEF(NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames"); static const arg_def_t arnr_maxframes = ARG_DEF(NULL, "arnr-maxframes", 1, "AltRef max frames (0..15)"); static const arg_def_t arnr_strength = ARG_DEF(NULL, "arnr-strength", 1, "AltRef filter strength (0..6)"); -static const arg_def_t arnr_type = ARG_DEF(NULL, "arnr-type", 1, "AltRef type"); -static const struct arg_enum_list tuning_enum[] = { - { "psnr", VP8_TUNE_PSNR }, { "ssim", VP8_TUNE_SSIM }, { NULL, 0 } -}; +static const arg_def_t arnr_type = + ARG_DEF(NULL, "arnr-type", 1, "AltRef filter type (1..3)"); +static const struct arg_enum_list tuning_enum[] = { { "psnr", VP8_TUNE_PSNR }, + { "ssim", VP8_TUNE_SSIM }, + { NULL, 0 } }; static const arg_def_t tune_ssim = ARG_DEF_ENUM(NULL, "tune", 1, "Material to favor", tuning_enum); static const arg_def_t cq_level = @@ -367,12 +333,14 @@ static const arg_def_t gf_cbr_boost_pct = ARG_DEF( #if CONFIG_VP8_ENCODER static const arg_def_t cpu_used_vp8 = ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-16..16)"); +static const arg_def_t auto_altref_vp8 = ARG_DEF( + NULL, "auto-alt-ref", 1, "Enable automatic alt reference frames. (0..1)"); static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); static const arg_def_t *vp8_args[] = { &cpu_used_vp8, - &auto_altref, + &auto_altref_vp8, &noise_sens, &sharpness, &static_thresh, @@ -405,12 +373,19 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED, #if CONFIG_VP9_ENCODER static const arg_def_t cpu_used_vp9 = - ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-8..8)"); + ARG_DEF(NULL, "cpu-used", 1, "CPU Used (-9..9)"); +static const arg_def_t auto_altref_vp9 = ARG_DEF( + NULL, "auto-alt-ref", 1, + "Enable automatic alt reference frames, 2+ enables multi-layer. (0..6)"); static const arg_def_t tile_cols = ARG_DEF(NULL, "tile-columns", 1, "Number of tile columns to use, log2"); static const arg_def_t tile_rows = ARG_DEF(NULL, "tile-rows", 1, "Number of tile rows to use, log2 (set to 0 while threads > 1)"); + +static const arg_def_t enable_tpl_model = + ARG_DEF(NULL, "enable-tpl", 1, "Enable temporal dependency model"); + static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode (0: false (default), 1: true)"); static const arg_def_t frame_parallel_decoding = ARG_DEF( @@ -491,11 +466,12 @@ static const arg_def_t row_mt = #if CONFIG_VP9_ENCODER static const arg_def_t *vp9_args[] = { &cpu_used_vp9, - &auto_altref, + &auto_altref_vp9, &sharpness, &static_thresh, &tile_cols, &tile_rows, + &enable_tpl_model, &arnr_maxframes, &arnr_strength, &arnr_type, @@ -527,6 +503,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_STATIC_THRESHOLD, VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, + VP9E_SET_TPL, VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, @@ -552,7 +529,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, static const arg_def_t *no_args[] = { NULL }; -void show_help(FILE *fout, int shorthelp) { +static void show_help(FILE *fout, int shorthelp) { int i; const int num_encoder = get_vpx_encoder_count(); @@ -603,230 +580,6 @@ void usage_exit(void) { exit(EXIT_FAILURE); } -#define mmin(a, b) ((a) < (b) ? 
(a) : (b)) - -#if CONFIG_VP9_HIGHBITDEPTH -static void find_mismatch_high(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - uint16_t *plane1, *plane2; - uint32_t stride1, stride2; - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_Y]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_Y]; - stride1 = img1->stride[VPX_PLANE_Y] / 2; - stride2 = img2->stride[VPX_PLANE_Y] / 2; - for (i = 0, match = 1; match && i < img1->d_h; i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(plane1 + (i + k) * stride1 + j + l); - yloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_U]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_U]; - stride1 = img1->stride[VPX_PLANE_U] / 2; - stride2 = img2->stride[VPX_PLANE_U] / 2; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(plane1 + (i + k) * stride1 + j + l); - uloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } - - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - plane1 = (uint16_t *)img1->planes[VPX_PLANE_V]; - plane2 = (uint16_t *)img2->planes[VPX_PLANE_V]; - stride1 = img1->stride[VPX_PLANE_V] / 2; - stride2 = img2->stride[VPX_PLANE_V] / 2; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(plane1 + (i + k) * stride1 + j + l) != - *(plane2 + (i + k) * stride2 + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(plane1 + (i + k) * stride1 + j + l); - vloc[3] = *(plane2 + (i + k) * stride2 + j + l); - match = 0; - break; - } - } - } - } - } -} -#endif - -static void find_mismatch(const vpx_image_t *const img1, - const vpx_image_t *const img2, int yloc[4], - int uloc[4], int vloc[4]) { - const uint32_t bsize = 64; - const uint32_t bsizey = bsize >> img1->y_chroma_shift; - const uint32_t bsizex = bsize >> img1->x_chroma_shift; - const uint32_t c_w = - (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - int match = 1; - uint32_t i, j; - yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1; - for (i = 0, match = 1; match && i < img1->d_h; 
i += bsize) { - for (j = 0; match && j < img1->d_w; j += bsize) { - int k, l; - const int si = mmin(i + bsize, img1->d_h) - i; - const int sj = mmin(j + bsize, img1->d_w) - j; - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l) != - *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) { - yloc[0] = i + k; - yloc[1] = j + l; - yloc[2] = *(img1->planes[VPX_PLANE_Y] + - (i + k) * img1->stride[VPX_PLANE_Y] + j + l); - yloc[3] = *(img2->planes[VPX_PLANE_Y] + - (i + k) * img2->stride[VPX_PLANE_Y] + j + l); - match = 0; - break; - } - } - } - } - } - - uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l) != - *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l)) { - uloc[0] = i + k; - uloc[1] = j + l; - uloc[2] = *(img1->planes[VPX_PLANE_U] + - (i + k) * img1->stride[VPX_PLANE_U] + j + l); - uloc[3] = *(img2->planes[VPX_PLANE_U] + - (i + k) * img2->stride[VPX_PLANE_U] + j + l); - match = 0; - break; - } - } - } - } - } - vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1; - for (i = 0, match = 1; match && i < c_h; i += bsizey) { - for (j = 0; match && j < c_w; j += bsizex) { - int k, l; - const int si = mmin(i + bsizey, c_h - i); - const int sj = mmin(j + bsizex, c_w - j); - for (k = 0; match && k < si; ++k) { - for (l = 0; match && l < sj; ++l) { - if (*(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l) != - *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l)) { - vloc[0] = i + k; - vloc[1] = j + l; - vloc[2] = *(img1->planes[VPX_PLANE_V] + - (i + k) * img1->stride[VPX_PLANE_V] + j + l); - vloc[3] = *(img2->planes[VPX_PLANE_V] + - (i + k) * img2->stride[VPX_PLANE_V] + j + l); - match = 0; - break; - } - } - } - } - } -} - -static int compare_img(const vpx_image_t *const img1, - const vpx_image_t *const img2) { - uint32_t l_w = img1->d_w; - uint32_t c_w = (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift; - const uint32_t c_h = - (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift; - uint32_t i; - int match = 1; - - match &= (img1->fmt == img2->fmt); - match &= (img1->d_w == img2->d_w); - match &= (img1->d_h == img2->d_h); -#if CONFIG_VP9_HIGHBITDEPTH - if (img1->fmt & VPX_IMG_FMT_HIGHBITDEPTH) { - l_w *= 2; - c_w *= 2; - } -#endif - - for (i = 0; i < img1->d_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y], - img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y], - l_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U], - img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U], - c_w) == 0); - - for (i = 0; i < c_h; ++i) - match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V], - img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V], - c_w) == 0); - - return match; -} - #define NELEMENTS(x) (sizeof(x) / sizeof(x[0])) #if CONFIG_VP9_ENCODER #define ARG_CTRL_CNT_MAX NELEMENTS(vp9_arg_ctrl_map) @@ -1012,57 +765,6 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { } } -static void 
open_input_file(struct VpxInputContext *input) { - /* Parse certain options from the input file, if possible */ - input->file = strcmp(input->filename, "-") ? fopen(input->filename, "rb") - : set_binary_mode(stdin); - - if (!input->file) fatal("Failed to open input file"); - - if (!fseeko(input->file, 0, SEEK_END)) { - /* Input file is seekable. Figure out how long it is, so we can get - * progress info. - */ - input->length = ftello(input->file); - rewind(input->file); - } - - /* Default to 1:1 pixel aspect ratio. */ - input->pixel_aspect_ratio.numerator = 1; - input->pixel_aspect_ratio.denominator = 1; - - /* For RAW input sources, these bytes will applied on the first frame - * in read_frame(). - */ - input->detect.buf_read = fread(input->detect.buf, 1, 4, input->file); - input->detect.position = 0; - - if (input->detect.buf_read == 4 && file_is_y4m(input->detect.buf)) { - if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4, - input->only_i420) >= 0) { - input->file_type = FILE_TYPE_Y4M; - input->width = input->y4m.pic_w; - input->height = input->y4m.pic_h; - input->pixel_aspect_ratio.numerator = input->y4m.par_n; - input->pixel_aspect_ratio.denominator = input->y4m.par_d; - input->framerate.numerator = input->y4m.fps_n; - input->framerate.denominator = input->y4m.fps_d; - input->fmt = input->y4m.vpx_fmt; - input->bit_depth = input->y4m.bit_depth; - } else - fatal("Unsupported Y4M stream."); - } else if (input->detect.buf_read == 4 && fourcc_is_ivf(input->detect.buf)) { - fatal("IVF is not supported as input."); - } else { - input->file_type = FILE_TYPE_RAW; - } -} - -static void close_input_file(struct VpxInputContext *input) { - fclose(input->file); - if (input->file_type == FILE_TYPE_Y4M) y4m_input_close(&input->y4m); -} - static struct stream_state *new_stream(struct VpxEncoderConfig *global, struct stream_state *prev) { struct stream_state *stream; @@ -1278,8 +980,8 @@ static int parse_stream_params(struct VpxEncoderConfig *global, match = 1; /* Point either to the next free element or the first - * instance of this control. - */ + * instance of this control. 
+ */ for (j = 0; j < config->arg_ctrl_cnt; j++) if (ctrl_args_map != NULL && config->arg_ctrls[j][0] == ctrl_args_map[i]) @@ -1614,14 +1316,14 @@ static void encode_frame(struct stream_state *stream, vpx_img_alloc(NULL, VPX_IMG_FMT_I42016, cfg->g_w, cfg->g_h, 16); } I420Scale_16( - (uint16 *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, - (uint16 *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, - (uint16 *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, - img->d_w, img->d_h, (uint16 *)stream->img->planes[VPX_PLANE_Y], + (uint16_t *)img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y] / 2, + (uint16_t *)img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U] / 2, + (uint16_t *)img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V] / 2, + img->d_w, img->d_h, (uint16_t *)stream->img->planes[VPX_PLANE_Y], stream->img->stride[VPX_PLANE_Y] / 2, - (uint16 *)stream->img->planes[VPX_PLANE_U], + (uint16_t *)stream->img->planes[VPX_PLANE_U], stream->img->stride[VPX_PLANE_U] / 2, - (uint16 *)stream->img->planes[VPX_PLANE_V], + (uint16_t *)stream->img->planes[VPX_PLANE_V], stream->img->stride[VPX_PLANE_V] / 2, stream->img->d_w, stream->img->d_h, kFilterBox); img = stream->img; @@ -2215,9 +1917,9 @@ int main(int argc, const char **argv_) { if (!global.quiet) { FOREACH_STREAM(fprintf( - stderr, "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 - "b/f %7" PRId64 "b/s" - " %7" PRId64 " %s (%.2f fps)\033[K\n", + stderr, + "\rPass %d/%d frame %4d/%-4d %7" PRId64 "B %7" PRId64 "b/f %7" PRId64 + "b/s %7" PRId64 " %s (%.2f fps)\033[K\n", pass + 1, global.passes, frames_in, stream->frames_out, (int64_t)stream->nbytes, seen_frames ? (int64_t)(stream->nbytes * 8 / seen_frames) : 0, diff --git a/libs/libvpx/vpxenc.h b/libs/libvpx/vpxenc.h index d867e9d9548..b780aedca69 100644 --- a/libs/libvpx/vpxenc.h +++ b/libs/libvpx/vpxenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPXENC_H_ -#define VPXENC_H_ +#ifndef VPX_VPXENC_H_ +#define VPX_VPXENC_H_ #include "vpx/vpx_encoder.h" @@ -61,4 +61,4 @@ struct VpxEncoderConfig { } // extern "C" #endif -#endif // VPXENC_H_ +#endif // VPX_VPXENC_H_ diff --git a/libs/libvpx/vpxstats.h b/libs/libvpx/vpxstats.h index 5c9ea34f71a..3625ee3291f 100644 --- a/libs/libvpx/vpxstats.h +++ b/libs/libvpx/vpxstats.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPXSTATS_H_ -#define VPXSTATS_H_ +#ifndef VPX_VPXSTATS_H_ +#define VPX_VPXSTATS_H_ #include @@ -40,4 +40,4 @@ vpx_fixed_buf_t stats_get(stats_io_t *stats); } // extern "C" #endif -#endif // VPXSTATS_H_ +#endif // VPX_VPXSTATS_H_ diff --git a/libs/libvpx/warnings.h b/libs/libvpx/warnings.h index 6b8ae6796f3..15558c64374 100644 --- a/libs/libvpx/warnings.h +++ b/libs/libvpx/warnings.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WARNINGS_H_ -#define WARNINGS_H_ +#ifndef VPX_WARNINGS_H_ +#define VPX_WARNINGS_H_ #ifdef __cplusplus extern "C" { @@ -30,4 +30,4 @@ void check_encoder_config(int disable_prompt, } // extern "C" #endif -#endif // WARNINGS_H_ +#endif // VPX_WARNINGS_H_ diff --git a/libs/libvpx/webmdec.h b/libs/libvpx/webmdec.h index 7dcb170caf3..d8618b07d67 100644 --- a/libs/libvpx/webmdec.h +++ b/libs/libvpx/webmdec.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef WEBMDEC_H_ -#define WEBMDEC_H_ +#ifndef VPX_WEBMDEC_H_ +#define VPX_WEBMDEC_H_ #include "./tools_common.h" @@ -66,4 +66,4 @@ void webm_free(struct WebmInputContext *webm_ctx); } // extern "C" #endif -#endif // WEBMDEC_H_ +#endif // VPX_WEBMDEC_H_ diff --git a/libs/libvpx/webmenc.h b/libs/libvpx/webmenc.h index b4a9e357bb8..4176e820814 100644 --- a/libs/libvpx/webmenc.h +++ b/libs/libvpx/webmenc.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef WEBMENC_H_ -#define WEBMENC_H_ +#ifndef VPX_WEBMENC_H_ +#define VPX_WEBMENC_H_ #include #include @@ -52,4 +52,4 @@ void write_webm_file_footer(struct WebmOutputContext *webm_ctx); } // extern "C" #endif -#endif // WEBMENC_H_ +#endif // VPX_WEBMENC_H_ diff --git a/libs/libvpx/y4menc.h b/libs/libvpx/y4menc.h index 69d590413ec..9a367e34c63 100644 --- a/libs/libvpx/y4menc.h +++ b/libs/libvpx/y4menc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef Y4MENC_H_ -#define Y4MENC_H_ +#ifndef VPX_Y4MENC_H_ +#define VPX_Y4MENC_H_ #include "./tools_common.h" @@ -30,4 +30,4 @@ int y4m_write_frame_header(char *buf, size_t len); } // extern "C" #endif -#endif // Y4MENC_H_ +#endif // VPX_Y4MENC_H_ diff --git a/libs/libvpx/y4minput.c b/libs/libvpx/y4minput.c index 1de636cc0bc..56d5598276f 100644 --- a/libs/libvpx/y4minput.c +++ b/libs/libvpx/y4minput.c @@ -130,8 +130,8 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { The number of taps is intentionally kept small to reduce computational overhead and limit ringing. - The taps from these filters are scaled so that their sum is 1, and the result - is scaled by 128 and rounded to integers to create a filter whose + The taps from these filters are scaled so that their sum is 1, and the + result is scaled by 128 and rounded to integers to create a filter whose intermediate values fit inside 16 bits. Coefficients are rounded in such a way as to ensure their sum is still 128, which is usually equivalent to normal rounding. @@ -139,7 +139,6 @@ static int y4m_parse_tags(y4m_input *_y4m, char *_tags) { Conversions which require both horizontal and vertical filtering could have these steps pipelined, for less memory consumption and better cache performance, but we do them separately for simplicity.*/ - #define OC_MINI(_a, _b) ((_a) > (_b) ? (_b) : (_a)) #define OC_MAXI(_a, _b) ((_a) < (_b) ? (_b) : (_a)) #define OC_CLAMPI(_a, _b, _c) (OC_MAXI(_a, OC_MINI(_b, _c))) @@ -976,6 +975,8 @@ int y4m_input_open(y4m_input *_y4m, FILE *_fin, char *_skip, int _nskip, _y4m->aux_buf_sz = _y4m->aux_buf_read_sz + ((_y4m->pic_w + 1) / 2) * _y4m->pic_h; _y4m->convert = y4m_convert_411_420jpeg; + fprintf(stderr, "Unsupported conversion from yuv 411\n"); + return -1; } else if (strcmp(_y4m->chroma_type, "444") == 0) { _y4m->src_c_dec_h = 1; _y4m->src_c_dec_v = 1; diff --git a/libs/libvpx/y4minput.h b/libs/libvpx/y4minput.h index 9e69ceb835a..a4a8b18dc53 100644 --- a/libs/libvpx/y4minput.h +++ b/libs/libvpx/y4minput.h @@ -11,8 +11,8 @@ * Copyright (C) 2002-2010 The Xiph.Org Foundation and contributors. 
*/ -#ifndef Y4MINPUT_H_ -#define Y4MINPUT_H_ +#ifndef VPX_Y4MINPUT_H_ +#define VPX_Y4MINPUT_H_ #include #include "vpx/vpx_image.h" @@ -65,4 +65,4 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *img); } // extern "C" #endif -#endif // Y4MINPUT_H_ +#endif // VPX_Y4MINPUT_H_ diff --git a/libs/libyuv/Android.bp b/libs/libyuv/Android.bp index 691686aad33..ea76f9435be 100644 --- a/libs/libyuv/Android.bp +++ b/libs/libyuv/Android.bp @@ -69,6 +69,7 @@ cc_library { // with libyuv (b/37646797) cc_library_static { name: "libyuv_static", + vendor_available: true, whole_static_libs: ["libyuv"], } diff --git a/libs/libyuv/DEPS b/libs/libyuv/DEPS index aad3f8ed7ec..758848138d4 100644 --- a/libs/libyuv/DEPS +++ b/libs/libyuv/DEPS @@ -1,44 +1,76 @@ vars = { 'chromium_git': 'https://chromium.googlesource.com', - 'chromium_revision': '4476bd69d1c8e4e1cde8633d3b33c992f7d3a6d0', - 'swarming_revision': '0e3e1c4dc4e79f25a5b58fcbc135dc93183c0c54', - # Three lines of non-changing comments so that - # the commit queue can handle CLs rolling lss - # and whatever else without interference from each other. - 'lss_revision': 'e6527b0cd469e3ff5764785dadcb39bf7d787154', - # Three lines of non-changing comments so that - # the commit queue can handle CLs rolling catapult - # and whatever else without interference from each other. - 'catapult_revision': 'a24a725f7834c16b3628bfb63f349b3480bf9592', + 'chromium_revision': '08ed8f830153eebaf021fabcf92aed05a4cdc2a8', } deps = { 'src/build': - Var('chromium_git') + '/chromium/src/build' + '@' + '669e41d6f18842ed5740449662a71b715dc607c6', + Var('chromium_git') + '/chromium/src/build' + '@' + '2ee8adf846a4b485fae5ea87290f82447c638c1a', 'src/buildtools': - Var('chromium_git') + '/chromium/buildtools.git' + '@' + '0e1cbc4eab6861b0c84bf2ed9a3c4b7aa2063819', + Var('chromium_git') + '/chromium/src/buildtools' + '@' + '74cfb57006f83cfe050817526db359d5c8a11628', 'src/testing': - Var('chromium_git') + '/chromium/src/testing' + '@' + 'b1c6aeebeabcc177a83ff0a33dc6c3ab03d4aa94', + Var('chromium_git') + '/chromium/src/testing' + '@' + 'ae52062d9c6d7e38df1c17b53caad1289a4742a1', 'src/third_party': - Var('chromium_git') + '/chromium/src/third_party' + '@' + 'be3e0fc18f2e9ea14d0e9369e539eae5986335fd', + Var('chromium_git') + '/chromium/src/third_party' + '@' + '3a7aa3a24ca6db31d7afbee24f1c61dffc6d21f4', + + 'src/buildtools/linux64': { + 'packages': [ + { + 'package': 'gn/gn/linux-amd64', + 'version': 'git_revision:972ed755f8e6d31cae9ba15fcd08136ae1a7886f', + } + ], + 'dep_type': 'cipd', + 'condition': 'checkout_linux', + }, + 'src/buildtools/mac': { + 'packages': [ + { + 'package': 'gn/gn/mac-amd64', + 'version': 'git_revision:972ed755f8e6d31cae9ba15fcd08136ae1a7886f', + } + ], + 'dep_type': 'cipd', + 'condition': 'checkout_mac', + }, + 'src/buildtools/win': { + 'packages': [ + { + 'package': 'gn/gn/windows-amd64', + 'version': 'git_revision:972ed755f8e6d31cae9ba15fcd08136ae1a7886f', + } + ], + 'dep_type': 'cipd', + 'condition': 'checkout_win', + }, + + 'src/buildtools/clang_format/script': + Var('chromium_git') + '/chromium/llvm-project/cfe/tools/clang-format.git' + '@' + '96636aa0e9f047f17447f2d45a094d0b59ed7917', + 'src/buildtools/third_party/libc++/trunk': + Var('chromium_git') + '/chromium/llvm-project/libcxx.git' + '@' + '5938e0582bac570a41edb3d6a2217c299adc1bc6', + 'src/buildtools/third_party/libc++abi/trunk': + Var('chromium_git') + '/chromium/llvm-project/libcxxabi.git' + '@' + '0d529660e32d77d9111912d73f2c74fc5fa2a858', + 
'src/buildtools/third_party/libunwind/trunk': + Var('chromium_git') + '/external/llvm.org/libunwind.git' + '@' + '69d9b84cca8354117b9fe9705a4430d789ee599b', + 'src/third_party/catapult': - Var('chromium_git') + '/catapult.git' + '@' + Var('catapult_revision'), + Var('chromium_git') + '/catapult.git' + '@' + '2e1d9ff85e0a54a39f55d656086ff00a830cb7a9', 'src/third_party/colorama/src': Var('chromium_git') + '/external/colorama.git' + '@' + '799604a1041e9b3bc5d2789ecbd7e8db2e18e6b8', 'src/third_party/freetype/src': - Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + 'd01e28f41f8810c8ea422b854f8722659589fa99', + Var('chromium_git') + '/chromium/src/third_party/freetype2.git' + '@' + '3fa35aa420ee88856c60d3c0b7fedd43801953cc', 'src/third_party/googletest/src': - Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '879ac092fde0a19e1b3a61b2546b2a422b1528bc', + Var('chromium_git') + '/external/github.com/google/googletest.git' + '@' + '565f1b848215b77c3732bca345fe76a0431d8b34', 'src/third_party/harfbuzz-ng/src': - Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + '26c5b54fb09fb45e02c9c4618bcea4958c698953', + Var('chromium_git') + '/external/github.com/harfbuzz/harfbuzz.git' + '@' + 'bbad1b8298125d78c159ed7fdd7bde6a3f3fff56', 'src/third_party/libjpeg_turbo': - Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + '61a2bbaa9aec89cb2c882d87ace6aba9aee49bb9', + Var('chromium_git') + '/chromium/deps/libjpeg_turbo.git' + '@' + 'd460d6b1cb965c3363f36f7ed716f13d60cdb65d', 'src/third_party/yasm/source/patched-yasm': Var('chromium_git') + '/chromium/deps/yasm/patched-yasm.git' + '@' + '720b70524a4424b15fc57e82263568c8ba0496ad', 'src/tools': - Var('chromium_git') + '/chromium/src/tools' + '@' + '419541c8352b3b75a99c9a5a7c0d1e7b92f3fcf7', + Var('chromium_git') + '/chromium/src/tools' + '@' + '716cf8f74e9df7f53f474bae612b0b69bff8ed4e', 'src/tools/swarming_client': - Var('chromium_git') + '/infra/luci/client-py.git' + '@' + Var('swarming_revision'), + Var('chromium_git') + '/infra/luci/client-py.git' + '@' + '96f125709acfd0b48fc1e5dae7d6ea42291726ac', # libyuv-only dependencies (not present in Chromium). 
'src/third_party/gflags': @@ -49,7 +81,7 @@ deps = { Var('chromium_git') + '/external/webrtc/deps/third_party/gtest-parallel' + '@' + '1dad0e9f6d82ff994130b529d7d814b40eb32b0e', 'src/third_party/lss': { - 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + Var('lss_revision'), + 'url': Var('chromium_git') + '/linux-syscall-support.git' + '@' + '8048ece6c16c91acfe0d36d1d3cc0890ab6e945c', 'condition': 'checkout_android or checkout_linux', }, @@ -58,7 +90,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/accessibility-test-framework', - 'version': 'version:2.1-cr0', + 'version': 'b5ec1e56e58e56bc1a0c77d43111c37f9b512c8a', }, ], 'condition': 'checkout_android', @@ -68,15 +100,17 @@ deps = { 'url': Var('chromium_git') + '/external/github.com/google/auto.git' + '@' + '8a81a858ae7b78a1aef71ac3905fade0bbd64e82', 'condition': 'checkout_android', }, + 'src/third_party/boringssl/src': + 'https://boringssl.googlesource.com/boringssl.git' + '@' + '05cd93068b0a553afc48f69acbceae10c6a17593', 'src/base': { - 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '162a5d66ad148f26bbbe6b6ecaf5c1bafa2173e6', + 'url': Var('chromium_git') + '/chromium/src/base' + '@' + '3bd361995da8b0cc4e32789e3490f17e4c2997b4', 'condition': 'checkout_android', }, 'src/third_party/bazel': { 'packages': [ { 'package': 'chromium/third_party/bazel', - 'version': 'version:0.10.0', + 'version': '1794576f65a721eb0af320a0701e48d31f1b2415', }, ], 'condition': 'checkout_android', @@ -86,35 +120,83 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/bouncycastle', - 'version': 'version:1.46-cr0', + 'version': 'c078e87552ba26e776566fdaf0f22cd8712743d0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/android_ndk': { - 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '4e2cea441bfd43f0863d14f57b1e1844260b9884', + 'url': Var('chromium_git') + '/android_ndk.git' + '@' + '62582753e869484bf0cc7f7e8d184ce0077033c2', 'condition': 'checkout_android', }, 'src/third_party/android_support_test_runner': { 'packages': [ { 'package': 'chromium/third_party/android_support_test_runner', - 'version': 'version:0.5-cr0', + 'version': '96d4bf848cd210fdcbca6bcc8c1b4b39cbd93141', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_tools': { - 'url': Var('chromium_git') + '/android_tools.git' + '@' + 'e958d6ea74442d4e0849bb8a018d215a0e78981d', - 'condition': 'checkout_android', + 'src/third_party/android_sdk/public': { + 'packages': [ + { + 'package': 'chromium/third_party/android_sdk/public/build-tools', + 'version': '5DL7LQQjVMLClXLzLgmGysccPGsGcjJdvH9z5-uetiIC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/emulator', + 'version': 'xhyuoquVvBTcJelgRjMKZeoBVSQRjB7pLVJPt5C9saIC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/extras', + 'version': 'ppQ4TnqDvBHQ3lXx5KPq97egzF5X2FFyOrVHkGmiTMQC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/patcher', + 'version': 'I6FNMhrXlpB-E1lOhMlvld7xt9lBVNOO83KIluXDyA0C', + }, + { + 'package': 'chromium/third_party/android_sdk/public/platform-tools', + 'version': 'MSnxgXN7IurL-MQs1RrTkSFSb8Xd1UtZjLArI8Ty1FgC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/platforms', + 'version': 'Kg2t9p0YnQk8bldUv4VA3o156uPXLUfIFAmVZ-Gm5ewC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/sources', + 'version': 'K9uEn3JvNELEVjjVK_GQD3ZQD3rqAnJSxCWxjmUmRkgC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/tools', + 'version': 
'wYcRQC2WHsw2dKWs4EA7fw9Qsyzu1ds1_fRjKmGxe5QC', + }, + { + 'package': 'chromium/third_party/android_sdk/public/tools-lint', + 'version': '89hXqZYzCum3delB5RV7J_QyWkaRodqdtQS0s3LMh3wC', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/tools/clang/dsymutil': { + 'packages': [ + { + 'package': 'chromium/llvm-build-tools/dsymutil', + 'version': 'OWlhXkmj18li3yhJk59Kmjbc5KdgLh56TwCd1qBdzlIC', + } + ], + 'condition': 'checkout_mac', + 'dep_type': 'cipd', }, 'src/third_party/android_build_tools/aapt2': { 'packages': [ { - 'package': 'chromium/third_party/android_tools_aapt2', - 'version': 'version:3.2.0-alpha18-4804415-cr0', + 'package': 'chromium/third_party/android_build_tools/aapt2', + 'version': 'version:3.6.0-alpha03-5516695-cr0', }, ], 'condition': 'checkout_android', @@ -124,14 +206,14 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/byte_buddy', - 'version': 'version:1.4.17-cr0', + 'version': 'c9b53316603fc2d997c899c7ca1707f809b918cd', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/ced/src': { - 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + '94c367a1fe3a13207f4b22604fcfd1d9f9ddf6d9', + 'url': Var('chromium_git') + '/external/github.com/google/compact_enc_det.git' + '@' + 'ba412eaaacd3186085babcd901679a48863c7dd5', 'condition': 'checkout_android', }, 'src/third_party/errorprone/lib': { @@ -146,7 +228,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/gson', - 'version': 'version:2.8.0-cr0', + 'version': '681931c9778045903a0ed59856ce2dd8dd7bf7ca', }, ], 'condition': 'checkout_android', @@ -156,7 +238,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/guava', - 'version': 'version:23.0-cr0', + 'version': 'a6fba501f3a0de88b9be1daa2052632de5b96a46', }, ], 'condition': 'checkout_android', @@ -166,20 +248,21 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/hamcrest', - 'version': 'version:1.3-cr0', + 'version': '37eccfc658fe79695d6abb6dd497463c4372032f', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/icu': { - 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + 'd65301491c513d49163ad29c853eb85c02c8d5b4', + 'url': Var('chromium_git') + '/chromium/deps/icu.git' + '@' + '53f6b233a41ec982d8445996247093f7aaf41639', }, 'src/third_party/icu4j': { 'packages': [ { 'package': 'chromium/third_party/icu4j', - 'version': 'version:53.1-cr0', + 'version': 'e87e5bed2b4935913ee26a3ebd0b723ee2344354', }, ], 'condition': 'checkout_android', @@ -189,7 +272,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/intellij', - 'version': 'version:12.0-cr0', + 'version': '77c2721b024b36ee073402c08e6d8428c0295336', }, ], 'condition': 'checkout_android', @@ -211,7 +294,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/objenesis', - 'version': 'version:2.4-cr0', + 'version': '9e367f55e5a65781ee77bfcbaa88fb82b30e75c0', }, ], 'condition': 'checkout_android', @@ -221,7 +304,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/ow2_asm', - 'version': 'version:5.0.1-cr0', + 'version': 'NNAhdJzMdnutUVqfSJm5v0tVazA9l3Dd6CRwH6N4Q5kC', }, ], 'condition': 'checkout_android', @@ -231,7 +314,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/r8', - 'version': 'version:1.0.30', + 'version': '6xVKWv-ssICwyU5FC1osaRpeZio2kM4Tko33I_SIK-EC', }, ], 'condition': 'checkout_android', @@ -255,21 +338,21 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/robolectric', - 'version': 'version:3.5.1', + 
'version': '1KXoOiNP1a_uZNdM2ybWKwAQNow1dHTXTig-ZK4Xgq8C', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, 'src/third_party/robolectric/robolectric': { - 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '7e067f1112e1502caa742f7be72d37b5678d3403', + 'url': Var('chromium_git') + '/external/robolectric.git' + '@' + '3d6dcabf5521e028c8efc2778ab6bd8c7b6d923c', 'condition': 'checkout_android', }, 'src/third_party/sqlite4java': { 'packages': [ { 'package': 'chromium/third_party/sqlite4java', - 'version': 'version:0.282-cr0', + 'version': '889660698187baa7c8b0d79f7bf58563125fbd66', }, ], 'condition': 'checkout_android', @@ -283,7 +366,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/xstream', - 'version': 'version:1.4.8-cr0', + 'version': '4278b1b78b86ab7a1a29e64d5aec9a47a9aab0fe', }, ], 'condition': 'checkout_android', @@ -292,7 +375,7 @@ deps = { # iOS deps: 'src/ios': { - 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '44be3c093cf2db7ab4cf1997d6a1a07722f1f391', + 'url': Var('chromium_git') + '/chromium/src/ios' + '@' + '2f3f83ac15360abe4307e5092647d6dd3601e861', 'condition': 'checkout_ios' }, @@ -304,12 +387,78 @@ deps = { }, # === ANDROID_DEPS Generated Code Start === - # Generated by //tools/android/roll/android_deps/fetch_all.sh + # Generated by //tools/android/roll/android_deps/fetch_all.py 'src/third_party/android_deps/libs/android_arch_core_common': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_core_common', - 'version': 'version:1.0.0-cr0', + 'version': 'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata', + 'version': 'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/android_arch_lifecycle_livedata_core': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_livedata_core', + 'version': 'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/android_arch_core_runtime': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/android_arch_core_runtime', + 'version': 'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/android_arch_lifecycle_viewmodel': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_viewmodel', + 'version': 'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_localbroadcastmanager': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_localbroadcastmanager', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_print': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_print', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -320,7 +469,18 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common', - 'version': 'version:1.0.0-cr0', + 'version': 
'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/android_arch_lifecycle_common_java8': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_common_java8', + 'version': 'version:1.1.1-cr0', }, ], 'condition': 'checkout_android', @@ -331,6 +491,17 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/android_arch_lifecycle_runtime', + 'version': 'version:1.1.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/androidx_annotation_annotation': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_annotation_annotation', 'version': 'version:1.0.0-cr0', }, ], @@ -338,11 +509,77 @@ deps = { 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_lifecycle_lifecycle_common', + 'version': 'version:2.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/androidx_test_core': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_test_core', + 'version': 'version:1.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/androidx_test_ext_junit': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_test_ext_junit', + 'version': 'version:1.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/androidx_test_monitor': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_test_monitor', + 'version': 'version:1.1.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/androidx_test_rules': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_test_rules', + 'version': 'version:1.1.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/androidx_test_runner': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/androidx_test_runner', + 'version': 'version:1.1.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + 'src/third_party/android_deps/libs/com_android_support_animated_vector_drawable': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_animated_vector_drawable', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -353,7 +590,73 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_appcompat_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_asynclayoutinflater': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_asynclayoutinflater', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_coordinatorlayout': { + 'packages': [ + { + 'package': 
'chromium/third_party/android_deps/libs/com_android_support_coordinatorlayout', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_customview': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_customview', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_drawerlayout': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_drawerlayout', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_slidingpanelayout': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_slidingpanelayout', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_loader': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_loader', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -364,7 +667,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_cardview_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -375,7 +678,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_design', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -386,7 +689,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_gridlayout_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -397,7 +700,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_leanback_v17', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -408,7 +711,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_mediarouter_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -430,7 +733,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_palette_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -441,7 +744,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_leanback_v17', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -452,7 +755,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v14', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -463,7 +766,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_preference_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -474,7 +777,7 @@ deps = { 'packages': [ { 'package': 
'chromium/third_party/android_deps/libs/com_android_support_recyclerview_v7', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -485,7 +788,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_annotations', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -496,7 +799,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_compat', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -507,7 +810,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_ui', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -518,7 +821,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_core_utils', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -529,7 +832,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_fragment', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -540,7 +843,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_media_compat', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -551,7 +854,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v13', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -562,7 +865,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_v4', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -573,7 +876,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_support_vector_drawable', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -584,18 +887,96 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_android_support_transition', - 'version': 'version:27.0.0-cr0', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_swiperefreshlayout': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_swiperefreshlayout', + 'version': 'version:28.0.0-cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/com_android_support_viewpager': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_viewpager', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_collections': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_collections', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 
'src/third_party/android_deps/libs/com_android_support_versionedparcelable': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_versionedparcelable', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_cursoradapter': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_cursoradapter', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_interpolator': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_interpolator', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_android_support_documentfile': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_android_support_documentfile', + 'version': 'version:28.0.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_auth': { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -606,7 +987,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_api_phone', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -617,7 +998,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_auth_base', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -628,7 +1009,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_base', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -639,7 +1020,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_basement', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -650,7 +1031,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast', - 'version': 'version:12.0.1-cr0', + 'version': 'version:16.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -661,7 +1042,18 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_cast_framework', - 'version': 'version:12.0.1-cr0', + 'version': 'version:16.0.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_clearcut', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -672,7 +1064,18 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_fido', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', + }, + ], + 
'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_flags': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_flags', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -683,7 +1086,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_gcm', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -694,7 +1097,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_iid', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -705,7 +1108,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_instantapps', - 'version': 'version:12.0.1-cr0', + 'version': 'version:16.0.0-cr0', }, ], 'condition': 'checkout_android', @@ -716,7 +1119,40 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_location', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_phenotype', + 'version': 'version:15.0.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_places_placereport', + 'version': 'version:15.0.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_android_gms_play_services_stats': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_stats', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -727,7 +1163,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_tasks', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -738,7 +1174,7 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -749,18 +1185,128 @@ deps = { 'packages': [ { 'package': 'chromium/third_party/android_deps/libs/com_google_android_gms_play_services_vision_common', - 'version': 'version:12.0.1-cr0', + 'version': 'version:15.0.1-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_code_findbugs_jsr305': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_code_findbugs_jsr305', + 'version': 'version:1.3.9-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_dagger_dagger': { + 'packages': [ + { + 'package': 
'chromium/third_party/android_deps/libs/com_google_dagger_dagger', + 'version': 'version:2.17-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_dagger_dagger_compiler': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_compiler', + 'version': 'version:2.17-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_dagger_dagger_producers': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_producers', + 'version': 'version:2.17-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_dagger_dagger_spi': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_dagger_dagger_spi', + 'version': 'version:2.17-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_error_prone_annotations', + 'version': 'version:2.1.3-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_errorprone_javac_shaded': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_errorprone_javac_shaded', + 'version': 'version:9-dev-r4023-3-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_googlejavaformat_google_java_format', + 'version': 'version:1.5-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_guava_guava': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_guava_guava', + 'version': 'version:25.0-jre-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/com_google_j2objc_j2objc_annotations', + 'version': 'version:1.1-cr0', }, ], 'condition': 'checkout_android', 'dep_type': 'cipd', }, - 'src/third_party/android_deps/libs/com_google_android_play_core': { + 'src/third_party/android_deps/libs/com_google_protobuf_protobuf_lite': { 'packages': [ { - 'package': 'chromium/third_party/android_deps/libs/com_google_android_play_core', - 'version': 'version:1.3.0-cr0', + 'package': 'chromium/third_party/android_deps/libs/com_google_protobuf_protobuf_lite', + 'version': 'version:3.0.1-cr0', }, ], 'condition': 'checkout_android', @@ -778,6 +1324,61 @@ deps = { 'dep_type': 'cipd', }, + 'src/third_party/android_deps/libs/javax_annotation_jsr250_api': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/javax_annotation_jsr250_api', + 'version': 'version:1.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/javax_inject_javax_inject': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/javax_inject_javax_inject', + 'version': 'version:1-cr0', + }, + ], + 'condition': 'checkout_android', + 
'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/net_sf_kxml_kxml2': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/net_sf_kxml_kxml2', + 'version': 'version:2.3.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_checkerframework_checker_compat_qual': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_checkerframework_checker_compat_qual', + 'version': 'version:2.3.0-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + + 'src/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations': { + 'packages': [ + { + 'package': 'chromium/third_party/android_deps/libs/org_codehaus_mojo_animal_sniffer_annotations', + 'version': 'version:1.14-cr0', + }, + ], + 'condition': 'checkout_android', + 'dep_type': 'cipd', + }, + # === ANDROID_DEPS Generated Code End === } @@ -883,40 +1484,6 @@ hooks = [ 'action': ['python', 'src/build/util/lastchange.py', '-o', 'src/build/util/LASTCHANGE'], }, - # Pull GN binaries. - { - 'name': 'gn_win', - 'pattern': '.', - 'action': [ 'download_from_google_storage', - '--no_resume', - '--platform=win32', - '--no_auth', - '--bucket', 'chromium-gn', - '-s', 'src/buildtools/win/gn.exe.sha1', - ], - }, - { - 'name': 'gn_mac', - 'pattern': '.', - 'action': [ 'download_from_google_storage', - '--no_resume', - '--platform=darwin', - '--no_auth', - '--bucket', 'chromium-gn', - '-s', 'src/buildtools/mac/gn.sha1', - ], - }, - { - 'name': 'gn_linux64', - 'pattern': '.', - 'action': [ 'download_from_google_storage', - '--no_resume', - '--platform=linux*', - '--no_auth', - '--bucket', 'chromium-gn', - '-s', 'src/buildtools/linux64/gn.sha1', - ], - }, # Pull clang-format binaries using checked-in hashes. { 'name': 'clang_format_win', @@ -943,6 +1510,7 @@ hooks = [ { 'name': 'clang_format_linux', 'pattern': '.', + 'condition': 'host_os == "linux"', 'action': [ 'download_from_google_storage', '--no_resume', '--platform=linux*', @@ -997,25 +1565,6 @@ hooks = [ '--root', 'src', ], }, - # Android dependencies. Many are downloaded using Google Storage these days. - # They're copied from https://cs.chromium.org/chromium/src/DEPS for all - # such dependencies we share with Chromium. - { - # This downloads SDK extras and puts them in the - # third_party/android_tools/sdk/extras directory. - 'name': 'sdkextras', - 'condition': 'checkout_android', - 'pattern': '.', - # When adding a new sdk extras package to download, add the package - # directory and zip file to .gitignore in third_party/android_tools. - 'action': ['vpython', - 'src/build/android/play_services/update.py', - 'download' - ], - }, ] -recursedeps = [ - # buildtools provides clang_format, libc++, and libc++abi. - 'src/buildtools', -] +recursedeps = [] diff --git a/libs/libyuv/README.chromium b/libs/libyuv/README.chromium index be44f7aa40f..dadffc9c3be 100644 --- a/libs/libyuv/README.chromium +++ b/libs/libyuv/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1724 +Version: 1735 License: BSD License File: LICENSE diff --git a/libs/libyuv/docs/rotation.md b/libs/libyuv/docs/rotation.md index fb84fce5a9c..a08430fded0 100644 --- a/libs/libyuv/docs/rotation.md +++ b/libs/libyuv/docs/rotation.md @@ -100,4 +100,8 @@ Inverting can be achieved with almost any libyuv function by passing a negative I420Mirror and ARGBMirror can also be used to rotate by 180 degrees by passing a negative height. 
+# Cropping - Vertical Flip
+When cropping from a subsampled format like NV21, the method of setting the start pointers won't work for odd crop start y on the UV plane.
+If the height after cropping is odd, invert the source - point to the last row, negate the strides, and pass negative height, which
+will re-invert the image as the conversion outputs it.
diff --git a/libs/libyuv/include/libyuv/convert.h b/libs/libyuv/include/libyuv/convert.h
index 674325a386f..f571142fab3 100644
--- a/libs/libyuv/include/libyuv/convert.h
+++ b/libs/libyuv/include/libyuv/convert.h
@@ -226,6 +226,28 @@ int UYVYToI420(const uint8_t* src_uyvy,
 int width,
 int height);
+// Convert AYUV to NV12.
+LIBYUV_API
+int AYUVToNV12(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_uv,
+ int dst_stride_uv,
+ int width,
+ int height);
+
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+ int src_stride_ayuv,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height);
+
 // Convert M420 to I420.
 LIBYUV_API
 int M420ToI420(const uint8_t* src_m420,
@@ -322,6 +344,19 @@ int RGB24ToI420(const uint8_t* src_rgb24,
 int width,
 int height);
+// RGB little endian (bgr in memory) to J420.
+LIBYUV_API
+int RGB24ToJ420(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_y,
+ int dst_stride_y,
+ uint8_t* dst_u,
+ int dst_stride_u,
+ uint8_t* dst_v,
+ int dst_stride_v,
+ int width,
+ int height);
+
 // RGB big endian (rgb in memory) to I420.
 LIBYUV_API
 int RAWToI420(const uint8_t* src_raw,
@@ -374,14 +409,21 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
 int width,
 int height);
+// RGB little endian (bgr in memory) to J400.
+LIBYUV_API
+int RGB24ToJ400(const uint8_t* src_rgb24,
+ int src_stride_rgb24,
+ uint8_t* dst_yj,
+ int dst_stride_yj,
+ int width,
+ int height);
+
 #ifdef HAVE_JPEG
-// src_mjpg is pointer to raw jpeg bytes in memory
-// src_size_mjpg is size of jpeg in bytes
 // src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToI420(const uint8_t* src_mjpg,
- size_t src_size_mjpg,
+int MJPGToI420(const uint8_t* sample,
+ size_t sample_size,
 uint8_t* dst_y,
 int dst_stride_y,
 uint8_t* dst_u,
@@ -395,8 +437,8 @@ int MJPGToI420(const uint8_t* src_mjpg,
 // JPEG to NV21
 LIBYUV_API
-int MJPGToNV21(const uint8_t* src_mjpg,
- size_t src_size_mjpg,
+int MJPGToNV21(const uint8_t* sample,
+ size_t sample_size,
 uint8_t* dst_y,
 int dst_stride_y,
 uint8_t* dst_vu,
@@ -408,8 +450,8 @@ int MJPGToNV21(const uint8_t* src_mjpg,
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8_t* src_mjpg,
- size_t src_size_mjpg,
+int MJPGSize(const uint8_t* sample,
+ size_t sample_size,
 int* width,
 int* height);
 #endif
diff --git a/libs/libyuv/include/libyuv/convert_argb.h b/libs/libyuv/include/libyuv/convert_argb.h
index 283ece952f1..e8ed1f59f33 100644
--- a/libs/libyuv/include/libyuv/convert_argb.h
+++ b/libs/libyuv/include/libyuv/convert_argb.h
@@ -256,6 +256,7 @@ int NV21ToARGB(const uint8_t* src_y,
 int height);
 // Convert NV12 to ABGR.
+LIBYUV_API
 int NV12ToABGR(const uint8_t* src_y,
 int src_stride_y,
 const uint8_t* src_uv,
@@ -298,6 +299,17 @@ int NV21ToRGB24(const uint8_t* src_y,
 int width,
 int height);
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+ int src_stride_y,
+ const uint8_t* src_vu,
+ int src_stride_vu,
+ uint8_t* dst_yuv24,
+ int dst_stride_yuv24,
+ int width,
+ int height);
+
 // Convert NV12 to RAW.
LIBYUV_API int NV12ToRAW(const uint8_t* src_y, @@ -627,8 +639,8 @@ int AR30ToAB30(const uint8_t* src_ar30, // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API -int MJPGToARGB(const uint8_t* src_mjpg, - size_t src_size_mjpg, +int MJPGToARGB(const uint8_t* sample, + size_t sample_size, uint8_t* dst_argb, int dst_stride_argb, int src_width, diff --git a/libs/libyuv/include/libyuv/convert_from_argb.h b/libs/libyuv/include/libyuv/convert_from_argb.h index 05c815a093e..08c6aa5649d 100644 --- a/libs/libyuv/include/libyuv/convert_from_argb.h +++ b/libs/libyuv/include/libyuv/convert_from_argb.h @@ -250,6 +250,28 @@ int ARGBToNV21(const uint8_t* src_argb, int width, int height); +// Convert ABGR To NV12. +LIBYUV_API +int ABGRToNV12(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert ABGR To NV21. +LIBYUV_API +int ABGRToNV21(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); + // Convert ARGB To NV21. LIBYUV_API int ARGBToNV21(const uint8_t* src_argb, diff --git a/libs/libyuv/include/libyuv/mjpeg_decoder.h b/libs/libyuv/include/libyuv/mjpeg_decoder.h index fecbedfb856..275f8d4c185 100644 --- a/libs/libyuv/include/libyuv/mjpeg_decoder.h +++ b/libs/libyuv/include/libyuv/mjpeg_decoder.h @@ -26,7 +26,7 @@ namespace libyuv { extern "C" { #endif -LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg_size); +LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size); #ifdef __cplusplus } // extern "C" diff --git a/libs/libyuv/include/libyuv/planar_functions.h b/libs/libyuv/include/libyuv/planar_functions.h index 91137baba25..5299fe2c0ef 100644 --- a/libs/libyuv/include/libyuv/planar_functions.h +++ b/libs/libyuv/include/libyuv/planar_functions.h @@ -105,6 +105,15 @@ void MergeUVPlane(const uint8_t* src_u, int width, int height); +// Swap U and V channels in interleaved UV plane. +LIBYUV_API +void SwapUVPlane(const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height); + // Split interleaved RGB plane into separate R, G and B planes. LIBYUV_API void SplitRGBPlane(const uint8_t* src_rgb, @@ -224,6 +233,19 @@ int UYVYToNV12(const uint8_t* src_uyvy, int width, int height); +// Convert NV21 to NV12. +LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + LIBYUV_API int YUY2ToY(const uint8_t* src_yuy2, int src_stride_yuy2, diff --git a/libs/libyuv/include/libyuv/rotate.h b/libs/libyuv/include/libyuv/rotate.h index 76b692be8b0..c64e0216d53 100644 --- a/libs/libyuv/include/libyuv/rotate.h +++ b/libs/libyuv/include/libyuv/rotate.h @@ -49,6 +49,24 @@ int I420Rotate(const uint8_t* src_y, int height, enum RotationMode mode); +// Rotate I444 frame. +LIBYUV_API +int I444Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum RotationMode mode); + // Rotate NV12 input and store in I420. 
LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, diff --git a/libs/libyuv/include/libyuv/row.h b/libs/libyuv/include/libyuv/row.h index da54f7d7aa1..890766ff3ff 100644 --- a/libs/libyuv/include/libyuv/row.h +++ b/libs/libyuv/include/libyuv/row.h @@ -275,6 +275,7 @@ extern "C" { #define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 +#define HAS_SWAPUVROW_SSSE3 #endif // The following are available for AVX2 gcc/clang x86 platforms: @@ -283,6 +284,8 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_ABGRTOAR30ROW_AVX2 +#define HAS_ABGRTOUVROW_AVX2 +#define HAS_ABGRTOYROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 @@ -295,6 +298,9 @@ extern "C" { #define HAS_I422TOYUY2ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 +#define HAS_SWAPUVROW_AVX2 +// TODO(fbarchard): Fix AVX2 version of YUV24 +// #define HAS_NV21TOYUV24ROW_AVX2 #endif // The following are available for AVX512 clang x86 platforms: @@ -330,6 +336,9 @@ extern "C" { #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON #define HAS_ARGBTOYROW_NEON +#define HAS_AYUVTOUVROW_NEON +#define HAS_AYUVTOVUROW_NEON +#define HAS_AYUVTOYROW_NEON #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_BYTETOFLOATROW_NEON @@ -355,6 +364,7 @@ extern "C" { #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TOARGBROW_NEON #define HAS_NV21TORGB24ROW_NEON +#define HAS_NV21TOYUV24ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTORGB24ROW_NEON #define HAS_RAWTOUVROW_NEON @@ -370,6 +380,7 @@ extern "C" { #define HAS_SETROW_NEON #define HAS_SPLITRGBROW_NEON #define HAS_SPLITUVROW_NEON +#define HAS_SWAPUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON @@ -815,6 +826,10 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, @@ -899,6 +914,8 @@ void UYVYToARGBRow_MSA(const uint8_t* src_uyvy, void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width); +void ABGRToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width); void ARGBToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -927,7 +944,7 @@ void ARGBToUV444Row_MSA(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_MSA(const uint8_t* src_argb0, +void ARGBToUVRow_MSA(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -936,7 +953,7 @@ void ARGBToUV444Row_MMI(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_MMI(const uint8_t* src_argb0, +void ARGBToUVRow_MMI(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -986,32 +1003,32 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_MSA(const uint8_t* 
src_rgb0, +void ARGBToUVJRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_MSA(const uint8_t* src_rgb0, +void BGRAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_MSA(const uint8_t* src_rgb0, +void ABGRToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_MSA(const uint8_t* src_rgb0, +void RGBAToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGB24ToUVRow_MSA(const uint8_t* src_rgb0, +void RGB24ToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RAWToUVRow_MSA(const uint8_t* src_rgb0, +void RAWToUVRow_MSA(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1026,32 +1043,32 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_MMI(const uint8_t* src_rgb0, +void ARGBToUVJRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_MMI(const uint8_t* src_rgb0, +void BGRAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_MMI(const uint8_t* src_rgb0, +void ABGRToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_MMI(const uint8_t* src_rgb0, +void RGBAToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGB24ToUVRow_MMI(const uint8_t* src_rgb0, +void RGB24ToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RAWToUVRow_MMI(const uint8_t* src_rgb0, +void RAWToUVRow_MMI(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -1083,29 +1100,29 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void BGRAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ABGRToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGBAToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGB24ToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RAWToYRow_MSA(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYRow_MSA(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555, uint8_t* dst_y, int width); -void BGRAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ABGRToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGBAToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGB24ToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RAWToYRow_MMI(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void BGRAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void 
RGBAToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYRow_MMI(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_MMI(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_MMI(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_MMI(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ARGBToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void BGRAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void ABGRToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGBAToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RGB24ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); -void RAWToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width); +void ARGBToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ARGBToYJRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void BGRAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void ABGRToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGBAToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RGB24ToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); +void RAWToYRow_C(const uint8_t* src_argb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); @@ -1156,37 +1173,42 @@ void ARGB4444ToYRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToUVRow_AVX2(const uint8_t* src_argb0, +void ARGBToUVRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, +void ABGRToUVRow_AVX2(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); +void ARGBToUVJRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, +void ARGBToUVJRow_SSSE3(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, +void BGRAToUVRow_SSSE3(const uint8_t* src_bgra, int src_stride_bgra, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, +void ABGRToUVRow_SSSE3(const uint8_t* src_abgr, int src_stride_abgr, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, +void RGBAToUVRow_SSSE3(const uint8_t* src_rgba, int src_stride_rgba, uint8_t* dst_u, uint8_t* dst_v, @@ -1196,6 +1218,11 @@ void ARGBToUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void ABGRToUVRow_Any_AVX2(const uint8_t* src_ptr, + int src_stride_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void ARGBToUVJRow_Any_AVX2(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_u, @@ -1383,47 +1410,47 @@ void ARGB4444ToUVRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_C(const uint8_t* 
src_rgb0, +void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb0, +void ARGBToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVRow_C(const uint8_t* src_rgb0, +void ARGBToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ARGBToUVJRow_C(const uint8_t* src_rgb0, +void ARGBToUVJRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void BGRAToUVRow_C(const uint8_t* src_rgb0, +void BGRAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void ABGRToUVRow_C(const uint8_t* src_rgb0, +void ABGRToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGBAToUVRow_C(const uint8_t* src_rgb0, +void RGBAToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RGB24ToUVRow_C(const uint8_t* src_rgb0, +void RGB24ToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width); -void RAWToUVRow_C(const uint8_t* src_rgb0, +void RAWToUVRow_C(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, @@ -2183,6 +2210,10 @@ void NV21ToRGB24Row_C(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void YUY2ToARGBRow_C(const uint8_t* src_yuy2, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, @@ -2349,6 +2380,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void NV12ToRGB565Row_AVX2(const uint8_t* src_y, const uint8_t* src_uv, uint8_t* dst_rgb565, @@ -2554,6 +2589,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3027,6 +3066,10 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, @@ -3344,6 +3387,40 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, int width); +void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void 
AYUVToVURow_C(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_vu, + int width); +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_vu, + int width); +void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width); +void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_uv, + int width); +void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, + int stride_ayuv, + uint8_t* dst_vu, + int width); void I422ToYUY2Row_C(const uint8_t* src_y, const uint8_t* src_u, diff --git a/libs/libyuv/include/libyuv/scale.h b/libs/libyuv/include/libyuv/scale.h index b937d348cab..23ba1634f78 100644 --- a/libs/libyuv/include/libyuv/scale.h +++ b/libs/libyuv/include/libyuv/scale.h @@ -97,6 +97,54 @@ int I420Scale_16(const uint16_t* src_y, int dst_height, enum FilterMode filtering); +// Scales a YUV 4:4:4 image from the src width and height to the +// dst width and height. +// If filtering is kFilterNone, a simple nearest-neighbor algorithm is +// used. This produces basic (blocky) quality at the fastest speed. +// If filtering is kFilterBilinear, interpolation is used to produce a better +// quality image, at the expense of speed. +// If filtering is kFilterBox, averaging is used to produce ever better +// quality image, at further expense of speed. +// Returns 0 if successful. + +LIBYUV_API +int I444Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); + +LIBYUV_API +int I444Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering); + #ifdef __cplusplus // Legacy API. Deprecated. LIBYUV_API diff --git a/libs/libyuv/include/libyuv/version.h b/libs/libyuv/include/libyuv/version.h index 522eb5224fa..b1883ef8e14 100644 --- a/libs/libyuv/include/libyuv/version.h +++ b/libs/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1724 +#define LIBYUV_VERSION 1735 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/libs/libyuv/infra/config/OWNERS b/libs/libyuv/infra/config/OWNERS deleted file mode 100644 index b61b29d6c25..00000000000 --- a/libs/libyuv/infra/config/OWNERS +++ /dev/null @@ -1,3 +0,0 @@ -set noparent -agable@chromium.org -phoglund@chromium.org diff --git a/libs/libyuv/infra/config/PRESUBMIT.py b/libs/libyuv/infra/config/PRESUBMIT.py deleted file mode 100644 index 89eaa5192c2..00000000000 --- a/libs/libyuv/infra/config/PRESUBMIT.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright 2018 The LibYuv Project Authors. All rights reserved. -# -# Use of this source code is governed by a BSD-style license -# that can be found in the LICENSE file in the root of the source -# tree. An additional intellectual property rights grant can be found -# in the file PATENTS. 
All contributing project authors may -# be found in the AUTHORS file in the root of the source tree. - - -def CheckChangeOnUpload(input_api, output_api): - return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api) - - -def CheckChangeOnCommit(input_api, output_api): - return input_api.canned_checks.CheckChangedLUCIConfigs(input_api, output_api) diff --git a/libs/libyuv/infra/config/README.md b/libs/libyuv/infra/config/README.md deleted file mode 100644 index c036d610c5f..00000000000 --- a/libs/libyuv/infra/config/README.md +++ /dev/null @@ -1 +0,0 @@ -This directory contains configuration files for infra services. diff --git a/libs/libyuv/infra/config/cq.cfg b/libs/libyuv/infra/config/cq.cfg deleted file mode 100644 index 04e25319051..00000000000 --- a/libs/libyuv/infra/config/cq.cfg +++ /dev/null @@ -1,50 +0,0 @@ -# Commit Queue configuration file. The documentation of the format can be found -# at http://luci-config.appspot.com/schemas/projects/refs:cq.cfg. - -version: 1 -cq_status_url: "https://chromium-cq-status.appspot.com" -git_repo_url: "https://chromium.googlesource.com/libyuv/libyuv.git" - -gerrit {} - -verifiers { - gerrit_cq_ability { - committer_list: "project-libyuv-committers" - dry_run_access_list: "project-libyuv-tryjob-access" - } - - try_job { - buckets { - name: "luci.libyuv.try" - builders { name: "win" } - builders { name: "win_rel" } - builders { name: "win_x64_rel" } - builders { name: "win_clang" } - builders { name: "win_clang_rel" } - builders { name: "win_x64_clang_rel" } - builders { name: "mac" } - builders { name: "mac_rel" } - builders { name: "mac_asan" } - builders { name: "ios" } - builders { name: "ios_rel" } - builders { name: "ios_arm64" } - builders { name: "ios_arm64_rel" } - builders { name: "linux" } - builders { name: "linux_rel" } - builders { - name: "linux_gcc" - experiment_percentage: 100 - } - builders { name: "linux_tsan2" } - builders { name: "linux_asan" } - builders { name: "linux_msan" } - builders { name: "linux_ubsan" } - builders { name: "linux_ubsan_vptr" } - builders { name: "android" } - builders { name: "android_rel" } - builders { name: "android_arm64" } - builders { name: "android_x86" } - builders { name: "android_x64" } - } - } -} diff --git a/libs/libyuv/source/compare.cc b/libs/libyuv/source/compare.cc index 5aa3a4db86d..7f4828104fe 100644 --- a/libs/libyuv/source/compare.cc +++ b/libs/libyuv/source/compare.cc @@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) { if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB. return FOURCC_BGRA; } - if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA. + if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA. return FOURCC_ARGB; } if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255. return FOURCC_BGRA; } - if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255. + if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255. return FOURCC_ARGB; } argb += 8; diff --git a/libs/libyuv/source/convert.cc b/libs/libyuv/source/convert.cc index 4b8d0dc57f7..614fa48241e 100644 --- a/libs/libyuv/source/convert.cc +++ b/libs/libyuv/source/convert.cc @@ -880,6 +880,144 @@ int UYVYToI420(const uint8_t* src_uyvy, return 0; } +// Convert AYUV to NV12. 
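+// AYUV here is a packed 4:4:4 format whose bytes are ordered V, U, Y, A per
+// pixel (see AYUVToYRow_C and AYUVToUVRow_C in row_common.cc later in this
+// patch). A minimal usage sketch, with hypothetical buffers; the source
+// stride is 4 bytes per pixel:
+//
+//   AYUVToNV12(src_ayuv, width * 4, dst_y, width,
+//              dst_uv, ((width + 1) / 2) * 2, width, height);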
+LIBYUV_API +int AYUVToNV12(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_uv, int width) = AYUVToUVRow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToUVRow = AYUVToUVRow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToUVRow = AYUVToUVRow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToUVRow = AYUVToUVRow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToUVRow = AYUVToUVRow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToUVRow = AYUVToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToUVRow = AYUVToUVRow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + AYUVToUVRow(src_ayuv, 0, dst_uv, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + +// Convert AYUV to NV21. +LIBYUV_API +int AYUVToNV21(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv, + uint8_t* dst_vu, int width) = AYUVToVURow_C; + void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) = + AYUVToYRow_C; + // Negative height means invert the image. 
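+  // (Pointing the source at its last row and negating the stride makes the
+  // reads run bottom-up while the writes still run top-down.)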
+ if (height < 0) { + height = -height; + src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv; + src_stride_ayuv = -src_stride_ayuv; + } +// place holders for future intel code +#if defined(HAS_AYUVTOYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + AYUVToVURow = AYUVToVURow_Any_SSE2; + AYUVToYRow = AYUVToYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + AYUVToVURow = AYUVToVURow_SSE2; + AYUVToYRow = AYUVToYRow_SSE2; + } + } +#endif +#if defined(HAS_AYUVTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AYUVToVURow = AYUVToVURow_Any_AVX2; + AYUVToYRow = AYUVToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + AYUVToVURow = AYUVToVURow_AVX2; + AYUVToYRow = AYUVToYRow_AVX2; + } + } +#endif + +#if defined(HAS_AYUVTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AYUVToYRow = AYUVToYRow_Any_NEON; + AYUVToVURow = AYUVToVURow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + AYUVToYRow = AYUVToYRow_NEON; + AYUVToVURow = AYUVToVURow_NEON; + } + } +#endif + + for (y = 0; y < height - 1; y += 2) { + AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width); + AYUVToYRow(src_ayuv, dst_y, width); + AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width); + src_ayuv += src_stride_ayuv * 2; + dst_y += dst_stride_y * 2; + dst_vu += dst_stride_vu; + } + if (height & 1) { + AYUVToVURow(src_ayuv, 0, dst_vu, width); + AYUVToYRow(src_ayuv, dst_y, width); + } + return 0; +} + // Convert ARGB to I420. LIBYUV_API int ARGBToI420(const uint8_t* src_argb, @@ -1446,6 +1584,155 @@ int RGB24ToI420(const uint8_t* src_rgb24, return 0; } +// TODO(fbarchard): Use Matrix version to implement I420 and J420. +// Convert RGB24 to J420. +LIBYUV_API +int RGB24ToJ420(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + int y; +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, + uint8_t* dst_u, uint8_t* dst_v, int width) = + RGB24ToUVJRow_C; + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) = + RGB24ToYJRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ARGBToUVJRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) = + ARGBToYJRow_C; +#endif + if (!src_rgb24 || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. 
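+// (MSA and MMI also convert directly; all other platforms expand each row to
+// ARGB in a scratch buffer first - see the #else branch below.)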
+#if defined(HAS_RGB24TOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON; + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVJRow = RGB24ToUVJRow_NEON; + } + } + } +#elif defined(HAS_RGB24TOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_MSA; + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + RGB24ToUVJRow = RGB24ToUVJRow_MSA; + } + } +#elif defined(HAS_RGB24TOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI; + RGB24ToYJRow = RGB24ToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_MMI; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVJRow = RGB24ToUVJRow_MMI; + } + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3; + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToUVJRow = ARGBToUVJRow_SSSE3; + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVJRow = ARGBToUVJRow_Any_AVX2; + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVJRow = ARGBToUVJRow_AVX2; + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + // Allocate 2 rows of ARGB. + const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); + RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToUVJRow(row, kRowSize, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); + ARGBToYJRow(row + kRowSize, dst_y + dst_stride_y, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_y += dst_stride_y * 2; + dst_u += dst_stride_u; + dst_v += dst_stride_v; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); + RGB24ToYJRow(src_rgb24, dst_y, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToUVJRow(row, 0, dst_u, dst_v, width); + ARGBToYJRow(row, dst_y, width); +#endif + } +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + // Convert RAW to I420. LIBYUV_API int RAWToI420(const uint8_t* src_raw, @@ -2082,6 +2369,124 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, return 0; } +// Convert RGB24 to J400. 
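+// The J suffix denotes full-range BT.601 ("JPEG") YUV, so this produces a
+// full-range luma plane. A minimal usage sketch, with hypothetical buffers;
+// the RGB24 stride is 3 bytes per pixel:
+//
+//   RGB24ToJ400(src_rgb24, width * 3, dst_yj, width, width, height);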
+LIBYUV_API +int RGB24ToJ400(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { + int y; +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) = + RGB24ToYJRow_C; +#else + void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) = + RGB24ToARGBRow_C; + void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_yj, int width) = + ARGBToYJRow_C; +#endif + if (!src_rgb24 || !dst_yj || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } + +// Neon version does direct RGB24 to YUV. +#if defined(HAS_RGB24TOYJROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToYJRow = RGB24ToYJRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_NEON; + } + } +#elif defined(HAS_RGB24TOYJROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + RGB24ToYJRow = RGB24ToYJRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + RGB24ToYJRow = RGB24ToYJRow_MSA; + } + } +#elif defined(HAS_RGB24TOYJROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + RGB24ToYJRow = RGB24ToYJRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + RGB24ToYJRow = RGB24ToYJRow_MMI; + } + } +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else +#if defined(HAS_RGB24TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24ToARGBRow = RGB24ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif +#endif + + { +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + // Allocate 2 rows of ARGB. 
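+    // (Each row is padded to a multiple of 32 bytes so the second scratch
+    // row stays aligned for the SIMD row functions.)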
+ const int kRowSize = (width * 4 + 31) & ~31; + align_buffer_64(row, kRowSize * 2); +#endif + + for (y = 0; y < height - 1; y += 2) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToYJRow(src_rgb24, dst_yj, width); + RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_yj + dst_stride_yj, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width); + ARGBToYJRow(row, dst_yj, width); + ARGBToYJRow(row + kRowSize, dst_yj + dst_stride_yj, width); +#endif + src_rgb24 += src_stride_rgb24 * 2; + dst_yj += dst_stride_yj * 2; + } + if (height & 1) { +#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + RGB24ToYJRow(src_rgb24, dst_yj, width); +#else + RGB24ToARGBRow(src_rgb24, row, width); + ARGBToYJRow(row, dst_yj, width); +#endif + } +#if !(defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \ + defined(HAS_RGB24TOYJROW_MMI)) + free_aligned_buffer_64(row); +#endif + } + return 0; +} + static void SplitPixels(const uint8_t* src_u, int src_pixel_stride_uv, uint8_t* dst_u, diff --git a/libs/libyuv/source/convert_argb.cc b/libs/libyuv/source/convert_argb.cc index b376a0f3876..540503330dc 100644 --- a/libs/libyuv/source/convert_argb.cc +++ b/libs/libyuv/source/convert_argb.cc @@ -1793,8 +1793,9 @@ int NV21ToARGB(const uint8_t* src_y, } // Convert NV12 to ABGR. -// To output ABGR instead of ARGB swap the UV and use a mirrrored yuc matrix. +// To output ABGR instead of ARGB swap the UV and use a mirrored yuv matrix. // To swap the UV use NV12 instead of NV21.LIBYUV_API +LIBYUV_API int NV12ToABGR(const uint8_t* src_y, int src_stride_y, const uint8_t* src_uv, @@ -1998,6 +1999,54 @@ int NV21ToRAW(const uint8_t* src_y, dst_stride_raw, &kYvuI601Constants, width, height); } +// Convert NV21 to YUV24 +int NV21ToYUV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_yuv24, + int dst_stride_yuv24, + int width, + int height) { + int y; + void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu, + uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C; + if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24; + dst_stride_yuv24 = -dst_stride_yuv24; + } +#if defined(HAS_NV21TOYUV24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON; + if (IS_ALIGNED(width, 16)) { + NV21ToYUV24Row = NV21ToYUV24Row_NEON; + } + } +#endif +#if defined(HAS_NV21TOYUV24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + NV21ToYUV24Row = NV21ToYUV24Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width); + dst_yuv24 += dst_stride_yuv24; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + // Convert M420 to ARGB. 
LIBYUV_API int M420ToARGB(const uint8_t* src_m420, diff --git a/libs/libyuv/source/convert_from_argb.cc b/libs/libyuv/source/convert_from_argb.cc index 1b070c10f75..b7b5643a188 100644 --- a/libs/libyuv/source/convert_from_argb.cc +++ b/libs/libyuv/source/convert_from_argb.cc @@ -572,6 +572,326 @@ int ARGBToNV21(const uint8_t* src_argb, return 0; } +LIBYUV_API +int ABGRToNV12(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_uv, int width) = MergeUVRow_C; + if (!src_abgr || !dst_y || !dst_uv || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToYRow = ABGRToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_MMI; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToUVRow = ABGRToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } + } +#endif +#if defined(HAS_MERGEUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + MergeUVRow_ = MergeUVRow_Any_SSE2; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_SSE2; + } + } +#endif +#if defined(HAS_MERGEUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + MergeUVRow_ = MergeUVRow_Any_AVX2; + if (IS_ALIGNED(halfwidth, 32)) { + MergeUVRow_ = MergeUVRow_AVX2; + } + } +#endif +#if defined(HAS_MERGEUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + MergeUVRow_ = MergeUVRow_Any_NEON; + if (IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_NEON; + } + } +#endif +#if defined(HAS_MERGEUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + MergeUVRow_ = MergeUVRow_Any_MSA; + if 
(IS_ALIGNED(halfwidth, 16)) { + MergeUVRow_ = MergeUVRow_MSA; + } + } +#endif +#if defined(HAS_MERGEUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + MergeUVRow_ = MergeUVRow_Any_MMI; + if (IS_ALIGNED(halfwidth, 8)) { + MergeUVRow_ = MergeUVRow_MMI; + } + } +#endif + { + // Allocate a rows of uv. + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8_t* row_v = row_u + ((halfwidth + 31) & ~31); + + for (y = 0; y < height - 1; y += 2) { + ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width); + src_abgr += src_stride_abgr * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + ABGRToUVRow(src_abgr, 0, row_u, row_v, width); + MergeUVRow_(row_u, row_v, dst_uv, halfwidth); + ABGRToYRow(src_abgr, dst_y, width); + } + free_aligned_buffer_64(row_u); + } + return 0; +} + +// Same as NV12 but U and V swapped. +LIBYUV_API +int ABGRToNV21(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_vu, + int dst_stride_vu, + int width, + int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*ABGRToUVRow)(const uint8_t* src_abgr0, int src_stride_abgr, + uint8_t* dst_u, uint8_t* dst_v, int width) = + ABGRToUVRow_C; + void (*ABGRToYRow)(const uint8_t* src_abgr, uint8_t* dst_y, int width) = + ABGRToYRow_C; + void (*MergeUVRow_)(const uint8_t* src_u, const uint8_t* src_v, + uint8_t* dst_vu, int width) = MergeUVRow_C; + if (!src_abgr || !dst_y || !dst_vu || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } +#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToUVRow = ABGRToUVRow_Any_SSSE3; + ABGRToYRow = ABGRToYRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_SSSE3; + ABGRToYRow = ABGRToYRow_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif +#if defined(HAS_ABGRTOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToYRow = ABGRToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ABGRToUVRow = ABGRToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_NEON; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToYRow = ABGRToYRow_Any_MSA; + if (IS_ALIGNED(width, 16)) { + ABGRToYRow = ABGRToYRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ABGRToUVRow = ABGRToUVRow_Any_MSA; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_MSA; + } + } +#endif +#if defined(HAS_ABGRTOYROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToYRow = ABGRToYRow_Any_MMI; + if (IS_ALIGNED(width, 8)) { + ABGRToYRow = ABGRToYRow_MMI; + } + } +#endif +#if defined(HAS_ABGRTOUVROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ABGRToUVRow = ABGRToUVRow_Any_MMI; + if (IS_ALIGNED(width, 16)) { + ABGRToUVRow = ABGRToUVRow_MMI; + } + } +#endif + +#if 
defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2)) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MSA)
+ if (TestCpuFlag(kCpuHasMSA)) {
+ MergeUVRow_ = MergeUVRow_Any_MSA;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_MSA;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_MMI)
+ if (TestCpuFlag(kCpuHasMMI)) {
+ MergeUVRow_ = MergeUVRow_Any_MMI;
+ if (IS_ALIGNED(halfwidth, 8)) {
+ MergeUVRow_ = MergeUVRow_MMI;
+ }
+ }
+#endif
+ {
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2);
+ uint8_t* row_v = row_u + ((halfwidth + 31) & ~31);
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_vu += dst_stride_vu;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_vu, halfwidth);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ }
+ return 0;
+}
+
 // Convert ARGB to YUY2.
 LIBYUV_API
 int ARGBToYUY2(const uint8_t* src_argb,
diff --git a/libs/libyuv/source/mjpeg_decoder.cc b/libs/libyuv/source/mjpeg_decoder.cc
index eaf2530130b..80e381dd6b0 100644
--- a/libs/libyuv/source/mjpeg_decoder.cc
+++ b/libs/libyuv/source/mjpeg_decoder.cc
@@ -25,7 +25,8 @@
 #endif
 #endif
-struct FILE; // For jpeglib.h.
+
+#include <stdio.h>  // For jpeglib.h.
 // C++ build requires extern C for jpeg internals.
#ifdef __cplusplus
@@ -427,7 +428,15 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
 }
 void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
- cinfo->src->next_input_byte += num_bytes;
+ jpeg_source_mgr* src = cinfo->src;
+ size_t bytes = static_cast<size_t>(num_bytes);
+ if (bytes > src->bytes_in_buffer) {
+ src->next_input_byte = nullptr;
+ src->bytes_in_buffer = 0;
+ } else {
+ src->next_input_byte += bytes;
+ src->bytes_in_buffer -= bytes;
+ }
 }
 void term_source(j_decompress_ptr cinfo) {
diff --git a/libs/libyuv/source/mjpeg_validate.cc b/libs/libyuv/source/mjpeg_validate.cc
index d22b62df0a5..ba0a03ab9e5 100644
--- a/libs/libyuv/source/mjpeg_validate.cc
+++ b/libs/libyuv/source/mjpeg_validate.cc
@@ -47,7 +47,8 @@ LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg) {
 // ERROR: Invalid jpeg size: src_size_mjpg
 return LIBYUV_FALSE;
 }
- if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8) { // SOI marker
+ // SOI marker
+ if (src_mjpg[0] != 0xff || src_mjpg[1] != 0xd8 || src_mjpg[2] != 0xff) {
 // ERROR: Invalid jpeg initial start code
 return LIBYUV_FALSE;
 }
diff --git a/libs/libyuv/source/planar_functions.cc b/libs/libyuv/source/planar_functions.cc
index b49bf0a0b4b..5a9d56d88a6 100644
--- a/libs/libyuv/source/planar_functions.cc
+++ b/libs/libyuv/source/planar_functions.cc
@@ -440,7 +440,6 @@ void MergeUVPlane(const uint8_t* src_u,
 int y;
 void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
 uint8_t* dst_uv, int width) = MergeUVRow_C;
- // Coalesce rows.
 // Negative height means invert the image.
 if (height < 0) {
 height = -height;
@@ -504,6 +503,87 @@ void MergeUVPlane(const uint8_t* src_u,
 }
 }
+// Swap U and V channels in interleaved UV plane.
+LIBYUV_API
+void SwapUVPlane(const uint8_t* src_uv,
+ int src_stride_uv,
+ uint8_t* dst_vu,
+ int dst_stride_vu,
+ int width,
+ int height) {
+ int y;
+ void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
+ SwapUVRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uv = src_uv + (height - 1) * src_stride_uv;
+ src_stride_uv = -src_stride_uv;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == width * 2 && dst_stride_vu == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_uv = dst_stride_vu = 0;
+ }
+
+#if defined(HAS_SWAPUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3)) {
+ SwapUVRow = SwapUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ SwapUVRow = SwapUVRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ SwapUVRow = SwapUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SWAPUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SwapUVRow = SwapUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ SwapUVRow = SwapUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ SwapUVRow(src_uv, dst_vu, width);
+ src_uv += src_stride_uv;
+ dst_vu += dst_stride_vu;
+ }
+}
+
+// Convert NV21 to NV12.
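+// Implemented as an optional Y-plane copy plus a byte-pair swap of the
+// interleaved chroma plane. A minimal usage sketch, with hypothetical
+// buffers; dst_y may be NULL to convert only the chroma plane:
+//
+//   int uv_stride = ((width + 1) / 2) * 2;
+//   NV21ToNV12(src_y, width, src_vu, uv_stride,
+//              dst_y, width, dst_uv, uv_stride, width, height);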
+LIBYUV_API +int NV21ToNV12(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_vu, + int src_stride_vu, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + int halfwidth = (width + 1) >> 1; + int halfheight = (height + 1) >> 1; + if (!src_vu || !dst_uv || width <= 0 || height == 0) { + return -1; + } + if (dst_y) { + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + } + SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, + halfheight); + return 0; +} + // Support function for NV12 etc RGB channels. // Width and height are plane sizes (typically half pixel width). LIBYUV_API diff --git a/libs/libyuv/source/rotate.cc b/libs/libyuv/source/rotate.cc index f28a06d38a9..d414186a5a0 100644 --- a/libs/libyuv/source/rotate.cc +++ b/libs/libyuv/source/rotate.cc @@ -481,6 +481,66 @@ int I420Rotate(const uint8_t* src_y, return -1; } +LIBYUV_API +int I444Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + enum libyuv::RotationMode mode) { + if (!src_y || !src_u || !src_v || width <= 0 || height == 0 || !dst_y || + !dst_u || !dst_v) { + return -1; + } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (height - 1) * src_stride_u; + src_v = src_v + (height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + switch (mode) { + case libyuv::kRotate0: + // copy frame + CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case libyuv::kRotate90: + RotatePlane90(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane90(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane90(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case libyuv::kRotate270: + RotatePlane270(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane270(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane270(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + case libyuv::kRotate180: + RotatePlane180(src_y, src_stride_y, dst_y, dst_stride_y, width, height); + RotatePlane180(src_u, src_stride_u, dst_u, dst_stride_u, width, height); + RotatePlane180(src_v, src_stride_v, dst_v, dst_stride_v, width, height); + return 0; + default: + break; + } + return -1; +} + LIBYUV_API int NV12ToI420Rotate(const uint8_t* src_y, int src_stride_y, diff --git a/libs/libyuv/source/row_any.cc b/libs/libyuv/source/row_any.cc index 031a8f6490e..9fafff60205 100644 --- a/libs/libyuv/source/row_any.cc +++ b/libs/libyuv/source/row_any.cc @@ -286,7 +286,12 @@ ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15) #ifdef HAS_MERGEUVROW_MMI ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) #endif - +#ifdef HAS_NV21TOYUV24ROW_NEON +ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) +#endif +#ifdef HAS_NV21TOYUV24ROW_AVX2 +ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) +#endif // Math functions. 
#ifdef HAS_ARGBMULTIPLYROW_SSE2 ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) @@ -575,6 +580,9 @@ ANY11(RAWToRGB24Row_Any_MMI, RAWToRGB24Row_MMI, 0, 3, 3, 3) #ifdef HAS_ARGBTOYROW_AVX2 ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif +#ifdef HAS_ABGRTOYROW_AVX2 +ANY11(ABGRToYRow_Any_AVX2, ABGRToYRow_AVX2, 0, 4, 1, 31) +#endif #ifdef HAS_ARGBTOYJROW_AVX2 ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif @@ -702,6 +710,18 @@ ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31) #ifdef HAS_UYVYTOYROW_MMI ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) #endif +#ifdef HAS_AYUVTOYROW_NEON +ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) +#endif +#ifdef HAS_SWAPUVROW_SSSE3 +ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) +#endif +#ifdef HAS_SWAPUVROW_AVX2 +ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) +#endif +#ifdef HAS_SWAPUVROW_NEON +ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) +#endif #ifdef HAS_RGB24TOARGBROW_NEON ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif @@ -1256,6 +1276,9 @@ ANY13(SplitRGBRow_Any_MMI, SplitRGBRow_MMI, 3, 3) #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif +#ifdef HAS_ABGRTOUVROW_AVX2 +ANY12S(ABGRToUVRow_Any_AVX2, ABGRToUVRow_AVX2, 0, 4, 31) +#endif #ifdef HAS_ARGBTOUVJROW_AVX2 ANY12S(ARGBToUVJRow_Any_AVX2, ARGBToUVJRow_AVX2, 0, 4, 31) #endif @@ -1381,6 +1404,37 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15) #endif #undef ANY12S +// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane. +// 128 byte row allows for 32 avx ARGB pixels. +#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP, \ + BPP); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP); \ + } \ + ANY_SIMD(temp, 128, temp + 256, MASK + 1); \ + memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2); \ + } + +#ifdef HAS_AYUVTOVUROW_NEON +ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15) +ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15) +#endif +#undef ANY11S + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/libs/libyuv/source/row_common.cc b/libs/libyuv/source/row_common.cc index 2bbc5adbf14..9fa6e34ce52 100644 --- a/libs/libyuv/source/row_common.cc +++ b/libs/libyuv/source/row_common.cc @@ -3231,6 +3231,107 @@ void GaussCol_C(const uint16_t* src0, } } +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_C(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + dst_yuv24[0] = src_vu[0]; // V + dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[2] = src_y[0]; // Y0 + dst_yuv24[3] = src_vu[0]; // V + dst_yuv24[4] = src_vu[1]; // U + dst_yuv24[5] = src_y[1]; // Y1 + src_y += 2; + src_vu += 2; + dst_yuv24 += 6; // Advance 2 pixels. 
+ } + if (width & 1) { + dst_yuv24[0] = src_vu[0]; // V + dst_yuv24[1] = src_vu[1]; // U + dst_yuv24[2] = src_y[0]; // Y0 + } +} + +// Filter 2 rows of AYUV UV's (444) into UV (420). +void AYUVToUVRow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + // Output a row of UV values, filtering 2x2 rows of AYUV. + int x; + for (x = 0; x < width; x += 2) { + dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + src_ayuv += 8; + dst_uv += 2; + } + if (width & 1) { + dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } +} + +// Filter 2 rows of AYUV UV's (444) into VU (420). +void AYUVToVURow_C(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + // Output a row of VU values, filtering 2x2 rows of AYUV. + int x; + for (x = 0; x < width; x += 2) { + dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 4] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 5] + 2) >> + 2; + src_ayuv += 8; + dst_vu += 2; + } + if (width & 1) { + dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + + src_ayuv[src_stride_ayuv + 0] + 2) >> + 2; + dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + + src_ayuv[src_stride_ayuv + 1] + 2) >> + 2; + } +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + // Output a row of Y values. + int x; + for (x = 0; x < width; ++x) { + dst_y[x] = src_ayuv[2]; // v,u,y,a + src_ayuv += 4; + } +} + +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + int x; + for (x = 0; x < width; ++x) { + uint8_t u = src_uv[0]; + uint8_t v = src_uv[1]; + dst_vu[0] = v; + dst_vu[1] = u; + src_uv += 2; + dst_vu += 2; + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/libs/libyuv/source/row_gcc.cc b/libs/libyuv/source/row_gcc.cc index 8d3cb81cec2..ae672d625c1 100644 --- a/libs/libyuv/source/row_gcc.cc +++ b/libs/libyuv/source/row_gcc.cc @@ -1154,6 +1154,48 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { } #endif // HAS_ARGBTOYROW_AVX2 +#ifdef HAS_ABGRTOYROW_AVX2 +// Convert 32 ABGR pixels (128 bytes) to 32 Y values. +void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm4 \n" + "vbroadcastf128 %4,%%ymm5 \n" + "vmovdqu %5,%%ymm6 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n" + "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates. + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" + "vpsrlw $0x7,%%ymm0,%%ymm0 \n" + "vpsrlw $0x7,%%ymm2,%%ymm2 \n" + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates. + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate. 
+ "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16), // %4 + "m"(kPermdARGBToY_AVX) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif // HAS_ABGRTOYROW_AVX2 + + #ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1328,6 +1370,69 @@ void ARGBToUVRow_AVX2(const uint8_t* src_argb0, } #endif // HAS_ARGBTOUVROW_AVX2 +#ifdef HAS_ABGRTOUVROW_AVX2 +void ABGRToUVRow_AVX2(const uint8_t* src_abgr0, + int src_stride_abgr, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vbroadcastf128 %5,%%ymm5 \n" + "vbroadcastf128 %6,%%ymm6 \n" + "vbroadcastf128 %7,%%ymm7 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vmovdqu 0x40(%0),%%ymm2 \n" + "vmovdqu 0x60(%0),%%ymm3 \n" + "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n" + "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n" + "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n" + "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n" + "lea 0x80(%0),%0 \n" + "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" + "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" + "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" + "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" + "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" + "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" + "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" + "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" + "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" + "vpsraw $0x8,%%ymm1,%%ymm1 \n" + "vpsraw $0x8,%%ymm0,%%ymm0 \n" + "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %8,%%ymm0,%%ymm0 \n" + "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + + "vextractf128 $0x0,%%ymm0,(%1) \n" + "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"((intptr_t)(src_stride_abgr)), // %4 + "m"(kAddUV128), // %5 + "m"(kABGRToV), // %6 + "m"(kABGRToU), // %7 + "m"(kShufARGBToUV_AVX) // %8 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif // HAS_ABGRTOUVROW_AVX2 + #ifdef HAS_ARGBTOUVJROW_AVX2 void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, int src_stride_argb, @@ -5238,7 +5343,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0, , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); + ); } #endif // HAS_ARGBMULTIPLYROW_AVX2 @@ -6120,24 +6225,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu 
%%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6156,24 +6261,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6192,27 +6297,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -6231,27 +6336,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, int width) { asm volatile( - "sub %1,%2 \n" + "sub %1,%2 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // 
%2 @@ -6669,6 +6774,186 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#ifdef HAS_NV21TOYUV24ROW_AVX2 + +// begin NV21ToYUV24Row_C avx2 constants +static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, + 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, + 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00}; + +static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80}; + +static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, + 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, + 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00}; + +static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, + 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05, + 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, + 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05}; + +static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, + 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, + 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, + 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80}; + +static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, + 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, + 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, + 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f}; + +static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, + 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80, + 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, + 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80}; + +static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, + 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, + 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, + 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a}; + +static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, + 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, + 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, + 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80}; + +// NV21ToYUV24Row_AVX2 +void NV21ToYUV24Row_AVX2(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + uint8_t* src_y_ptr; + uint64_t src_offset = 0; + uint64_t width64; + + width64 = width; + src_y_ptr = (uint8_t*)src_y; + + asm volatile( + "vmovdqu %5, %%ymm0 \n" // init blend value + "vmovdqu %6, %%ymm1 \n" // init blend value + "vmovdqu %7, %%ymm2 \n" // init blend value + // "sub $0x20, %3 \n" //sub 32 from width for final loop + + LABELALIGN + "1: \n" // label 1 + "vmovdqu (%0,%4), %%ymm3 \n" // src_y + "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 + "vmovdqu (%1), %%ymm5 \n" // src_uv + "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf + "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for + // shuf + "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for + // shuf + "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf + "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for + // shuf + "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 + "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 + "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 + "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 + "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const + "vpor %%ymm4, %%ymm3, 
%%ymm5 \n" // get results + "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h + "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results + "add $0x20, %4 \n" // add to src buffer + // ptr + "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert + "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert + "vmovdqu %%ymm4, (%2) \n" // store dst_yuv + "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h + "add $0x60,%2 \n" // add to dst buffer + // ptr + // "cmp %3, %4 \n" //(width64 - + // 32 bytes) and src_offset + "sub $0x20,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" // sse-avx2 + // transitions + + : "+r"(src_y), //%0 + "+r"(src_vu), //%1 + "+r"(dst_yuv24), //%2 + "+r"(width64), //%3 + "+r"(src_offset) //%4 + : "m"(kBLEND0), //%5 + "m"(kBLEND1), //%6 + "m"(kBLEND2), //%7 + "m"(kSHUF0), //%8 + "m"(kSHUF1), //%9 + "m"(kSHUF2), //%10 + "m"(kSHUF3), //%11 + "m"(kSHUF4), //%12 + "m"(kSHUF5) //%13 + : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", + "xmm13", "xmm14", "xmm15"); +} +#endif // HAS_NV21TOYUV24ROW_AVX2 + +#ifdef HAS_SWAPUVROW_SSSE3 + +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + + "movdqu %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_SSSE3 + +#ifdef HAS_SWAPUVROW_AVX2 +void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_AVX2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/libs/libyuv/source/row_neon.cc b/libs/libyuv/source/row_neon.cc index ff87e74c62c..a9dc4e46b7e 100644 --- a/libs/libyuv/source/row_neon.cc +++ b/libs/libyuv/source/row_neon.cc @@ -561,7 +561,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -582,7 +582,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
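// ---------------------------------------------------------------------------
// Editor's note (not part of the patch): in SwapUVRow_SSSE3/_AVX2 above, the
// pshufb/vpshufb mask kShuffleUVToVU selects output byte i from input byte
// mask[i], so the pattern {1,0, 3,2, 5,4, ...} exchanges every adjacent U/V
// byte pair in a single shuffle per 16 bytes. A scalar restatement of the
// same per-register transform (helper name is illustrative):

static void SwapUVPairs16(const uint8_t src_uv[16], uint8_t dst_vu[16]) {
  for (int i = 0; i < 16; i += 2) {
    dst_vu[i + 0] = src_uv[i + 1];  // V moves to the front of the pair,
    dst_vu[i + 1] = src_uv[i + 0];  // U to the back, matching SwapUVRow_C.
  }
}
// ---------------------------------------------------------------------------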
@@ -607,7 +607,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "d0", "d1", "d2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -632,7 +632,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. @@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. @@ -761,7 +761,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -778,7 +778,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -795,7 +795,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -826,7 +826,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -872,7 +872,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -901,7 +901,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -919,7 +919,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -935,7 +935,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -950,7 +950,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -965,7 +965,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -985,7 +985,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1005,7 +1005,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1032,7 +1032,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", 
"d5", "d6", "d7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1059,7 +1059,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. @@ -1081,7 +1081,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "q0", "q1", "q2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1241,7 +1241,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -2564,7 +2564,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2), // %5 "r"(6) // %6 : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2601,7 +2601,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1), // %4 "r"(6) // %5 : "cc", "memory", "q0", "q1" // Clobber List - ); + ); } // %y passes a float as a scalar vector for vector * scalar multiply. @@ -2685,6 +2685,205 @@ void ByteToFloatRow_NEON(const uint8_t* src, : "cc", "memory", "q1", "q2", "q3"); } +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussCol_NEON(const uint16_t* src0, + const uint16_t* src1, + const uint16_t* src2, + const uint16_t* src3, + const uint16_t* src4, + uint32_t* dst, + int width) { + asm volatile( + "vmov.u16 d6, #4 \n" // constant 4 + "vmov.u16 d7, #6 \n" // constant 6 + + "1: \n" + "vld1.16 {q1}, [%0]! \n" // load 8 samples, 5 rows + "vld1.16 {q2}, [%4]! \n" + "vaddl.u16 q0, d2, d4 \n" // * 1 + "vaddl.u16 q1, d3, d5 \n" // * 1 + "vld1.16 {q2}, [%1]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "vld1.16 {q2}, [%2]! \n" + "vmlal.u16 q0, d4, d7 \n" // * 6 + "vmlal.u16 q1, d5, d7 \n" // * 6 + "vld1.16 {q2}, [%3]! \n" + "vmlal.u16 q0, d4, d6 \n" // * 4 + "vmlal.u16 q1, d5, d6 \n" // * 4 + "subs %6, %6, #8 \n" // 8 processed per loop + "vst1.32 {q0, q1}, [%5]! \n" // store 8 samples + "bgt 1b \n" + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. +void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { + const uint32_t* src1 = src + 1; + const uint32_t* src2 = src + 2; + const uint32_t* src3 = src + 3; + asm volatile( + "vmov.u32 q10, #4 \n" // constant 4 + "vmov.u32 q11, #6 \n" // constant 6 + + "1: \n" + "vld1.32 {q0, q1}, [%0]! \n" // load 12 source samples + "vld1.32 {q2}, [%0] \n" + "vadd.u32 q0, q0, q1 \n" // * 1 + "vadd.u32 q1, q1, q2 \n" // * 1 + "vld1.32 {q2, q3}, [%2]! \n" + "vmla.u32 q0, q2, q11 \n" // * 6 + "vmla.u32 q1, q3, q11 \n" // * 6 + "vld1.32 {q2, q3}, [%1]! \n" + "vld1.32 {q8, q9}, [%3]! \n" + "vadd.u32 q2, q2, q8 \n" // add rows for * 4 + "vadd.u32 q3, q3, q9 \n" + "vmla.u32 q0, q2, q10 \n" // * 4 + "vmla.u32 q1, q3, q10 \n" // * 4 + "subs %5, %5, #8 \n" // 8 processed per loop + "vqshrn.u32 d0, q0, #8 \n" // round and pack + "vqshrn.u32 d1, q1, #8 \n" + "vst1.u16 {q0}, [%4]! 
\n" // store 8 samples + "bgt 1b \n" + : "+r"(src), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(dst), // %4 + "+r"(width) // %5 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"); +} + +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q2}, [%0]! \n" // load 16 Y values + "vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values + "vmov d1, d0 \n" + "vzip.u8 d0, d1 \n" // VV + "vmov d3, d2 \n" + "vzip.u8 d2, d3 \n" // UU + "subs %3, %3, #16 \n" // 16 pixels per loop + "vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels + "vst3.8 {d1, d3, d5}, [%2]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d1, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d0, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + asm volatile( + "add %1, %0, %1 \n" // src_stride + src_AYUV + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV + // pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV + // pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV + // pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d1, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +} + +// Copy row of AYUV Y's into Y. +// Similar to ARGBExtractAlphaRow_NEON +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +// Convert UV plane of NV12 to VU of NV21. 
+void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values + "vld2.8 {d1, d3}, [%0]! \n" + "vorr.u8 q2, q0, q0 \n" // move U after V + "subs %2, %2, #16 \n" // 16 pixels per loop + "vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels + "bgt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus diff --git a/libs/libyuv/source/row_neon64.cc b/libs/libyuv/source/row_neon64.cc index 24b4520babc..e70953f299c 100644 --- a/libs/libyuv/source/row_neon64.cc +++ b/libs/libyuv/source/row_neon64.cc @@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 U's and V's and writes out 16 pairs of UV. @@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u, "+r"(width) // %3 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b. @@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time @@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "+r"(width) // %4 : // Input registers : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } // Copy multiple of 32. @@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "+r"(width) // %2 // Output registers : // Input registers : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } // SetRow writes 'width' bytes using an 8 bit value repeated. 
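// ---------------------------------------------------------------------------
// Editor's note (not part of the patch): GaussCol_NEON/GaussRow_NEON added to
// row_neon.cc above implement a separable 5-tap Gaussian with integer taps
// 1,4,6,4,1 (sum 16). The column pass widens 16-bit samples to 32 bits; the
// row pass narrows back with a shift by 8, since the combined 2-D kernel gain
// is 16 * 16 = 256. A scalar sketch of one output sample (names are
// illustrative; the NEON code additionally saturates when narrowing):

#include <stdint.h>

static uint16_t GaussSample(const uint16_t* rows[5], int x) {
  uint32_t col[5];
  for (int i = 0; i < 5; ++i) {  // Vertical 1,4,6,4,1 at columns x .. x+4.
    col[i] = rows[0][x + i] + 4 * rows[1][x + i] + 6 * rows[2][x + i] +
             4 * rows[3][x + i] + rows[4][x + i];
  }
  // Horizontal 1,4,6,4,1, then scale back by the kernel gain of 256.
  return (uint16_t)(
      (col[0] + 4 * col[1] + 6 * col[2] + 4 * col[3] + col[4]) >> 8);
}
// ---------------------------------------------------------------------------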
@@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { @@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } #define RGB565TOARGB \ @@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List - ); + ); } #define ARGB1555TOARGB \ @@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } #define ARGB4444TOARGB \ @@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRGB24Row_NEON(const uint8_t* src_argb, @@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List - ); + ); } void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { @@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { "+r"(width) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List - ); + ); } void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { @@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { @@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { "+r"(width) // %2 : : "cc", "memory", "v0", "v1" // Clobber List - ); + ); } void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, @@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, @@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, "+r"(width) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, @@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } void UYVYToUVRow_NEON(const uint8_t* src_uyvy, @@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List - ); + ); } // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
@@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List - ); + ); } void I422ToYUY2Row_NEON(const uint8_t* src_y, @@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, "+r"(width) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" - ); + ); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0, : "r"(2LL), // %5 "r"(6LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // SobelY as a matrix is @@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0, : "r"(1LL), // %4 "r"(6LL) // %5 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Caveat - rounds float to half float whereas scaling version truncates. @@ -2876,6 +2876,115 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void AYUVToUVRow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_uv, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v2.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_uv), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. 
+ "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +// Convert UV plane of NV12 to VU of NV21. +void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { + asm volatile( + "1: \n" + "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values + "orr v2.16b, v0.16b, v0.16b \n" // move U after V + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels + "b.gt 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/libs/libyuv/source/row_win.cc b/libs/libyuv/source/row_win.cc index 5500d7f5a64..f976d40260a 100644 --- a/libs/libyuv/source/row_win.cc +++ b/libs/libyuv/source/row_win.cc @@ -1594,9 +1594,9 @@ __declspec(naked) void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, mov edx, [esp + 8 + 12] // dst_u mov edi, [esp + 8 + 16] // dst_v mov ecx, [esp + 8 + 20] // width - vbroadcastf128 ymm5, xmmword ptr kAddUV128 - vbroadcastf128 ymm6, xmmword ptr kARGBToV - vbroadcastf128 ymm7, xmmword ptr kARGBToU + vbroadcastf128 ymm5, xmmword ptr kAddUVJ128 + vbroadcastf128 ymm6, xmmword ptr kARGBToVJ + vbroadcastf128 ymm7, xmmword ptr kARGBToUJ sub edi, edx // stride from u to v convertloop: @@ -4222,7 +4222,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, add ecx, 4 - 1 jl convertloop1b - // 1 pixel loop. + // 1 pixel loop. convertloop1: movd xmm3, [eax] // src argb lea eax, [eax + 4] @@ -5360,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5448,9 +5448,9 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: - movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. + movd xmm2, dword ptr [eax] // 1 argb pixel lea eax, [eax + 4] punpcklbw xmm2, xmm1 punpcklwd xmm2, xmm1 @@ -5534,7 +5534,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: cvttps2dq xmm0, xmm2 // x, y float to int packssdw xmm0, xmm0 // x, y as shorts diff --git a/libs/libyuv/source/scale.cc b/libs/libyuv/source/scale.cc index a8db93fde48..ab085496375 100644 --- a/libs/libyuv/source/scale.cc +++ b/libs/libyuv/source/scale.cc @@ -1788,6 +1788,75 @@ int I420Scale_16(const uint16_t* src_y, return 0; } +// Scale an I444 image. +// This function in turn calls a scaling function for each plane. 
+ +LIBYUV_API +int I444Scale(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + +LIBYUV_API +int I444Scale_16(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + int src_width, + int src_height, + uint16_t* dst_y, + int dst_stride_y, + uint16_t* dst_u, + int dst_stride_u, + uint16_t* dst_v, + int dst_stride_v, + int dst_width, + int dst_height, + enum FilterMode filtering) { + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { + return -1; + } + + ScalePlane_16(src_y, src_stride_y, src_width, src_height, dst_y, dst_stride_y, + dst_width, dst_height, filtering); + ScalePlane_16(src_u, src_stride_u, src_width, src_height, dst_u, dst_stride_u, + dst_width, dst_height, filtering); + ScalePlane_16(src_v, src_stride_v, src_width, src_height, dst_v, dst_stride_v, + dst_width, dst_height, filtering); + return 0; +} + // Deprecated api LIBYUV_API int Scale(const uint8_t* src_y, diff --git a/libs/libyuv/source/scale_gcc.cc b/libs/libyuv/source/scale_gcc.cc index 312236d2df8..90a49f30d73 100644 --- a/libs/libyuv/source/scale_gcc.cc +++ b/libs/libyuv/source/scale_gcc.cc @@ -483,7 +483,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr, : "m"(kShuf0), // %0 "m"(kShuf1), // %1 "m"(kShuf2) // %2 - ); + ); asm volatile( LABELALIGN @@ -521,7 +521,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -530,7 +530,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); + ); asm volatile( LABELALIGN @@ -587,7 +587,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShuf01), // %0 "m"(kShuf11), // %1 "m"(kShuf21) // %2 - ); + ); asm volatile( "movdqa %0,%%xmm5 \n" // kMadd01 "movdqa %1,%%xmm0 \n" // kMadd11 @@ -596,7 +596,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr, : "m"(kMadd01), // %0 "m"(kMadd11), // %1 "m"(kRound34) // %2 - ); + ); asm volatile( @@ -690,7 +690,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr, "m"(kShufAb1), // %1 "m"(kShufAb2), // %2 "m"(kScaleAb2) // %3 - ); + ); asm volatile( LABELALIGN @@ -734,7 +734,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, : "m"(kShufAc), // %0 "m"(kShufAc3), // %1 "m"(kScaleAc33) // %2 - ); + ); asm volatile( LABELALIGN @@ -1272,7 +1272,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb, : : "m"(kShuffleColARGB), 
// %0 "m"(kShuffleFractions) // %1 - ); + ); asm volatile( "movd %5,%%xmm2 \n" diff --git a/libs/libyuv/source/scale_neon.cc b/libs/libyuv/source/scale_neon.cc index 46f5ba4cd4f..366b155ba4e 100644 --- a/libs/libyuv/source/scale_neon.cc +++ b/libs/libyuv/source/scale_neon.cc @@ -40,7 +40,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List - ); + ); } // Read 32x1 average down and write 16x1. @@ -61,7 +61,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "q0", "q1" // Clobber List - ); + ); } // Read 32x2 average down and write 16x1. @@ -92,7 +92,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %3 : : "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, @@ -523,7 +523,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, "+r"(src_width) // %2 : : "memory", "cc", "q0", "q1", "q2" // Clobber List - ); + ); } // TODO(Yang Zhang): Investigate less load instructions for @@ -705,7 +705,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } // 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]! @@ -734,7 +734,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, "+r"(dst_width) // %2 : : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List - ); + ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, diff --git a/libs/libyuv/source/scale_neon64.cc b/libs/libyuv/source/scale_neon64.cc index f4aed5fc92f..0a7b80ce1d4 100644 --- a/libs/libyuv/source/scale_neon64.cc +++ b/libs/libyuv/source/scale_neon64.cc @@ -38,7 +38,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List - ); + ); } // Read 32x1 average down and write 16x1. @@ -60,7 +60,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "v0", "v1" // Clobber List - ); + ); } // Read 32x2 average down and write 16x1. @@ -89,7 +89,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleRowDown4_NEON(const uint8_t* src_ptr, @@ -534,7 +534,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr, "+r"(src_width) // %2 : : "memory", "cc", "v0", "v1", "v2" // Clobber List - ); + ); } // TODO(Yang Zhang): Investigate less load instructions for @@ -719,7 +719,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr, "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, @@ -742,7 +742,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb, "+r"(dst_width) // %2 : : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List - ); + ); } void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr, @@ -991,7 +991,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr, "+r"(dst_width) // %3 : : "v0", "v1", "v2", "v3" // Clobber List - ); + ); } // Read 8x2 upsample with filtering and write 16x1. 
@@ -1041,7 +1041,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr, "r"(14LL) // %5 : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19" // Clobber List - ); + ); } #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) diff --git a/libs/libyuv/tools_libyuv/autoroller/roll_deps.py b/libs/libyuv/tools_libyuv/autoroller/roll_deps.py index 8359d30965b..2b95e302cdb 100755 --- a/libs/libyuv/tools_libyuv/autoroller/roll_deps.py +++ b/libs/libyuv/tools_libyuv/autoroller/roll_deps.py @@ -37,7 +37,7 @@ CHROMIUM_FILE_TEMPLATE = CHROMIUM_SRC_URL + '/+/%s/%s' COMMIT_POSITION_RE = re.compile('^Cr-Commit-Position: .*#([0-9]+).*$') -CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'(\d+)\'$') +CLANG_REVISION_RE = re.compile(r'^CLANG_REVISION = \'([0-9a-z]+)\'$') ROLL_BRANCH_NAME = 'roll_chromium_revision' SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) diff --git a/libs/libyuv/unit_test/compare_test.cc b/libs/libyuv/unit_test/compare_test.cc index 136254e169b..bd99cdd3ac3 100644 --- a/libs/libyuv/unit_test/compare_test.cc +++ b/libs/libyuv/unit_test/compare_test.cc @@ -15,10 +15,13 @@ #include "../unit_test/unit_test.h" #include "libyuv/basic_types.h" #include "libyuv/compare.h" -#include "libyuv/compare_row.h" /* For HammingDistance_C */ #include "libyuv/cpu_id.h" #include "libyuv/video_common.h" +#ifdef ENABLE_ROW_TESTS +#include "libyuv/compare_row.h" /* For HammingDistance_C */ +#endif + namespace libyuv { // hash seed of 5381 recommended. @@ -206,6 +209,7 @@ TEST_F(LibYUVCompareTest, BenchmarkARGBDetect_Unaligned) { free_aligned_buffer_page_end(src_a); } +#ifdef ENABLE_ROW_TESTS TEST_F(LibYUVCompareTest, BenchmarkHammingDistance_Opt) { const int kMaxWidth = 4096 * 3; align_buffer_page_end(src_a, kMaxWidth); @@ -403,6 +407,7 @@ TEST_F(LibYUVCompareTest, TestHammingDistance_Opt) { free_aligned_buffer_page_end(src_a); free_aligned_buffer_page_end(src_b); } +#endif // ENABLE_ROW_TESTS TEST_F(LibYUVCompareTest, TestHammingDistance) { align_buffer_page_end(src_a, benchmark_width_ * benchmark_height_); diff --git a/libs/libyuv/unit_test/convert_test.cc b/libs/libyuv/unit_test/convert_test.cc index d97b4fc723c..b676954b217 100644 --- a/libs/libyuv/unit_test/convert_test.cc +++ b/libs/libyuv/unit_test/convert_test.cc @@ -12,8 +12,6 @@ #include <stdlib.h> #include <time.h> -#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */ - #include "libyuv/basic_types.h" #include "libyuv/compare.h" #include "libyuv/convert.h" @@ -29,6 +27,10 @@ #include "libyuv/rotate.h" #include "libyuv/video_common.h" +#ifdef ENABLE_ROW_TESTS +#include "libyuv/row.h" /* For ARGBToAR30Row_AVX2 */ +#endif + #if defined(__arm__) || defined(__aarch64__) // arm version subsamples by summing 4 pixels then multiplying by matrix with // 4x smaller coefficients which are rounded to nearest integer. @@ -37,6 +39,11 @@ #define ARM_YUV_ERROR 0 #endif +// Some functions fail on big endian.
Enable these tests on all cpus except PowerPC +#if !defined(__powerpc__) +#define LITTLE_ENDIAN_TEST 1 +#endif + namespace libyuv { // Alias to copy pixels as is @@ -311,10 +318,10 @@ int I400ToNV21(const uint8_t* src_y, SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ OFF); \ align_buffer_page_end(dst_y_c, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \ + align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kWidth; ++j) \ @@ -329,21 +336,21 @@ int I400ToNV21(const uint8_t* src_y, } \ memset(dst_y_c, 1, kWidth* kHeight); \ memset(dst_uv_c, 2, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_y_opt, 101, kWidth* kHeight); \ memset(dst_uv_opt, 102, \ - SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \ - dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \ src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \ - dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \ + dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \ } \ int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ @@ -357,12 +364,12 @@ int I400ToNV21(const uint8_t* src_y, } \ EXPECT_LE(max_diff, 1); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \ + for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \ int abs_diff = \ abs(static_cast<int>( \ - dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \ + dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \ static_cast<int>( \ - dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \ + dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \ if (abs_diff > max_diff) { \ max_diff = abs_diff; \ } \ @@ -395,6 +402,100 @@ TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2) TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2) TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) +#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \ + OFF, DOY) \ + TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ + const int kWidth = ((W1280) > 0) ?
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \ + OFF); \ + align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \ + SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \ + for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \ + src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \ + (fastrand() & 0xff); \ + } \ + } \ + memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_uv_c, 2, \ + 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_uv_opt, 102, \ + 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ + MaskCpuFlags(disable_cpu_flags_); \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \ + dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + SRC_FMT_PLANAR##To##FMT_PLANAR( \ + src_y + OFF, kWidth, src_uv + OFF, \ + 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \ + kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \ + NEG kHeight); \ + } \ + int max_diff = 0; \ + if (DOY) { \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth; ++j) { \ + int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \ + static_cast<int>(dst_y_opt[i * kWidth + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + } \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ + int abs_diff = \ + abs(static_cast<int>( \ + dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \ + static_cast<int>( \ + dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + } \ + EXPECT_LE(max_diff, 1); \ + free_aligned_buffer_page_end(dst_y_c); \ + free_aligned_buffer_page_end(dst_uv_c); \ + free_aligned_buffer_page_end(dst_y_opt); \ + free_aligned_buffer_page_end(dst_uv_opt); \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + } + +#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ + FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \ + 1) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \ + TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR,
\ + SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0) + +TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) + #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \ DOY) \ @@ -585,13 +686,15 @@ TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(I420, 2, 2, RGB24, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1) +#ifdef LITTLE_ENDIAN_TEST TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(I420, 2, 2, ARGB1555, 2, 2, 1) TESTPLANARTOB(I420, 2, 2, ARGB4444, 2, 2, 1) -TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(I422, 2, 1, RGB565, 2, 2, 1) +#endif +TESTPLANARTOB(I422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(J422, 2, 1, ARGB, 4, 4, 1) TESTPLANARTOB(J422, 2, 1, ABGR, 4, 4, 1) TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1) @@ -608,8 +711,10 @@ TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1) TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1) +#ifdef LITTLE_ENDIAN_TEST TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1) TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) +#endif #define TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, W1280, DIFF, N, NEG, OFF, ATTEN) \ @@ -680,8 +785,8 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - W1280, DIFF, N, NEG, OFF) \ +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ + BPP_B, W1280, DIFF, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ const int kHeight = benchmark_height_; \ @@ -716,9 +821,9 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \ memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \ memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \ - FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \ + FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \ kHeight); \ - FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ + FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ kHeight); \ int max_diff = 0; \ for (int i = 0; i < kHeight; ++i) { \ @@ -740,25 +845,29 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) free_aligned_buffer_page_end(dst_argb32_opt); \ } -#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_ - 4, DIFF, _Any, +, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_, DIFF, _Unaligned, +, 1) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - benchmark_width_, DIFF, _Invert, -, 0) \ - TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + DIFF) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ benchmark_width_, DIFF, _Opt, +, 0) -TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2) -TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2) -TESTBIPLANARTOB(NV12, 2, 2, ABGR, 4, 2) -TESTBIPLANARTOB(NV21, 2, 2, ABGR, 4, 2) -TESTBIPLANARTOB(NV12, 2, 2, RGB24, 3, 2) -TESTBIPLANARTOB(NV21, 2, 2, RGB24, 3, 2) -TESTBIPLANARTOB(NV12, 2, 2, RAW, 3, 2) -TESTBIPLANARTOB(NV21, 2, 2, RAW, 3, 2) -TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9) +TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2) +TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2) +TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2) +TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2) +TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2) +TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2) +TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2) +TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2) +#ifdef LITTLE_ENDIAN_TEST +TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9) +#endif +TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2) #ifdef DO_THREE_PLANES // Do 3 allocations for yuv. conventional but slower. 
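Every TESTBIPLANARTOB* case above expands to the same comparison harness: run the conversion once through the portable C path, run it again with SIMD enabled, and require the outputs to agree within one code value. A condensed sketch of that pattern, with RunConvert as a hypothetical stand-in for the generated SRC##To##FMT_B call and MaskCpuFlags/fastrand assumed to come from the libyuv test harness:

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include "gtest/gtest.h"

void MaskCpuFlags(int enable_flags);  // assumed: harness gate for CPU paths
int fastrand(void);                   // assumed: harness fast PRNG
void RunConvert(const uint8_t* src, uint8_t* dst, int w, int h);  // hypothetical

void CompareCVsOpt(int w, int h, int iters, int disable_flags, int bench_info) {
  std::vector<uint8_t> src(static_cast<size_t>(w) * h);
  std::vector<uint8_t> dst_c(src.size(), 1);      // poison the C output
  std::vector<uint8_t> dst_opt(src.size(), 101);  // poison the opt output
  for (auto& b : src) b = fastrand() & 0xff;      // random input plane
  MaskCpuFlags(disable_flags);                    // force the C reference path
  RunConvert(src.data(), dst_c.data(), w, h);
  MaskCpuFlags(bench_info);                       // re-enable SIMD paths
  for (int i = 0; i < iters; ++i) {               // loop doubles as benchmark
    RunConvert(src.data(), dst_opt.data(), w, h);
  }
  int max_diff = 0;                               // SIMD rounding may differ
  for (size_t i = 0; i < dst_c.size(); ++i) {
    max_diff = std::max(max_diff, abs(dst_c[i] - dst_opt[i]));
  }
  EXPECT_LE(max_diff, 1);                         // tolerance from the macros
}

The _Any/_Unaligned/_Invert/_Opt/_NullY suffixes vary the width, the pointer offset, the sign of the height, and whether the Y plane is written, all around this same core.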
@@ -885,26 +994,30 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9) TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, DIFF, _Opt, +, 0) +TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4) TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) +TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR) +#ifdef LITTLE_ENDIAN_TEST +TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) +TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) +#endif TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) +TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) +TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) +TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR) +#ifdef LITTLE_ENDIAN_TEST TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) -// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9. -TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) -TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) -TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) -TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) -TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) +#endif +TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2) -TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2) -TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) -TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) +TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) +TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) #define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ @@ -976,8 +1089,12 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2) +TESTATOBIPLANAR(ABGR, 1, 4, NV12, 2, 2) +TESTATOBIPLANAR(ABGR, 1, 4, NV21, 2, 2) TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) +TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) +TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) #define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, DIFF, N, NEG, OFF) \ @@ -1069,45 +1186,58 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) HEIGHT_B, DIFF) // TODO(fbarchard): make ARM version of C code that matches NEON. 
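A note on the LITTLE_ENDIAN_TEST guards being added throughout these conversion lists: RGB565, ARGB1555, ARGB4444, AR30 and AB30 are defined by their packed little-endian byte layout, so the tests' byte-level golden values only hold on little-endian hosts. A small self-contained illustration (the red-in-high-bits channel order shown is the common RGB565 convention and is an assumption here, not a statement about libyuv internals):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  // Pack one pure-red RGB565 pixel: 5 bits of red in the high bits.
  uint16_t pixel = (31u << 11) | (0u << 5) | 0u;  // 0xF800
  uint8_t bytes[2];
  memcpy(bytes, &pixel, 2);  // byte order now depends on the host CPU
  // Little endian stores 00 f8; big endian stores f8 00, so byte-level
  // expected values diverge between the two - hence the guards.
  printf("%02x %02x\n", bytes[0], bytes[1]);
  return 0;
}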
+TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0) +TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0) +#ifdef LITTLE_ENDIAN_TEST +TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0) +#endif +TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) +#ifdef LITTLE_ENDIAN_TEST +TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0) +#endif +TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0) +#ifdef LITTLE_ENDIAN_TEST +TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0) +TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0) +#endif +TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0) +#ifdef LITTLE_ENDIAN_TEST +TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0) +#endif TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) +TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0) +TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2) +TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2) TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0) TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) +#ifdef LITTLE_ENDIAN_TEST TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) -TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0) -TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) +#endif +TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4) -TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2) -TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2) +TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) +TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0) +TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0) TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0) +TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0) +TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0) +TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) +TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0) +TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0) TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1, 0) TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0) TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0) +TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0) +#ifdef LITTLE_ENDIAN_TEST TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0) -TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1, 0) -TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1, 0) -TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0) -TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0) -TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0) -TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0) -TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR) +#endif +TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR) +TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1, ARM_YUV_ERROR) TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) -TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1, 0) -TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1, 0) -TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1, 0) -TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1, 0) -TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) -TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, DIFF, N, NEG, OFF) \ @@ -1204,7 +1334,9 @@ TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, DIFF) +#ifdef LITTLE_ENDIAN_TEST TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) +#endif #define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, 
NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \ @@ -1291,6 +1423,7 @@ TEST_F(LibYUVConvertTest, ValidateJpeg) { // EOI, SOI. Expect pass. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; orig_pixels[kSize - kOff + 0] = 0xff; orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. for (int times = 0; times < benchmark_iterations_; ++times) { @@ -1317,6 +1450,7 @@ TEST_F(LibYUVConvertTest, ValidateJpegLarge) { // EOI, SOI. Expect pass. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; orig_pixels[kSize - kOff + 0] = 0xff; orig_pixels[kSize - kOff + 1] = 0xd9; // EOI. for (int times = 0; times < benchmark_iterations_; ++times) { @@ -1350,6 +1484,7 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) { // SOI but no EOI. Expect fail. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; for (int times = 0; times < benchmark_iterations_; ++times) { EXPECT_FALSE(ValidateJpeg(orig_pixels, kSize)); } @@ -1367,22 +1502,24 @@ TEST_F(LibYUVConvertTest, InvalidateJpeg) { TEST_F(LibYUVConvertTest, FuzzJpeg) { // SOI but no EOI. Expect fail. for (int times = 0; times < benchmark_iterations_; ++times) { - const int kSize = fastrand() % 5000 + 2; + const int kSize = fastrand() % 5000 + 3; align_buffer_page_end(orig_pixels, kSize); MemRandomize(orig_pixels, kSize); // Add SOI so frame will be scanned. orig_pixels[0] = 0xff; orig_pixels[1] = 0xd8; // SOI. + orig_pixels[2] = 0xff; orig_pixels[kSize - 1] = 0xff; - ValidateJpeg(orig_pixels, kSize); // Failure normally expected. + ValidateJpeg(orig_pixels, + kSize); // Failure normally expected. free_aligned_buffer_page_end(orig_pixels); } } -// Test data created in GIMP. In export jpeg, disable thumbnails etc, -// choose a subsampling, and use low quality (50) to keep size small. -// Generated with xxd -i test.jpg +// Test data created in GIMP. In export jpeg, disable +// thumbnails etc, choose a subsampling, and use low quality +// (50) to keep size small. Generated with xxd -i test.jpg // test 0 is J400 static const uint8_t kTest0Jpg[] = { 0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01, @@ -1984,8 +2121,8 @@ TEST_F(LibYUVConvertTest, TestMJPGInfo) { EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen)); EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen)); - EXPECT_EQ(1, - ShowJPegInfo(kTest4Jpg, kTest4JpgLen)); // Valid but unsupported. + EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, + kTest4JpgLen)); // Valid but unsupported. 
} #endif // HAVE_JPEG @@ -2296,8 +2433,9 @@ TEST_F(LibYUVConvertTest, TestDither) { TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) +#ifdef LITTLE_ENDIAN_TEST TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) - +#endif #define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ TEST_F(LibYUVConvertTest, NAME) { \ const int kWidth = benchmark_width_; \ @@ -2437,10 +2575,12 @@ TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3) TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3) TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4) +#ifdef LITTLE_ENDIAN_TEST TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2) TESTPLANARTOE(I422, 2, 1, ARGB, 1, 4, RGB565, 2) +#endif TESTPLANARTOE(J422, 2, 1, ARGB, 1, 4, ARGB, 4) TESTPLANARTOE(J422, 2, 1, ABGR, 1, 4, ARGB, 4) TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4) @@ -2574,6 +2714,7 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) _Opt, +, 0, FMT_C, BPP_C) // Caveat: Destination needs to be 4 bytes +#ifdef LITTLE_ENDIAN_TEST TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4) TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4) TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4) @@ -2582,6 +2723,7 @@ TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4) TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4) TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4) TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4) +#endif TEST_F(LibYUVConvertTest, RotateWithARGBSource) { // 2x2 frames @@ -2753,12 +2895,16 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 2) TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 2) +#ifdef LITTLE_ENDIAN_TEST TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2) TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2) +#endif TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 2) TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 2) +#ifdef LITTLE_ENDIAN_TEST TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2) TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 2) +#endif static int Clamp(int y) { if (y < 0) { @@ -2903,7 +3049,8 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) { } // Test 10 bit YUV to 10 bit RGB -// Caveat: Result is near due to float rounding in expected result. +// Caveat: Result is near due to float rounding in expected +// result. TEST_F(LibYUVConvertTest, TestH010ToAR30) { const int kSize = 1024; int histogram_b[1024]; @@ -2966,7 +3113,8 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) { } // Test 10 bit YUV to 10 bit RGB -// Caveat: Result is near due to float rounding in expected result. +// Caveat: Result is near due to float rounding in expected +// result. 
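The caveat matters because the library's 10-bit paths compute in fixed point while the tests derive their expected histograms with floating point; the two can legitimately land one code value apart. A standalone toy showing the effect (the 1.164 value is the familiar BT.601 luma gain, used purely for illustration and not claimed to be libyuv's exact constant):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const double kGain = 1.164;  // illustrative float coefficient
  const int32_t kGainFixed = static_cast<int32_t>(kGain * 65536);  // 16.16
  for (int y = 64; y < 70; ++y) {
    int fixed_result = (y * kGainFixed) >> 16;                    // library style
    int float_result = static_cast<int>(std::lround(y * kGain));  // test style
    // The two paths can differ by 1 (e.g. y = 65), hence the near-match
    // comparisons in TestH010ToAR30 above and TestH010ToAB30 below.
    printf("y=%d fixed=%d float=%d\n", y, fixed_result, float_result);
  }
  return 0;
}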
TEST_F(LibYUVConvertTest, TestH010ToAB30) { const int kSize = 1024; int histogram_b[1024]; diff --git a/libs/libyuv/unit_test/math_test.cc b/libs/libyuv/unit_test/math_test.cc index 0abbad51321..a1544c122b5 100644 --- a/libs/libyuv/unit_test/math_test.cc +++ b/libs/libyuv/unit_test/math_test.cc @@ -16,10 +16,14 @@ #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" + +#ifdef ENABLE_ROW_TESTS #include "libyuv/scale_row.h" +#endif namespace libyuv { +#ifdef ENABLE_ROW_TESTS TEST_F(LibYUVBaseTest, TestFixedDiv) { int num[1280]; int div[1280]; @@ -151,5 +155,6 @@ TEST_F(LibYUVBaseTest, TestFixedDiv1_Opt) { EXPECT_NEAR(result_c[j], result_opt[j], 1); } } +#endif // ENABLE_ROW_TESTS } // namespace libyuv diff --git a/libs/libyuv/unit_test/planar_test.cc b/libs/libyuv/unit_test/planar_test.cc index 756089558f7..19232b6fed9 100644 --- a/libs/libyuv/unit_test/planar_test.cc +++ b/libs/libyuv/unit_test/planar_test.cc @@ -12,8 +12,6 @@ #include <stdio.h> #include <stdlib.h> -// row.h defines SIMD_ALIGNED, overriding unit_test.h -#include "libyuv/row.h" /* For ScaleSumSamples_Neon */ #include "../unit_test/unit_test.h" #include "libyuv/compare.h" @@ -25,6 +23,12 @@ #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#ifdef ENABLE_ROW_TESTS +// row.h defines SIMD_ALIGNED, overriding unit_test.h +// TODO(fbarchard): Remove row.h from unittests. Test public functions. +#include "libyuv/row.h" /* For ScaleSumSamples_Neon */ +#endif + namespace libyuv { TEST_F(LibYUVPlanarTest, TestAttenuate) { @@ -2321,7 +2325,8 @@ TEST_F(LibYUVPlanarTest, TestARGBCopyAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels, kPixels * 4); align_buffer_page_end(dst_pixels_opt, kPixels); align_buffer_page_end(dst_pixels_c, kPixels); @@ -2349,7 +2354,8 @@ TEST_F(LibYUVPlanarTest, TestARGBExtractAlpha) { } TEST_F(LibYUVPlanarTest, TestARGBCopyYToAlpha) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(orig_pixels, kPixels); align_buffer_page_end(dst_pixels_opt, kPixels * 4); align_buffer_page_end(dst_pixels_c, kPixels * 4); @@ -2482,7 +2488,8 @@ TEST_F(LibYUVPlanarTest, SetPlane_Opt) { } TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(tmp_pixels_u, kPixels); align_buffer_page_end(tmp_pixels_v, kPixels); @@ -2526,7 +2533,8 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels, kPixels * 2); align_buffer_page_end(tmp_pixels_u, kPixels); align_buffer_page_end(tmp_pixels_v, kPixels); @@ -2568,8 +2576,39 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels,
kPixels * 2); + align_buffer_page_end(dst_pixels_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_c, kPixels * 2); + + MemRandomize(src_pixels, kPixels * 2); + MemRandomize(dst_pixels_opt, kPixels * 2); + MemRandomize(dst_pixels_c, kPixels * 2); + + MaskCpuFlags(disable_cpu_flags_); + SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c, + benchmark_width_ * 2, benchmark_width_, benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + SwapUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt, + benchmark_width_ * 2, benchmark_width_, benchmark_height_); + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2617,7 +2656,8 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) { } TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels, kPixels * 3); align_buffer_page_end(tmp_pixels_r, kPixels); align_buffer_page_end(tmp_pixels_g, kPixels); @@ -2666,7 +2706,8 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) { // TODO(fbarchard): improve test for platforms and cpu detect #ifdef HAS_MERGEUVROW_16_AVX2 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels_u, kPixels * 2); align_buffer_page_end(src_pixels_v, kPixels * 2); align_buffer_page_end(dst_pixels_uv_opt, kPixels * 2 * 2); @@ -2710,7 +2751,8 @@ TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) { // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); @@ -2746,7 +2788,8 @@ TEST_F(LibYUVPlanarTest, MultiplyRow_16_Opt) { #endif // HAS_MULTIPLYROW_16_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Plane) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels_y, kPixels * 2); align_buffer_page_end(dst_pixels_y_opt, kPixels); align_buffer_page_end(dst_pixels_y_c, kPixels); @@ -2776,6 +2819,7 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) { free_aligned_buffer_page_end(dst_pixels_y_c); } +#ifdef ENABLE_ROW_TESTS // TODO(fbarchard): Improve test for more platforms. 
#ifdef HAS_CONVERT16TO8ROW_AVX2 TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { @@ -2821,9 +2865,11 @@ TEST_F(LibYUVPlanarTest, Convert16To8Row_Opt) { free_aligned_buffer_page_end(dst_pixels_y_c); } #endif // HAS_CONVERT16TO8ROW_AVX2 +#endif // ENABLE_ROW_TESTS TEST_F(LibYUVPlanarTest, Convert8To16Plane) { - const int kPixels = benchmark_width_ * benchmark_height_; + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; align_buffer_page_end(src_pixels_y, kPixels); align_buffer_page_end(dst_pixels_y_opt, kPixels * 2); align_buffer_page_end(dst_pixels_y_c, kPixels * 2); @@ -2855,6 +2901,7 @@ TEST_F(LibYUVPlanarTest, Convert8To16Plane) { free_aligned_buffer_page_end(dst_pixels_y_c); } +#ifdef ENABLE_ROW_TESTS // TODO(fbarchard): Improve test for more platforms. #ifdef HAS_CONVERT8TO16ROW_AVX2 TEST_F(LibYUVPlanarTest, Convert8To16Row_Opt) { @@ -3186,7 +3233,8 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) { } GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 640); for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) { -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 640); @@ -3239,7 +3287,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { &orig_pixels[640 * 3], &orig_pixels[640 * 4], &dst_pixels_c[0], 640); for (int i = 0; i < benchmark_pixels_div1280_ * 2; ++i) { -#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__aarch64__) || defined(__ARM_NEON__) || defined(LIBYUV_NEON)) int has_neon = TestCpuFlag(kCpuHasNEON); if (has_neon) { GaussCol_NEON(&orig_pixels[0], &orig_pixels[640], &orig_pixels[640 * 2], @@ -3267,4 +3316,36 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) { EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704)); } +TEST_F(LibYUVPlanarTest, SwapUVRow) { + const int kPixels = benchmark_width_ * benchmark_height_; + void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) = + SwapUVRow_C; + + align_buffer_page_end(src_pixels_vu, kPixels * 2); + align_buffer_page_end(dst_pixels_uv, kPixels * 2); + MemRandomize(src_pixels_vu, kPixels * 2); + memset(dst_pixels_uv, 1, kPixels * 2); + +#if defined(HAS_SWAPUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SwapUVRow = SwapUVRow_Any_NEON; + if (IS_ALIGNED(kPixels, 16)) { + SwapUVRow = SwapUVRow_NEON; + } + } +#endif + + for (int j = 0; j < benchmark_iterations_; j++) { + SwapUVRow(src_pixels_vu, dst_pixels_uv, kPixels); + } + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]); + EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]); + } + + free_aligned_buffer_page_end(src_pixels_vu); + free_aligned_buffer_page_end(dst_pixels_uv); +} +#endif + } // namespace libyuv diff --git a/libs/libyuv/unit_test/rotate_test.cc b/libs/libyuv/unit_test/rotate_test.cc index d04b96e9c68..61941e63e0e 100644 --- a/libs/libyuv/unit_test/rotate_test.cc +++ b/libs/libyuv/unit_test/rotate_test.cc @@ -135,6 +135,123 @@ TEST_F(LibYUVRotateTest, DISABLED_I420Rotate270_Odd) { benchmark_cpu_info_); } +static void I444TestRotate(int src_width, + int src_height, + int dst_width, + int dst_height, + libyuv::RotationMode mode, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (src_width < 1) { + src_width = 1; + } + if (src_height == 0) { +
src_height = 1; + } + if (dst_width < 1) { + dst_width = 1; + } + if (dst_height < 1) { + dst_height = 1; + } + int src_i444_y_size = src_width * Abs(src_height); + int src_i444_uv_size = src_width * Abs(src_height); + int src_i444_size = src_i444_y_size + src_i444_uv_size * 2; + align_buffer_page_end(src_i444, src_i444_size); + for (int i = 0; i < src_i444_size; ++i) { + src_i444[i] = fastrand() & 0xff; + } + + int dst_i444_y_size = dst_width * dst_height; + int dst_i444_uv_size = dst_width * dst_height; + int dst_i444_size = dst_i444_y_size + dst_i444_uv_size * 2; + align_buffer_page_end(dst_i444_c, dst_i444_size); + align_buffer_page_end(dst_i444_opt, dst_i444_size); + memset(dst_i444_c, 2, dst_i444_size); + memset(dst_i444_opt, 3, dst_i444_size); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width, + src_i444 + src_i444_y_size + src_i444_uv_size, src_width, + dst_i444_c, dst_width, dst_i444_c + dst_i444_y_size, dst_width, + dst_i444_c + dst_i444_y_size + dst_i444_uv_size, dst_width, + src_width, src_height, mode); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (int i = 0; i < benchmark_iterations; ++i) { + I444Rotate(src_i444, src_width, src_i444 + src_i444_y_size, src_width, + src_i444 + src_i444_y_size + src_i444_uv_size, src_width, + dst_i444_opt, dst_width, dst_i444_opt + dst_i444_y_size, + dst_width, dst_i444_opt + dst_i444_y_size + dst_i444_uv_size, + dst_width, src_width, src_height, mode); + } + + // Rotation should be exact. + for (int i = 0; i < dst_i444_size; ++i) { + EXPECT_EQ(dst_i444_c[i], dst_i444_opt[i]); + } + + free_aligned_buffer_page_end(dst_i444_c); + free_aligned_buffer_page_end(dst_i444_opt); + free_aligned_buffer_page_end(src_i444); +} + +TEST_F(LibYUVRotateTest, I444Rotate0_Opt) { + I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate0, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I444Rotate90_Opt) { + I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate90, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I444Rotate180_Opt) { + I444TestRotate(benchmark_width_, benchmark_height_, benchmark_width_, + benchmark_height_, kRotate180, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, I444Rotate270_Opt) { + I444TestRotate(benchmark_width_, benchmark_height_, benchmark_height_, + benchmark_width_, kRotate270, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_); +} + +// TODO(fbarchard): Remove odd width tests. +// Odd width tests work but disabled because they use C code and can be +// tested by passing an odd width command line or environment variable. 
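Note that, unlike the scale tests, every rotate test above compares with EXPECT_EQ: rotation is pure data movement, no arithmetic touches the samples, so the SIMD and C paths must agree bit for bit. A sketch of the per-plane index mapping for the clockwise 90-degree case (assuming libyuv's rotation convention, where a width x height plane becomes height x width):

#include <cstdint>

// Source pixel (x, y) lands at destination (height - 1 - y, x).
void Rotate90Plane(const uint8_t* src, int src_stride,
                   uint8_t* dst, int dst_stride, int width, int height) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      dst[x * dst_stride + (height - 1 - y)] = src[y * src_stride + x];
    }
  }
}

The disabled odd-width variants below exercise the same mapping through the C fallback paths, for the reason given in the comment above.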
+TEST_F(LibYUVRotateTest, DISABLED_I444Rotate0_Odd) { + I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, + benchmark_width_ - 3, benchmark_height_ - 1, kRotate0, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, DISABLED_I444Rotate90_Odd) { + I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, + benchmark_height_ - 1, benchmark_width_ - 3, kRotate90, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, DISABLED_I444Rotate180_Odd) { + I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, + benchmark_width_ - 3, benchmark_height_ - 1, kRotate180, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVRotateTest, DISABLED_I444Rotate270_Odd) { + I444TestRotate(benchmark_width_ - 3, benchmark_height_ - 1, + benchmark_height_ - 1, benchmark_width_ - 3, kRotate270, + benchmark_iterations_, disable_cpu_flags_, + benchmark_cpu_info_); +} + static void NV12TestRotate(int src_width, int src_height, int dst_width, diff --git a/libs/libyuv/unit_test/scale_argb_test.cc b/libs/libyuv/unit_test/scale_argb_test.cc index 94aef60e226..93b77f56a9c 100644 --- a/libs/libyuv/unit_test/scale_argb_test.cc +++ b/libs/libyuv/unit_test/scale_argb_test.cc @@ -259,7 +259,7 @@ static int ARGBClipTestFilter(int src_width, TEST_FACTOR(2, 1, 2) TEST_FACTOR(4, 1, 4) -TEST_FACTOR(8, 1, 8) +// TEST_FACTOR(8, 1, 8) Disable for benchmark performance. TEST_FACTOR(3by4, 3, 4) TEST_FACTOR(3by8, 3, 8) TEST_FACTOR(3, 1, 3) diff --git a/libs/libyuv/unit_test/scale_test.cc b/libs/libyuv/unit_test/scale_test.cc index d97d54a8830..94a78590067 100644 --- a/libs/libyuv/unit_test/scale_test.cc +++ b/libs/libyuv/unit_test/scale_test.cc @@ -14,7 +14,10 @@ #include "../unit_test/unit_test.h" #include "libyuv/cpu_id.h" #include "libyuv/scale.h" + +#ifdef ENABLE_ROW_TESTS #include "libyuv/scale_row.h" // For ScaleRowDown2Box_Odd_C +#endif #define STRINGIZE(line) #line #define FILELINESTR(file, line) file ":" STRINGIZE(line) @@ -22,14 +25,14 @@ namespace libyuv { // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. -static int TestFilter(int src_width, - int src_height, - int dst_width, - int dst_height, - FilterMode f, - int benchmark_iterations, - int disable_cpu_flags, - int benchmark_cpu_info) { +static int I420TestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } @@ -141,14 +144,14 @@ static int TestFilter(int src_width, // Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. // 0 = exact. -static int TestFilter_16(int src_width, - int src_height, - int dst_width, - int dst_height, - FilterMode f, - int benchmark_iterations, - int disable_cpu_flags, - int benchmark_cpu_info) { +static int I420TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { if (!SizeValid(src_width, src_height, dst_width, dst_height)) { return 0; } @@ -256,6 +259,241 @@ static int TestFilter_16(int src_width, return max_diff; } +// Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. 
+static int I444TestFilter(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i, j; + int src_width_uv = Abs(src_width); + int src_height_uv = Abs(src_height); + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + if (!src_y || !src_u || !src_v) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + int dst_width_uv = dst_width; + int dst_height_uv = dst_height; + + int64_t dst_y_plane_size = (dst_width) * (dst_height); + int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_c, dst_y_plane_size); + align_buffer_page_end(dst_u_c, dst_uv_plane_size); + align_buffer_page_end(dst_v_c, dst_uv_plane_size); + align_buffer_page_end(dst_y_opt, dst_y_plane_size); + align_buffer_page_end(dst_u_opt, dst_uv_plane_size); + align_buffer_page_end(dst_v_opt, dst_uv_plane_size); + if (!dst_y_c || !dst_u_c || !dst_v_c || !dst_y_opt || !dst_u_opt || + !dst_v_opt) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization. + double c_time = get_time(); + I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_c, dst_stride_y, dst_u_c, + dst_stride_uv, dst_v_c, dst_stride_uv, dst_width, dst_height, f); + c_time = (get_time() - c_time); + + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + double opt_time = get_time(); + for (i = 0; i < benchmark_iterations; ++i) { + I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_opt, dst_stride_y, dst_u_opt, + dst_stride_uv, dst_v_opt, dst_stride_uv, dst_width, dst_height, + f); + } + opt_time = (get_time() - opt_time) / benchmark_iterations; + // Report performance of C vs OPT. + printf("filter %d - %8d us C - %8d us OPT\n", f, + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + + // C version may be a little off from the optimized. Order of + // operations may introduce rounding somewhere. So do a difference + // of the buffers and look to see that the max difference is not + // over 3.
+ int max_diff = 0; + for (i = 0; i < (dst_height); ++i) { + for (j = 0; j < (dst_width); ++j) { + int abs_diff = Abs(dst_y_c[(i * dst_stride_y) + j] - + dst_y_opt[(i * dst_stride_y) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } + + for (i = 0; i < (dst_height_uv); ++i) { + for (j = 0; j < (dst_width_uv); ++j) { + int abs_diff = Abs(dst_u_c[(i * dst_stride_uv) + j] - + dst_u_opt[(i * dst_stride_uv) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_c[(i * dst_stride_uv) + j] - + dst_v_opt[(i * dst_stride_uv) + j]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + } + + free_aligned_buffer_page_end(dst_y_c); + free_aligned_buffer_page_end(dst_u_c); + free_aligned_buffer_page_end(dst_v_c); + free_aligned_buffer_page_end(dst_y_opt); + free_aligned_buffer_page_end(dst_u_opt); + free_aligned_buffer_page_end(dst_v_opt); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + + return max_diff; +} + +// Test scaling with 8 bit C vs 16 bit C and return maximum pixel difference. +// 0 = exact. +static int I444TestFilter_16(int src_width, + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int disable_cpu_flags, + int benchmark_cpu_info) { + if (!SizeValid(src_width, src_height, dst_width, dst_height)) { + return 0; + } + + int i; + int src_width_uv = Abs(src_width); + int src_height_uv = Abs(src_height); + + int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv); + + int src_stride_y = Abs(src_width); + int src_stride_uv = src_width_uv; + + align_buffer_page_end(src_y, src_y_plane_size); + align_buffer_page_end(src_u, src_uv_plane_size); + align_buffer_page_end(src_v, src_uv_plane_size); + align_buffer_page_end(src_y_16, src_y_plane_size * 2); + align_buffer_page_end(src_u_16, src_uv_plane_size * 2); + align_buffer_page_end(src_v_16, src_uv_plane_size * 2); + if (!src_y || !src_u || !src_v || !src_y_16 || !src_u_16 || !src_v_16) { + printf("Skipped. Alloc failed " FILELINESTR(__FILE__, __LINE__) "\n"); + return 0; + } + uint16_t* p_src_y_16 = reinterpret_cast<uint16_t*>(src_y_16); + uint16_t* p_src_u_16 = reinterpret_cast<uint16_t*>(src_u_16); + uint16_t* p_src_v_16 = reinterpret_cast<uint16_t*>(src_v_16); + + MemRandomize(src_y, src_y_plane_size); + MemRandomize(src_u, src_uv_plane_size); + MemRandomize(src_v, src_uv_plane_size); + + for (i = 0; i < src_y_plane_size; ++i) { + p_src_y_16[i] = src_y[i]; + } + for (i = 0; i < src_uv_plane_size; ++i) { + p_src_u_16[i] = src_u[i]; + p_src_v_16[i] = src_v[i]; + } + + int dst_width_uv = dst_width; + int dst_height_uv = dst_height; + + int dst_y_plane_size = (dst_width) * (dst_height); + int dst_uv_plane_size = (dst_width_uv) * (dst_height_uv); + + int dst_stride_y = dst_width; + int dst_stride_uv = dst_width_uv; + + align_buffer_page_end(dst_y_8, dst_y_plane_size); + align_buffer_page_end(dst_u_8, dst_uv_plane_size); + align_buffer_page_end(dst_v_8, dst_uv_plane_size); + align_buffer_page_end(dst_y_16, dst_y_plane_size * 2); + align_buffer_page_end(dst_u_16, dst_uv_plane_size * 2); + align_buffer_page_end(dst_v_16, dst_uv_plane_size * 2); + + uint16_t* p_dst_y_16 = reinterpret_cast<uint16_t*>(dst_y_16); + uint16_t* p_dst_u_16 = reinterpret_cast<uint16_t*>(dst_u_16); + uint16_t* p_dst_v_16 = reinterpret_cast<uint16_t*>(dst_v_16); + + MaskCpuFlags(disable_cpu_flags); // Disable all CPU optimization.
+ I444Scale(src_y, src_stride_y, src_u, src_stride_uv, src_v, src_stride_uv, + src_width, src_height, dst_y_8, dst_stride_y, dst_u_8, + dst_stride_uv, dst_v_8, dst_stride_uv, dst_width, dst_height, f); + MaskCpuFlags(benchmark_cpu_info); // Enable all CPU optimization. + for (i = 0; i < benchmark_iterations; ++i) { + I444Scale_16(p_src_y_16, src_stride_y, p_src_u_16, src_stride_uv, + p_src_v_16, src_stride_uv, src_width, src_height, p_dst_y_16, + dst_stride_y, p_dst_u_16, dst_stride_uv, p_dst_v_16, + dst_stride_uv, dst_width, dst_height, f); + } + + // Expect an exact match. + int max_diff = 0; + for (i = 0; i < dst_y_plane_size; ++i) { + int abs_diff = Abs(dst_y_8[i] - p_dst_y_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + for (i = 0; i < dst_uv_plane_size; ++i) { + int abs_diff = Abs(dst_u_8[i] - p_dst_u_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + abs_diff = Abs(dst_v_8[i] - p_dst_v_16[i]); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + + free_aligned_buffer_page_end(dst_y_8); + free_aligned_buffer_page_end(dst_u_8); + free_aligned_buffer_page_end(dst_v_8); + free_aligned_buffer_page_end(dst_y_16); + free_aligned_buffer_page_end(dst_u_16); + free_aligned_buffer_page_end(dst_v_16); + free_aligned_buffer_page_end(src_y); + free_aligned_buffer_page_end(src_u); + free_aligned_buffer_page_end(src_v); + free_aligned_buffer_page_end(src_y_16); + free_aligned_buffer_page_end(src_u_16); + free_aligned_buffer_page_end(src_v_16); + + return max_diff; +} + // The following adjustments in dimensions ensure the scale factor will be // exactly achieved. // 2 is chroma subsample. @@ -263,16 +501,32 @@ static int TestFilter_16(int src_width, #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) #define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter) { \ - int diff = TestFilter( \ + TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ + int diff = I420TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \ + int diff = I444TestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, ScaleDownBy##name##_##filter##_16) { \ - int diff = TestFilter_16( \ + TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) { \ + int diff = I420TestFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) { \ + int diff = I444TestFilter_16( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ @@ -290,7 +544,7 @@ static int TestFilter_16(int src_width, TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) 
-TEST_FACTOR(8, 1, 8, 0) +// TEST_FACTOR(8, 1, 8, 0) Disable for benchmark performance. Takes 90 seconds. TEST_FACTOR(3by4, 3, 4, 1) TEST_FACTOR(3by8, 3, 8, 1) TEST_FACTOR(3, 1, 3, 0) @@ -300,30 +554,58 @@ TEST_FACTOR(3, 1, 3, 0) #undef DX #define TEST_SCALETO1(name, width, height, filter, max_diff) \ - TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter) { \ - int diff = TestFilter(benchmark_width_, benchmark_height_, width, height, \ - kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ + TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \ + int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter) { \ + int diff = I444TestFilter(benchmark_width_, benchmark_height_, width, \ + height, kFilter##filter, benchmark_iterations_, \ + disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16( \ + benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I420##name##From##width##x##height##_##filter) { \ + int diff = I420TestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter) { \ - int diff = TestFilter(width, height, Abs(benchmark_width_), \ - Abs(benchmark_height_), kFilter##filter, \ - benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ + TEST_F(LibYUVScaleTest, I444##name##From##width##x##height##_##filter) { \ + int diff = I444TestFilter(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, name##To##width##x##height##_##filter##_16) { \ - int diff = TestFilter_16(benchmark_width_, benchmark_height_, width, \ - height, kFilter##filter, benchmark_iterations_, \ - disable_cpu_flags_, benchmark_cpu_info_); \ + TEST_F(LibYUVScaleTest, \ + I420##name##From##width##x##height##_##filter##_16) { \ + int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ + benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, name##From##width##x##height##_##filter##_16) { \ - int diff = TestFilter_16(width, height, Abs(benchmark_width_), \ - Abs(benchmark_height_), kFilter##filter, \ - benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ + TEST_F(LibYUVScaleTest, \ + I444##name##From##width##x##height##_##filter##_16) { \ + int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ + Abs(benchmark_height_), kFilter##filter, \ 
+ benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } @@ -343,6 +625,7 @@ TEST_SCALETO(Scale, 1920, 1080) #undef TEST_SCALETO1 #undef TEST_SCALETO +#ifdef ENABLE_ROW_TESTS #ifdef HAS_SCALEROWDOWN2_SSSE3 TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_Odd_SSSE3) { SIMD_ALIGNED(uint8_t orig_pixels[128 * 2]); @@ -524,6 +807,7 @@ TEST_F(LibYUVScaleTest, TestScaleRowDown2Box_16) { EXPECT_EQ(dst_pixels_c[0], (0 + 1 + 2560 + 2561 + 2) / 4); EXPECT_EQ(dst_pixels_c[1279], 3839); } +#endif // ENABLE_ROW_TESTS // Test scaling plane with 8 bit C vs 16 bit C and return maximum pixel // difference. @@ -614,7 +898,7 @@ static int TestPlaneFilter_16(int src_width, TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) -TEST_FACTOR(8, 1, 8, 0) +// TEST_FACTOR(8, 1, 8, 0) Disable for benchmark performance. Takes 90 seconds. TEST_FACTOR(3by4, 3, 4, 1) TEST_FACTOR(3by8, 3, 8, 1) TEST_FACTOR(3, 1, 3, 0) diff --git a/libs/libyuv/unit_test/unit_test.cc b/libs/libyuv/unit_test/unit_test.cc index a1ae7ea3c7c..2aa9cdaad6e 100644 --- a/libs/libyuv/unit_test/unit_test.cc +++ b/libs/libyuv/unit_test/unit_test.cc @@ -17,6 +17,9 @@ #ifdef LIBYUV_USE_GFLAGS #include "gflags/gflags.h" #endif +#ifdef LIBYUV_USE_BASE_FLAGS +#include "base/commandlineflags.h" +#endif #include "libyuv/cpu_id.h" unsigned int fastrand_seed = 0xfb; diff --git a/libs/libyuv/util/psnr.cc b/libs/libyuv/util/psnr.cc index f54015bab82..c7bee7f97d2 100644 --- a/libs/libyuv/util/psnr.cc +++ b/libs/libyuv/util/psnr.cc @@ -189,7 +189,7 @@ static uint32_t SumSquareError_SSE2(const uint8_t* src_a, , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" #endif - ); // NOLINT + ); // NOLINT return sse; } #endif // LIBYUV_DISABLE_X86 etc diff --git a/libs/miniupnpc/minissdpc.c b/libs/miniupnpc/minissdpc.c index aa939fb7fb2..96133ff12fd 100644 --- a/libs/miniupnpc/minissdpc.c +++ b/libs/miniupnpc/minissdpc.c @@ -46,7 +46,7 @@ getDevicesFromMiniSSDPD(const char * devtype, const char * socketpath) unsigned char * p; unsigned char * url; unsigned int i; - unsigned int urlsize, stsize, usnsize, l; + unsigned int urlsize, stsize, usnsize, l, plen; int s; struct sockaddr_un addr; @@ -58,7 +58,12 @@ getDevicesFromMiniSSDPD(const char * devtype, const char * socketpath) return NULL; } addr.sun_family = AF_UNIX; - memcpy(addr.sun_path, socketpath, sizeof(addr.sun_path)); + plen = strlen(socketpath); + if (plen + 1 > sizeof(addr.sun_path)) { + plen = sizeof(addr.sun_path) - 1; + } + memset(addr.sun_path, 0, sizeof(addr.sun_path)); + memcpy(addr.sun_path, socketpath, plen); if(connect(s, (struct sockaddr *)&addr, sizeof(struct sockaddr_un)) < 0) { /*syslog(LOG_WARNING, "connect(\"%s\"): %m", socketpath);*/ diff --git a/libs/miniupnpc/miniwget.c b/libs/miniupnpc/miniwget.c index b4849e43da9..3fa930e313e 100644 --- a/libs/miniupnpc/miniwget.c +++ b/libs/miniupnpc/miniwget.c @@ -49,13 +49,11 @@ miniwget2(const char * url, const char * host, SOCKET s; #else int s; -#endif int fd_flags; +#endif struct sockaddr_in dest; struct hostent *hp; - fd_flags = 0; - *size = 0; hp = gethostbyname(host); if(hp==NULL) diff --git a/libs/sofia-sip/.update b/libs/sofia-sip/.update index 84d396ff553..bbf71612f84 100644 --- a/libs/sofia-sip/.update +++ b/libs/sofia-sip/.update @@ -1 +1 @@ -Wed Dec 11 15:38:35 MST 2019 +Thu Apr 30 14:02:03 UTC 2020 diff --git a/libs/sofia-sip/Makefile.am b/libs/sofia-sip/Makefile.am index 72796958657..9ef3ac8a5b0 100644 --- a/libs/sofia-sip/Makefile.am +++ b/libs/sofia-sip/Makefile.am @@ -7,7 +7,7 @@ AUTOMAKE_OPTIONS = foreign 1.7 
-SUBDIRS = libsofia-sip-ua $(GLIB_SUBDIRS) packages # tests s2check utils +SUBDIRS = libsofia-sip-ua $(GLIB_SUBDIRS) packages tests s2check utils DIST_SUBDIRS = s2check libsofia-sip-ua libsofia-sip-ua-glib utils packages \ tests win32 open_c diff --git a/libs/sofia-sip/libsofia-sip-ua/iptsec/auth_module.c b/libs/sofia-sip/libsofia-sip-ua/iptsec/auth_module.c index 9d29957cf0f..8d121faeecf 100644 --- a/libs/sofia-sip/libsofia-sip-ua/iptsec/auth_module.c +++ b/libs/sofia-sip/libsofia-sip-ua/iptsec/auth_module.c @@ -690,8 +690,6 @@ void auth_check_digest(auth_mod_t *am, return; } - phrase = "Bad authorization"; - #define PA "Authorization missing " if ((!ar->ar_username && (phrase = PA "username")) || diff --git a/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser.c b/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser.c index 16a3daa4f80..2e3a8d42afd 100644 --- a/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser.c +++ b/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser.c @@ -1244,7 +1244,7 @@ issize_t msg_parse_next_field(su_home_t *home, msg_header_t *prev, msg_header_t *msg_header_d(su_home_t *home, msg_t const *msg, char const *b) { msg_mclass_t const *mc = msg->m_class; - msg_href_t const *hr = mc->mc_unknown; + msg_href_t const *hr; isize_t n; /* Length of header contents */ isize_t name_len, xtra; msg_header_t *h; @@ -1384,7 +1384,6 @@ issize_t msg_extract_payload(msg_t *msg, msg_pub_t *mo, usize_t current, rest; current = msg->m_buffer->mb_size - msg->m_buffer->mb_used; - rest = body_len - current; /* Use all the data from our current buffer */ msg_buf_used(msg, current); diff --git a/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser_util.c b/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser_util.c index 11fc1358ebf..109ee60ff51 100644 --- a/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser_util.c +++ b/libs/sofia-sip/libsofia-sip-ua/msg/msg_parser_util.c @@ -1756,20 +1756,20 @@ issize_t msg_unquoted_e(char *b, isize_t bsiz, char const *s) size_t n = strcspn(s, "\"\\"); if (n == 0) { - if (e + 2 <= bsiz) + if (b && e + 2 <= bsiz) b[e] = '\\', b[e + 1] = s[0]; e += 2; s++; } else { - if (e + n <= bsiz) + if (b && (e + n <= bsiz)) memcpy(b + e, s, n); e += n; s += n; } } - if (e < bsiz) + if (b && e < bsiz) b[e] = '"'; e++; diff --git a/libs/sofia-sip/libsofia-sip-ua/msg/test_msg.c b/libs/sofia-sip/libsofia-sip-ua/msg/test_msg.c index 440f5d98ce8..6c85c47ad03 100644 --- a/libs/sofia-sip/libsofia-sip-ua/msg/test_msg.c +++ b/libs/sofia-sip/libsofia-sip-ua/msg/test_msg.c @@ -1293,7 +1293,7 @@ int test_mime(void) TEST_1(mp = mp->mp_next); TEST_1(mp->mp_data); - TEST(memcmp(mp->mp_data, CRLF "--" "LaGqGt4BI6Ho" CR LF, mp->mp_len), 0); + TEST(memcmp(mp->mp_data, CRLF "--" "LaGqGt4BI6Ho" CRLF, mp->mp_len), 0); TEST_1(c = mp->mp_content_type); TEST_S(c->c_type, "text/plain"); TEST_S(c->c_subtype, "plain"); diff --git a/libs/sofia-sip/libsofia-sip-ua/nea/nea_server.c b/libs/sofia-sip/libsofia-sip-ua/nea/nea_server.c index 42539cf82d6..4f1fce43a1b 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nea/nea_server.c +++ b/libs/sofia-sip/libsofia-sip-ua/nea/nea_server.c @@ -844,7 +844,7 @@ int nea_view_update(nea_server_t *nes, nea_view_queue(nes, evv, evq); SU_DEBUG_7(("nea_server_update(%p): %s (%s)\n", (void *)nes, - ev->ev_event->o_type, evv->evv_content_type->c_type)); + ev->ev_event->o_type, (evv->evv_content_type ? 
evv->evv_content_type->c_type : "N/A"))); return 1; } diff --git a/libs/sofia-sip/libsofia-sip-ua/nta/Makefile.am b/libs/sofia-sip/libsofia-sip-ua/nta/Makefile.am index 9beb3bb1393..c450fd1a732 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nta/Makefile.am +++ b/libs/sofia-sip/libsofia-sip-ua/nta/Makefile.am @@ -61,7 +61,7 @@ LDADD = libnta.la \ ../msg/libmsg.la \ ../bnf/libbnf.la \ ../su/libsu.la \ - ${top_builddir}/s2check/libs2.a + ${top_builddir}/s2check/libs2.a -lz test_nta_LDFLAGS = -static diff --git a/libs/sofia-sip/libsofia-sip-ua/nta/nta.c b/libs/sofia-sip/libsofia-sip-ua/nta/nta.c index 9b2e295007a..07bab4f6eb1 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nta/nta.c +++ b/libs/sofia-sip/libsofia-sip-ua/nta/nta.c @@ -2406,6 +2406,11 @@ int agent_init_via(nta_agent_t *self, tport_t *primaries, int use_maddr) } } + if (!via) { + SU_DEBUG_9(("nta: agent_init_via failed\n" VA_NONE)); + goto error; + } + /* Duplicate the list bind to the transports */ new_via = sip_via_dup(self->sa_home, via); /* Duplicate the complete list shown to the application */ @@ -3346,7 +3351,7 @@ void agent_recv_response(nta_agent_t *agent, return; } - if (sip->sip_cseq->cs_method == sip_method_ack) { + if (sip->sip_cseq && sip->sip_cseq->cs_method == sip_method_ack) { /* Drop response messages to ACK */ agent->sa_stats->as_bad_response++; agent->sa_stats->as_bad_message++; @@ -6932,7 +6937,7 @@ void incoming_retransmit_reply(nta_incoming_t *irq, tport_t *tport) } } - tport = tport_tsend(tport, msg, irq->irq_tpn, + tport_tsend(tport, msg, irq->irq_tpn, IF_SIGCOMP_TPTAG_COMPARTMENT(irq->irq_cc) TPTAG_MTU(INT_MAX), TAG_END()); irq->irq_agent->sa_stats->as_sent_msg++; @@ -6967,7 +6972,7 @@ enum { static void _nta_incoming_timer(nta_agent_t *sa) { - uint32_t now = su_time_ms(su_now()); + uint32_t now; nta_incoming_t *irq, *irq_next; size_t retransmitted = 0, timeout = 0, terminated = 0, destroyed = 0; size_t unconfirmed = @@ -8960,8 +8965,17 @@ void outgoing_trying(nta_outgoing_t *orq) { if (orq->orq_forked) ; - else if (orq->orq_method == sip_method_invite) - outgoing_queue(orq->orq_agent->sa_out.inv_calling, orq); + else if (orq->orq_method == sip_method_invite) { + if (!orq->orq_completed) { + outgoing_queue(orq->orq_agent->sa_out.inv_calling, orq); + } else { + SU_DEBUG_5(("nta(%p): completed request can not be put into inv_calling queue (%u)\n", (void *)orq, orq->orq_cseq->cs_seq)); + if (orq->orq_queue != orq->orq_agent->sa_out.inv_completed) { + /* Put back into inv_completed if it's not there by any reason */ + outgoing_queue(orq->orq_agent->sa_out.inv_completed, orq); /* Timer D */ + } + } + } else outgoing_queue(orq->orq_agent->sa_out.trying, orq); } @@ -9160,7 +9174,7 @@ outgoing_remove_fork(nta_outgoing_t *orq) nta_outgoing_t **slot; for (slot = &orq->orq_forking->orq_forks; - *slot; + slot && *slot; slot = &(*slot)->orq_forks) { if (orq == *slot) { *slot = orq->orq_forks; @@ -9958,10 +9972,16 @@ static int outgoing_query_a(nta_outgoing_t *orq, struct sipdns_query *); static void outgoing_answer_a(sres_context_t *orq, sres_query_t *q, sres_record_t *answers[]); +#ifdef __clang_analyzer__ +#define FUNC_ATTR_NONNULL(...) __attribute__((nonnull(__VA_ARGS__))) +#else +#define FUNC_ATTR_NONNULL(...) 
+#endif + static void outgoing_query_results(nta_outgoing_t *orq, struct sipdns_query *sq, char *results[], - size_t rlen); + size_t rlen) FUNC_ATTR_NONNULL(3); #define SIPDNS_503_ERROR 503, "DNS Error" @@ -10546,11 +10566,12 @@ struct sipdns_tport const * outgoing_naptr_tport(nta_outgoing_t *orq, sres_record_t *answers[]) { int i, j, order, pref; - int orders[SIPDNS_TRANSPORTS], prefs[SIPDNS_TRANSPORTS]; + int orders[SIPDNS_TRANSPORTS] = {0}, prefs[SIPDNS_TRANSPORTS] = {0}; struct sipdns_tport const *tport; struct sipdns_resolver *sr = orq->orq_resolver; + prefs[0] = 0; for (j = 0; sr->sr_tports[j]; j++) { tport = sr->sr_tports[j]; @@ -10682,22 +10703,24 @@ outgoing_answer_srv(sres_context_t *orq, sres_query_t *q, if (N > 1 && weight > 0) { unsigned rand = su_randint(0, weight - 1); - while (rand >= (*tail)->sq_weight) { + while (*tail && rand >= (*tail)->sq_weight) { rand -= (*tail)->sq_weight; tail = &(*tail)->sq_next; } } /* Remove selected */ - sq = *tail; *tail = sq->sq_next; assert(sq->sq_priority == priority); + if (*tail) { + sq = *tail; *tail = sq->sq_next; assert(sq->sq_priority == priority); - /* Append at *at */ - sq->sq_next = *at; *at = sq; at = &sq->sq_next; if (!*at) sr->sr_tail = at; + /* Append at *at */ + sq->sq_next = *at; *at = sq; at = &sq->sq_next; if (!*at) sr->sr_tail = at; - SU_DEBUG_5(("nta: %s IN SRV %u %u %s %s (%s)\n", - sq0->sq_domain, - (unsigned)sq->sq_priority, (unsigned)sq->sq_weight, - sq->sq_port, sq->sq_domain, sq->sq_proto)); + SU_DEBUG_5(("nta: %s IN SRV %u %u %s %s (%s)\n", + sq0->sq_domain, + (unsigned)sq->sq_priority, (unsigned)sq->sq_weight, + sq->sq_port, sq->sq_domain, sq->sq_proto)); + } } /* This is not needed anymore (?) */ @@ -10787,7 +10810,8 @@ void outgoing_answer_aaaa(sres_context_t *orq, sres_query_t *q, sres_free_answers(orq->orq_agent->sa_resolver, answers); - outgoing_query_results(orq, sq, results, found); + if (results) + outgoing_query_results(orq, sq, results, found); } #endif /* SU_HAVE_IN6 */ @@ -10869,7 +10893,10 @@ void outgoing_answer_a(sres_context_t *orq, sres_query_t *q, sres_free_answers(orq->orq_agent->sa_resolver, answers); - outgoing_query_results(orq, sq, results, found); + if (results) + outgoing_query_results(orq, sq, results, found); + else if (!q) + outgoing_resolving_error(orq, SIPDNS_503_ERROR); } /** Store A/AAAA query results */ diff --git a/libs/sofia-sip/libsofia-sip-ua/nth/nth_server.c b/libs/sofia-sip/libsofia-sip-ua/nth/nth_server.c index bbc61c46dee..5fcb15a5c61 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nth/nth_server.c +++ b/libs/sofia-sip/libsofia-sip-ua/nth/nth_server.c @@ -342,6 +342,13 @@ nth_site_t *nth_site_create(nth_site_t *parent, size_t i, j; path = (char *)url->url_path; + + if (!path) { + SU_DEBUG_3(("nth_site_create(): invalid url\n" VA_NONE)); + errno = EINVAL; + goto error; + } + while (path[0] == '/') path++; @@ -842,10 +849,10 @@ void server_request(server_t *srv, if (subsite) subsite->site_access = now; - else + else if (site) site->site_access = now; - if (subsite && subsite->site_isdir && subpath == site_nodir_match) { + if (site && subsite && subsite->site_isdir && subpath == site_nodir_match) { /* Answer with 301 */ http_location_t loc[1]; http_location_init(loc); diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/Makefile.am b/libs/sofia-sip/libsofia-sip-ua/nua/Makefile.am index eb2d1acb24f..6b6dfaa211e 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/Makefile.am +++ b/libs/sofia-sip/libsofia-sip-ua/nua/Makefile.am @@ -52,7 +52,7 @@ check_nua_SOURCES = check_nua.c check_nua.h \ 
check_etsi.c check_simple.c check_nua_LDADD = $(nua_libs) ${top_builddir}/s2check/libs2.a \ - @CHECK_LIBS@ + @CHECK_LIBS@ -lz nua_libs = libnua.la \ ../iptsec/libiptsec.la \ diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/nua_notifier.c b/libs/sofia-sip/libsofia-sip-ua/nua/nua_notifier.c index 8ef6747aff9..97247f9d6de 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/nua_notifier.c +++ b/libs/sofia-sip/libsofia-sip-ua/nua/nua_notifier.c @@ -853,6 +853,9 @@ static int nua_notify_usage_shutdown(nua_handle_t *nh, struct notifier_usage *nu = nua_dialog_usage_private(du); //nua_client_request_t *cr = du->du_cr; + if (!nu) + return -1; + nu->nu_substate = nua_substate_terminated; #if 0 if (cr) { diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/nua_params.c b/libs/sofia-sip/libsofia-sip-ua/nua/nua_params.c index 2d21acbaa28..2140eff7da8 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/nua_params.c +++ b/libs/sofia-sip/libsofia-sip-ua/nua/nua_params.c @@ -1122,7 +1122,7 @@ int nhp_save_params(nua_handle_t *nh, su_home_t *home = nh->nh_home; nua_t *nua = nh->nh_nua; nua_handle_t *dnh = nua->nua_dhandle; - nua_handle_preferences_t *dst = nh->nh_prefs, old[1]; + nua_handle_preferences_t *dst, old[1]; if (gsrc) { *nua->nua_prefs = *gsrc; /* No pointers this far */ diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/nua_register.c b/libs/sofia-sip/libsofia-sip-ua/nua/nua_register.c index 84e1d41fa95..df375b54b7e 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/nua_register.c +++ b/libs/sofia-sip/libsofia-sip-ua/nua/nua_register.c @@ -771,7 +771,6 @@ int nua_register_client_request(nua_client_request_t *cr, sip_header_remove(msg, sip, (sip_header_t *)sip->sip_contact); while (m->m_next) sip_header_remove(msg, sip, (sip_header_t *)m->m_next); - contacts = m; break; } @@ -2069,9 +2068,10 @@ sip_contact_t *nua_handle_contact_by_via(nua_handle_t *nh, /* Make transport parameter lowercase */ if (strlen(transport) < (sizeof _transport)) { - char *s = strcpy(_transport, transport); + char *s; short c; + strcpy(_transport, transport); for (s = _transport; (c = *s) && c != ';'; s++) if (isupper(c)) *s = tolower(c); diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/nua_registrar.c b/libs/sofia-sip/libsofia-sip-ua/nua/nua_registrar.c index fbef0166c88..a57b511005e 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/nua_registrar.c +++ b/libs/sofia-sip/libsofia-sip-ua/nua/nua_registrar.c @@ -86,6 +86,9 @@ static void nua_registrar_usage_remove(nua_handle_t *nh, ru = nua_dialog_usage_private(du); + if (!ru) + return; + if (ru->pending) tport_release(ru->tport, ru->pending, NULL, NULL, nh, 0), ru->pending = 0; diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/nua_session.c b/libs/sofia-sip/libsofia-sip-ua/nua/nua_session.c index 21c3d65af11..08bc8b69264 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/nua_session.c +++ b/libs/sofia-sip/libsofia-sip-ua/nua/nua_session.c @@ -774,7 +774,6 @@ static int nua_invite_client_request(nua_client_request_t *cr, nua_dialog_usage_t *du = cr->cr_usage; nua_session_usage_t *ss; int offer_sent = 0, retval; - sip_time_t invite_timeout; if (du == NULL) /* Call terminated */ return nua_client_return(cr, SIP_481_NO_TRANSACTION, msg); @@ -784,11 +783,6 @@ static int nua_invite_client_request(nua_client_request_t *cr, if (ss->ss_state >= nua_callstate_terminating) return nua_client_return(cr, 900, "Session is terminating", msg); - invite_timeout = NH_PGET(nh, invite_timeout); - if (invite_timeout == 0) - invite_timeout = UINT_MAX; - /* Send CANCEL if we don't get response within timeout*/ - /* 
nua_dialog_usage_set_expires(du, invite_timeout); Xyzzy */ nua_dialog_usage_reset_refresh(du); /* Add session timer headers */ @@ -3607,8 +3601,11 @@ int nua_update_server_respond(nua_server_request_t *sr, tagi_t const *tags) sr_status(sr, SIP_500_INTERNAL_SERVER_ERROR); } else { - sr->sr_answer_sent = 1, ss->ss_oa_sent = Answer; - ss->ss_sdp_version = soa_get_user_version(nh->nh_soa); + sr->sr_answer_sent = 1; + if (ss) { + ss->ss_oa_sent = Answer; + ss->ss_sdp_version = soa_get_user_version(nh->nh_soa); + } } } diff --git a/libs/sofia-sip/libsofia-sip-ua/nua/nua_stack.c b/libs/sofia-sip/libsofia-sip-ua/nua/nua_stack.c index 7364d4b2555..4c8a8379cee 100644 --- a/libs/sofia-sip/libsofia-sip-ua/nua/nua_stack.c +++ b/libs/sofia-sip/libsofia-sip-ua/nua/nua_stack.c @@ -313,7 +313,7 @@ int nua_stack_event(nua_t *nua, nua_handle_t *nh, msg_t *msg, assert(t == t_end); assert(b == end); (void)end; } else - p = e + 1; + p = ee + 1; ee->ee_nua = nua_stack_ref(nua); e->e_event = event; @@ -657,7 +657,7 @@ void nua_stack_signal(nua_t *nua, su_msg_r msg, nua_ee_data_t *ee) nua_stack_respond(nua, nh, e->e_status, e->e_phrase, tags); break; case nua_r_destroy: - if (!nh->nh_destroyed) { + if (nh && !nh->nh_destroyed) { nua_stack_destroy_handle(nua, nh, tags); su_msg_destroy(nua->nua_signal); } @@ -667,8 +667,10 @@ void nua_stack_signal(nua_t *nua, su_msg_r msg, nua_ee_data_t *ee) } if (error < 0) { - nua_stack_event(nh->nh_nua, nh, NULL, event, + if (nh) { + nua_stack_event(nh->nh_nua, nh, NULL, event, NUA_ERROR_AT(__FILE__, __LINE__), NULL); + } } su_msg_destroy(nua->nua_signal); diff --git a/libs/sofia-sip/libsofia-sip-ua/sdp/sdp.c b/libs/sofia-sip/libsofia-sip-ua/sdp/sdp.c index 93c0c230eeb..8ed84ff3572 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sdp/sdp.c +++ b/libs/sofia-sip/libsofia-sip-ua/sdp/sdp.c @@ -1189,8 +1189,8 @@ int sdp_session_cmp(sdp_session_t const *a, sdp_session_t const *b) for (ab = a->sdp_bandwidths, bb = b->sdp_bandwidths; ab || bb; - ab = ab->b_next, bb = bb->b_next) - if ((rv = sdp_bandwidth_cmp(a->sdp_bandwidths, b->sdp_bandwidths))) + ab = (ab ? ab->b_next : NULL), bb = (bb ? bb->b_next : NULL)) + if ((rv = sdp_bandwidth_cmp(ab, bb))) return rv; if ((rv = sdp_time_cmp(a->sdp_time, b->sdp_time))) @@ -1199,14 +1199,14 @@ int sdp_session_cmp(sdp_session_t const *a, sdp_session_t const *b) return rv; for (aa = a->sdp_attributes, ba = b->sdp_attributes; - aa || bb; - aa = aa->a_next, ba = ba->a_next) + aa || ba; + aa = (aa ? aa->a_next : NULL), ba = (ba ? ba->a_next : NULL)) if ((rv = sdp_attribute_cmp(aa, ba))) return rv; for (am = a->sdp_media, bm = b->sdp_media; am || bm; - am = am->m_next, bm = bm->m_next) + am = (am ? am->m_next : NULL), bm = (bm ? bm->m_next : NULL)) if ((rv = sdp_media_cmp(am, bm))) return rv; @@ -1244,6 +1244,8 @@ int sdp_connection_cmp(sdp_connection_t const *a, sdp_connection_t const *b) if ((a != NULL) != (b != NULL)) return (a != NULL) < (b != NULL) ? -1 : 1; + if (!a || !b) + return -1; if (a->c_nettype != b->c_nettype) return a->c_nettype < b->c_nettype ? -1 : 1; if (a->c_addrtype != b->c_addrtype) @@ -1266,6 +1268,9 @@ int sdp_bandwidth_cmp(sdp_bandwidth_t const *a, sdp_bandwidth_t const *b) if ((a != NULL) != (b != NULL)) return (a != NULL) < (b != NULL) ? -1 : 1; + if (!a || !b) + return -1; + if (a->b_modifier != b->b_modifier) return a->b_modifier < b->b_modifier ? 
-1 : 1; if (a->b_modifier == sdp_bw_x && @@ -1308,6 +1313,9 @@ int sdp_repeat_cmp(sdp_repeat_t const *a, sdp_repeat_t const *b) if ((a != NULL) != (b != NULL)) return (a != NULL) < (b != NULL) ? -1 : 1; + if (!a || !b) + return -1; + if (a->r_interval != b->r_interval) return a->r_interval < b->r_interval ? -1 : 1; if (a->r_duration != b->r_duration) @@ -1334,6 +1342,8 @@ int sdp_zone_cmp(sdp_zone_t const *a, sdp_zone_t const *b) if ((a != NULL) != (b != NULL)) return (a != NULL) < (b != NULL) ? -1 : 1; + if (!a || !b) + return -1; n = a->z_number_of_adjustments < b->z_number_of_adjustments ? a->z_number_of_adjustments : b->z_number_of_adjustments; for (i = 0; i < n; i++) { @@ -1360,6 +1370,9 @@ int sdp_key_cmp(sdp_key_t const *a, sdp_key_t const *b) if ((a != NULL) != (b != NULL)) return (a != NULL) < (b != NULL) ? -1 : 1; + if (!a || !b) + return -1; + if (a->k_method != b->k_method) return a->k_method < b->k_method ? -1 : 1; if (a->k_method == sdp_key_x && @@ -1393,6 +1406,9 @@ int sdp_rtpmap_cmp(sdp_rtpmap_t const *a, sdp_rtpmap_t const *b) if ((a != NULL) != (b != NULL)) return (a != NULL) < (b != NULL) ? -1 : 1; + if (!a || !b) + return -1; + if (a->rm_pt != b->rm_pt) return a->rm_pt < b->rm_pt ? -1 : 1; @@ -1452,6 +1468,9 @@ int sdp_media_cmp(sdp_media_t const *a, sdp_media_t const *b) if ((rv = (a != NULL) - (b != NULL))) return rv; + if (!a || !b) + return -1; + if (a->m_type != b->m_type) return a->m_type < b->m_type ? -1 : 1; if (a->m_type == sdp_media_x) @@ -1478,7 +1497,7 @@ int sdp_media_cmp(sdp_media_t const *a, sdp_media_t const *b) for (arm = a->m_rtpmaps, brm = b->m_rtpmaps; arm || brm; - arm = arm->rm_next, brm = brm->rm_next) + arm = (arm ? arm->rm_next : NULL), brm = (brm ? brm->rm_next : NULL)) if ((rv = sdp_rtpmap_cmp(arm, brm))) return rv; @@ -1490,13 +1509,13 @@ int sdp_media_cmp(sdp_media_t const *a, sdp_media_t const *b) for (ac = a->m_connections, bc = b->m_connections; ac || bc; - ac = ac->c_next, bc = bc->c_next) + ac = (ac ? ac->c_next : NULL), bc = (bc ? bc->c_next : NULL)) if ((rv = sdp_connection_cmp(ac, bc))) return rv; for (ab = a->m_bandwidths, bb = b->m_bandwidths; ab || bb; - ab = ab->b_next, bb = bb->b_next) + ab = (ab ? ab->b_next : NULL), bb = (bb ? bb->b_next : NULL)) if ((rv = sdp_bandwidth_cmp(ab, bb))) return rv; @@ -1505,7 +1524,7 @@ int sdp_media_cmp(sdp_media_t const *a, sdp_media_t const *b) for (aa = a->m_attributes, ba = b->m_attributes; aa || ba; - aa = aa->a_next, ba = ba->a_next) + aa = (aa ? aa->a_next : NULL), ba = (ba ? 
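/* Editor's aside: the sdp.c hunks above all apply one idiom -- when walking
   two singly linked lists that may differ in length, advance each cursor
   only while it is non-NULL and let the element comparator order a missing
   element against a present one. Standalone sketch with a hypothetical
   node type (the sdp_*_cmp functions return -1 for the mixed-NULL case;
   here the shorter list simply sorts first): */
#include <stdio.h>
#include <stddef.h>

typedef struct node { int value; struct node *next; } node_t;

static int node_cmp(const node_t *a, const node_t *b)
{
    if ((a != NULL) != (b != NULL))   /* exactly one list has ended */
        return a ? 1 : -1;
    if (!a)                           /* both ended: equal so far */
        return 0;
    return (a->value > b->value) - (a->value < b->value);
}

static int list_cmp(const node_t *a, const node_t *b)
{
    int rv;

    for (; a || b;
         a = a ? a->next : NULL, b = b ? b->next : NULL)
        if ((rv = node_cmp(a, b)))
            return rv;
    return 0;
}

int main(void)
{
    node_t b2 = { 2, NULL }, a1 = { 1, &b2 }, c1 = { 1, NULL };

    printf("%d\n", list_cmp(&a1, &c1)); /* 1: first list is longer */
    return 0;
}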
ba->a_next : NULL)) if ((rv = sdp_attribute_cmp(aa, ba))) return rv; diff --git a/libs/sofia-sip/libsofia-sip-ua/sdp/torture_sdp.c b/libs/sofia-sip/libsofia-sip-ua/sdp/torture_sdp.c index fcd25d645d9..f29ac7f7628 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sdp/torture_sdp.c +++ b/libs/sofia-sip/libsofia-sip-ua/sdp/torture_sdp.c @@ -225,6 +225,83 @@ static int test_session(void) END(); } + +static char const s0_cmp_msg[] = +"v=0\n" +"s=/sdp_torture\n" +"o=sdp_torture 0 0 IN IP4 0.0.0.0\n" +"b=AS:64\n" +"b=CRASH:32\n" +"m=audio 0 RTP/AVP 96 97 98 10 99 8 0\n" +"a=rtpmap:96 X-AMR-WB/16000\n" +"a=rtpmap:97 X-AMR/8000\n" +"a=rtpmap:98 GSM-EFR/8000\n" +"a=rtpmap:10 L16/16000\n" +"a=rtpmap:99 G723/8000\n" +"a=rtpmap:8 PCMA/8000\n" +"a=rtpmap:0 PCMU/8000\n" +"m=video 0 *\n" +"m=* 0 RTP/AVP *\n" +; + +static char const s1_cmp_msg[] = +"v=0\n" +"s=/sdp_torture\n" +"o=sdp_torture 0 0 IN IP4 0.0.0.0\n" +"b=AS:64\n" +"m=audio 0 RTP/AVP 96 97 98 10 99 8 0\n" +"a=rtpmap:96 X-AMR-WB/16000\n" +"a=rtpmap:97 X-AMR/8000\n" +"a=rtpmap:98 GSM-EFR/8000\n" +"a=rtpmap:10 L16/16000\n" +"a=rtpmap:99 G723/8000\n" +"a=rtpmap:8 PCMA/8000\n" +"a=rtpmap:0 PCMU/8000\n" +"m=video 0 *\n" +"m=* 0 RTP/AVP *\n" +; + +static int test_sdp_session_cmp(void) +{ + su_home_t *home = su_home_create(), *home2 = su_home_create(); + sdp_session_t *sdp_src, *sdp_target; + sdp_session_t const *sdp = NULL; + sdp_parser_t *parser, *parser2; + + BEGIN(); + + su_home_check(home); + TEST_1(home); + + su_home_check(home2); + TEST_1(home2); + + TEST_1((parser = sdp_parse(home, s0_cmp_msg, sizeof(s0_cmp_msg), sdp_f_config))); + TEST_1((sdp_src = sdp_session(parser))); + + TEST_1((parser2 = sdp_parse(home2, s1_cmp_msg, sizeof(s1_cmp_msg), sdp_f_config))); + TEST_1((sdp_target = sdp_session(parser2))); + + /* Check comparing */ + TEST(sdp_session_cmp(sdp_src, sdp_target), 1); + + /* frees all data created by the parser including 'sdp_src' */ + sdp_parser_free(parser); + + /* destroy the first home instance */ + su_home_check(home); + su_home_unref(home); + + /* frees all data created by the parser including 'sdp_target' */ + sdp_parser_free(parser2); + + /* destroy the second home object */ + su_home_check(home2); + su_home_unref(home2); + + END(); +} + static char const s1_msg[] = "v=0\r\n" "o=- 2435697 2435697 IN IP4 172.21.137.44\r\n" @@ -925,6 +1002,7 @@ int main(int argc, char *argv[]) null = fopen("/dev/null", "ab"); retval |= test_error(); fflush(stdout); + retval |= test_sdp_session_cmp(); fflush(stdout); retval |= test_session(); fflush(stdout); retval |= test_session2(); fflush(stdout); retval |= test_pint(); fflush(stdout); diff --git a/libs/sofia-sip/libsofia-sip-ua/sip/sip_extra.c b/libs/sofia-sip/libsofia-sip-ua/sip/sip_extra.c index 884ad407061..109bea84f26 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sip/sip_extra.c +++ b/libs/sofia-sip/libsofia-sip-ua/sip/sip_extra.c @@ -886,8 +886,8 @@ issize_t sip_if_match_e(char b[], isize_t bsiz, sip_header_t const *h, int f) static issize_t sip_info_d(su_home_t *home, sip_header_t *h, char *s, isize_t slen) { - sip_call_info_t *ci = h->sh_call_info; - char *end = s + slen; + sip_call_info_t *ci; + char *end; for(;;) { ci = h->sh_call_info; diff --git a/libs/sofia-sip/libsofia-sip-ua/sip/sip_pref_util.c b/libs/sofia-sip/libsofia-sip-ua/sip/sip_pref_util.c index 892d5d299d7..0535e36bc75 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sip/sip_pref_util.c +++ b/libs/sofia-sip/libsofia-sip-ua/sip/sip_pref_util.c @@ -312,7 +312,7 @@ int sip_prefs_matching(char const *pvalue, int sip_is_callerpref(char 
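/* Editor's aside: condensed usage of the API exercised by the new
   test_sdp_session_cmp() above, reusing its s0_cmp_msg/s1_cmp_msg bodies.
   Each message is parsed into its own su_home_t so the two sessions have
   independent lifetimes; sdp_session_cmp() is nonzero because s0 carries
   the extra "b=CRASH:32" line. Sketch only -- the TEST_* assertions and
   su_home_check() calls are dropped. */
#include <sofia-sip/su_alloc.h>
#include <sofia-sip/sdp.h>

static int sessions_differ(void)
{
    su_home_t *home = su_home_create(), *home2 = su_home_create();
    sdp_parser_t *pa = sdp_parse(home, s0_cmp_msg, sizeof(s0_cmp_msg), sdp_f_config);
    sdp_parser_t *pb = sdp_parse(home2, s1_cmp_msg, sizeof(s1_cmp_msg), sdp_f_config);
    int differ = sdp_session_cmp(sdp_session(pa), sdp_session(pb)) != 0;

    sdp_parser_free(pa);      /* frees everything parsed into home */
    sdp_parser_free(pb);
    su_home_unref(home);
    su_home_unref(home2);
    return differ;            /* 1 for these two bodies */
}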
const *param) { #define MATCH(s) \ - (su_casenmatch(param + 1, s + 1, strlen(s) - 1) && \ + (su_casenmatch(&param[1], &s[1], strlen(s) - 1) && \ (param[strlen(s)] == '=' || param[strlen(s)] == '\0')) int xor = 0, base = 0; diff --git a/libs/sofia-sip/libsofia-sip-ua/sip/sip_util.c b/libs/sofia-sip/libsofia-sip-ua/sip/sip_util.c index e5ef095eb0b..a5817915747 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sip/sip_util.c +++ b/libs/sofia-sip/libsofia-sip-ua/sip/sip_util.c @@ -257,9 +257,11 @@ sip_contact_string_from_via(su_home_t *home, /* Make transport parameter lowercase */ if (transport && strlen(transport) < (sizeof _transport)) { - char *s = strcpy(_transport, transport); + char *s; short c; + strcpy(_transport, transport); + for (s = _transport; (c = *s) && c != ';'; s++) if (isupper(c)) *s = tolower(c); diff --git a/libs/sofia-sip/libsofia-sip-ua/soa/soa.c b/libs/sofia-sip/libsofia-sip-ua/soa/soa.c index 0b838a15a24..bd1466fc6d6 100644 --- a/libs/sofia-sip/libsofia-sip-ua/soa/soa.c +++ b/libs/sofia-sip/libsofia-sip-ua/soa/soa.c @@ -2639,7 +2639,7 @@ best_listed_address_in_session(sdp_session_t const *sdp, for (m = sdp->sdp_media; m; m = m->m_next) { if (m->m_connections && m->m_connections != sdp->sdp_connection) { c = sdp->sdp_connection; - if (su_casenmatch(c->c_address, address, n) && c->c_address[n] == 0) + if (c && su_casenmatch(c->c_address, address, n) && c->c_address[n] == 0) break; c = NULL; } diff --git a/libs/sofia-sip/libsofia-sip-ua/sresolv/sres.c b/libs/sofia-sip/libsofia-sip-ua/sresolv/sres.c index a06650b13e7..d14ed309d30 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sresolv/sres.c +++ b/libs/sofia-sip/libsofia-sip-ua/sresolv/sres.c @@ -1640,7 +1640,8 @@ sres_record_compare(sres_record_t const *aa, sres_record_t const *bb) D = A->a6_prelen - B->a6_prelen; if (D) return D; D = !A->a6_prename - !B->a6_prename; if (D == 0 && A->a6_prename && B->a6_prename) - D = su_strcasecmp(A->a6_prename, B->a6_prename); if (D) return D; + D = su_strcasecmp(A->a6_prename, B->a6_prename); + if (D) return D; return memcmp(&A->a6_suffix, &B->a6_suffix, sizeof A->a6_suffix); } case sres_type_aaaa: @@ -3460,7 +3461,7 @@ sres_resolver_receive(sres_resolver_t *res, int socket) sres_record_t **reply; sres_server_t *dns; - struct sockaddr_storage from[1]; + struct sockaddr_storage from[1] = {{0}}; socklen_t fromlen = sizeof from; SU_DEBUG_9(("%s(%p, %u) called\n", "sres_resolver_receive", diff --git a/libs/sofia-sip/libsofia-sip-ua/sresolv/sresolv.c b/libs/sofia-sip/libsofia-sip-ua/sresolv/sresolv.c index 07202659261..30812a60f12 100644 --- a/libs/sofia-sip/libsofia-sip-ua/sresolv/sresolv.c +++ b/libs/sofia-sip/libsofia-sip-ua/sresolv/sresolv.c @@ -224,7 +224,7 @@ static int sres_sofia_update(sres_sofia_t *srs, if (!(srs->srs_reg + i)->reg_ptr) break; } - if (i > N) + if (i >= N) return su_seterrno(ENOMEM); reg = srs->srs_reg + i; diff --git a/libs/sofia-sip/libsofia-sip-ua/stun/Makefile.am b/libs/sofia-sip/libsofia-sip-ua/stun/Makefile.am index b3d6e69ac56..80b7669a9dc 100644 --- a/libs/sofia-sip/libsofia-sip-ua/stun/Makefile.am +++ b/libs/sofia-sip/libsofia-sip-ua/stun/Makefile.am @@ -14,6 +14,7 @@ AUTOMAKE_OPTIONS = foreign # Header paths AM_CPPFLAGS = $(INTERNAL_INCLUDES) +LD_FLAGS = -lz # ---------------------------------------------------------------------- # Build targets diff --git a/libs/sofia-sip/libsofia-sip-ua/su/su_localinfo.c b/libs/sofia-sip/libsofia-sip-ua/su/su_localinfo.c index 47e951f1445..492900f16e7 100644 --- a/libs/sofia-sip/libsofia-sip-ua/su/su_localinfo.c +++
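/* Editor's aside: the sresolv.c one-character fix above (i > N  ->  i >= N)
   closes an off-by-one. After `for (i = 0; i < N; i++) ... break;` the
   not-found case exits with i == N, never i > N, so the old test let a full
   table fall through and index one slot past the end. Minimal reproduction
   with a hypothetical slot table: */
#include <stdio.h>

#define N 4

int main(void)
{
    void *slots[N] = { (void *)1, (void *)1, (void *)1, (void *)1 }; /* all busy */
    int i;

    for (i = 0; i < N; i++)
        if (slots[i] == NULL)
            break;

    if (i >= N) {              /* correct: i == N means "no free slot" */
        puts("table full");
        return 1;
    }
    slots[i] = (void *)2;      /* reached only when i < N, so in bounds */
    return 0;
}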
b/libs/sofia-sip/libsofia-sip-ua/su/su_localinfo.c @@ -228,11 +228,15 @@ int su_getlocalinfo(su_localinfo_t const *hints, #endif case AF_INET: +#ifndef USE_LOCALINFO0 ip4 = 1; +#endif break; case 0: +#ifndef USE_LOCALINFO0 ip4 = 1; +#endif #if SU_HAVE_IN6 ip6 = 1; #endif diff --git a/libs/sofia-sip/libsofia-sip-ua/su/su_taglist.c b/libs/sofia-sip/libsofia-sip-ua/su/su_taglist.c index 431a2dcb565..27510a40244 100644 --- a/libs/sofia-sip/libsofia-sip-ua/su/su_taglist.c +++ b/libs/sofia-sip/libsofia-sip-ua/su/su_taglist.c @@ -485,8 +485,9 @@ tagi_t *t_filter(tagi_t *dst, size_t d = 0; for (f = filter; f; f = t_next(f)) { - if (TAG_TYPE_OF(f)->tt_filter) - d += (size_t)TAG_TYPE_OF(f)->tt_filter(NULL, f, src, bb); + tag_type_t tt_f = TAG_TYPE_OF(f); + if (tt_f->tt_filter) + d += (size_t)tt_f->tt_filter(NULL, f, src, bb); else if (tt == f->t_tag) { d += t_len(src); *bb = (char *)*bb + t_xtra(src, (size_t)*bb); diff --git a/libs/sofia-sip/libsofia-sip-ua/tport/tport.c b/libs/sofia-sip/libsofia-sip-ua/tport/tport.c index fbfdfa56b68..9eca15f77a1 100644 --- a/libs/sofia-sip/libsofia-sip-ua/tport/tport.c +++ b/libs/sofia-sip/libsofia-sip-ua/tport/tport.c @@ -1661,6 +1661,8 @@ int tport_bind_server(tport_master_t *mr, for (tbf = &mr->mr_primaries; *tbf; tbf = &(*tbf)->pri_next) ; + if (!res) + return -1; port = port0 = port1 = ntohs(((su_sockaddr_t *)res->ai_addr)->su_port); error = EPROTONOSUPPORT; diff --git a/libs/sofia-sip/libsofia-sip-ua/tport/tport_logging.c b/libs/sofia-sip/libsofia-sip-ua/tport/tport_logging.c index ac6003fb0b2..7ffe33ac417 100644 --- a/libs/sofia-sip/libsofia-sip-ua/tport/tport_logging.c +++ b/libs/sofia-sip/libsofia-sip-ua/tport/tport_logging.c @@ -866,7 +866,9 @@ void tport_log_msg(tport_t *self, msg_t *msg, } n = su_strncspn(s, end - s, "\r\n"); - + if (buffer_pos > buffer_size) { + break; + } bytes_written = snprintf(buffer + buffer_pos, buffer_size - buffer_pos, "%.*s", (int)n, s); if (bytes_written > 0) { buffer_pos += bytes_written; diff --git a/libs/sofia-sip/libsofia-sip-ua/tport/tport_tls.c b/libs/sofia-sip/libsofia-sip-ua/tport/tport_tls.c index 8b079e91171..7c293ae803d 100644 --- a/libs/sofia-sip/libsofia-sip-ua/tport/tport_tls.c +++ b/libs/sofia-sip/libsofia-sip-ua/tport/tport_tls.c @@ -394,7 +394,7 @@ int tls_init_context(tls_t *tls, tls_issues_t const *ti) ti->key)); } else { long options = SSL_OP_CIPHER_SERVER_PREFERENCE | SSL_OP_SINGLE_DH_USE; - options = SSL_CTX_set_options(tls->ctx, options); + SSL_CTX_set_options(tls->ctx, options); SU_DEBUG_3(("%s\n", "tls: initialized DHE")); } DH_free(dh); diff --git a/libs/sofia-sip/libsofia-sip-ua/tport/tport_type_tcp.c b/libs/sofia-sip/libsofia-sip-ua/tport/tport_type_tcp.c index 9b700dc1a20..db3a76ac69e 100644 --- a/libs/sofia-sip/libsofia-sip-ua/tport/tport_type_tcp.c +++ b/libs/sofia-sip/libsofia-sip-ua/tport/tport_type_tcp.c @@ -519,7 +519,6 @@ int tport_tcp_ping(tport_t *self, su_time_t now) else why = " blocking"; - return -1; } SU_DEBUG_7(("%s(%p): %s to " TPN_FORMAT "%s\n", diff --git a/libs/sofia-sip/libsofia-sip-ua/tport/ws.c b/libs/sofia-sip/libsofia-sip-ua/tport/ws.c index c004781e12c..0c2709ea98d 100644 --- a/libs/sofia-sip/libsofia-sip-ua/tport/ws.c +++ b/libs/sofia-sip/libsofia-sip-ua/tport/ws.c @@ -42,7 +42,7 @@ void deinit_ssl(void) } #else -static unsigned long pthreads_thread_id(void); +static void pthreads_thread_id(CRYPTO_THREADID *id); static void pthreads_locking_callback(int mode, int type, const char *file, int line); static pthread_mutex_t *lock_cs; @@ -62,7 +62,7 @@ static void 
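/* Editor's aside: the su_taglist.c hunk above caches TAG_TYPE_OF(f) in a
   local instead of expanding the macro twice, so both uses are guaranteed
   to see the same value and the expression is evaluated once. Generic
   sketch of the idiom with a hypothetical TYPE_OF macro: */
#include <stdio.h>

struct ops { int (*filter)(int); };
struct item { struct ops *type; int v; };

#define TYPE_OF(it) ((it)->type)   /* stand-in for TAG_TYPE_OF */

static int is_even(int v) { return (v & 1) == 0; }

int main(void)
{
    struct ops even_ops = { is_even };
    struct item it = { &even_ops, 4 };
    struct ops *t = TYPE_OF(&it);  /* evaluate once... */

    if (t->filter)                 /* ...then reuse the cached result */
        printf("%d\n", t->filter(it.v));  /* 1 */
    return 0;
}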
thread_setup(void) pthread_mutex_init(&(lock_cs[i]), NULL); } - CRYPTO_set_id_callback(pthreads_thread_id); + CRYPTO_THREADID_set_callback(pthreads_thread_id); CRYPTO_set_locking_callback(pthreads_locking_callback); } @@ -93,9 +93,9 @@ static void pthreads_locking_callback(int mode, int type, const char *file, int -static unsigned long pthreads_thread_id(void) +static void pthreads_thread_id(CRYPTO_THREADID *id) { - return (unsigned long) pthread_self(); + CRYPTO_THREADID_set_numeric(id, (unsigned long)pthread_self()); } diff --git a/libs/sofia-sip/s2check/Makefile.am b/libs/sofia-sip/s2check/Makefile.am index a6332ac6810..3a16f32ca34 100644 --- a/libs/sofia-sip/s2check/Makefile.am +++ b/libs/sofia-sip/s2check/Makefile.am @@ -9,6 +9,7 @@ # Header paths AM_CPPFLAGS = $(INTERNAL_INCLUDES) +AM_LDFLAGS = -ldl # ---------------------------------------------------------------------- # Build targets diff --git a/libs/sofia-sip/s2check/s2_localinfo.c b/libs/sofia-sip/s2check/s2_localinfo.c index 5c822753d2a..1d646a8e2e6 100644 --- a/libs/sofia-sip/s2check/s2_localinfo.c +++ b/libs/sofia-sip/s2check/s2_localinfo.c @@ -66,7 +66,7 @@ s2_getlocalinfo(su_localinfo_t const *hints, su_localinfo_t **return_localinfo) { int error = 0, ip4 = 0, ip6 = 0, i; - su_localinfo_t *result = NULL, **rr = &result; + su_localinfo_t *result = NULL, **rr; su_localinfo_t hh[1] = {{ 0 }}; assert(return_localinfo); diff --git a/libs/sofia-sip/s2check/s2sip.c b/libs/sofia-sip/s2check/s2sip.c index 57ef2ee3941..9c81c696fee 100644 --- a/libs/sofia-sip/s2check/s2sip.c +++ b/libs/sofia-sip/s2check/s2sip.c @@ -195,13 +195,14 @@ s2_sip_next_request(sip_method_t method, char const *name) struct message *m; for (m = s2sip->received; m; m = m->next) { - if (m->sip->sip_request) + if (m->sip->sip_request) { if (method == sip_method_unknown && name == NULL) return s2_sip_remove_message(m); - if (m->sip->sip_request->rq_method == method && - strcmp(m->sip->sip_request->rq_method_name, name) == 0) - return s2_sip_remove_message(m); + if (m->sip->sip_request->rq_method == method && + strcmp(m->sip->sip_request->rq_method_name, name) == 0) + return s2_sip_remove_message(m); + } } return NULL; diff --git a/libs/sofia-sip/s2check/s2tcase.c b/libs/sofia-sip/s2check/s2tcase.c index d67694c2ce4..382aaa3ad7f 100644 --- a/libs/sofia-sip/s2check/s2tcase.c +++ b/libs/sofia-sip/s2check/s2tcase.c @@ -57,7 +57,7 @@ void s2_tcase_add_test(TCase *tc, TFun tf, char const *name, if (strcmp(*patterns, "*")) { printf("%s: selected\n", name); } - _tcase_add_test(tc, tf, name, signo, start, end); + _tcase_add_test(tc, tf, name, signo, 0, start, end); return; } } @@ -67,7 +67,7 @@ void s2_tcase_add_test(TCase *tc, TFun tf, char const *name, if (strcmp(*patterns, "*")) { printf("%s: selected\n", name); } - _tcase_add_test(tc, tf, name, signo, start, end); + _tcase_add_test(tc, tf, name, signo, 0, start, end); return; } } diff --git a/libs/sofia-sip/s2check/s2time.c b/libs/sofia-sip/s2check/s2time.c index dd9a914e1d7..611b694b6a2 100644 --- a/libs/sofia-sip/s2check/s2time.c +++ b/libs/sofia-sip/s2check/s2time.c @@ -57,7 +57,7 @@ s2_timed_logger(void *stream, char const *fmt, va_list ap) { char buffer[4096]; su_time_t now = su_now(); - size_t prefix, wrote; + size_t prefix; int n; snprintf(buffer, sizeof buffer, @@ -70,5 +70,5 @@ s2_timed_logger(void *stream, char const *fmt, va_list ap) prefix = strlen(buffer); n = vsnprintf(buffer + prefix, (sizeof buffer) - prefix, fmt, ap); if (n > 0) - wrote = fwrite(buffer, prefix + n, 1, stream); + fwrite(buffer, prefix + 
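/* Editor's aside: the ws.c hunk above replaces OpenSSL's deprecated
   CRYPTO_set_id_callback() (thread id as a bare unsigned long) with the
   CRYPTO_THREADID API added in OpenSSL 1.0.0. Sketch of the pattern for an
   OpenSSL 1.0.x build; from 1.1.0 on these calls are no-ops because OpenSSL
   handles thread identity and locking itself. */
#include <pthread.h>
#include <openssl/crypto.h>

static void threadid_cb(CRYPTO_THREADID *id)
{
    /* Publish this thread's identity to OpenSSL as a numeric id. */
    CRYPTO_THREADID_set_numeric(id, (unsigned long)pthread_self());
}

static void ssl_threadid_setup(void)
{
    CRYPTO_THREADID_set_callback(threadid_cb);
}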
n, 1, stream); } diff --git a/libs/sofia-sip/tests/Makefile.am b/libs/sofia-sip/tests/Makefile.am index 6839d6a1327..cabc2b5899c 100644 --- a/libs/sofia-sip/tests/Makefile.am +++ b/libs/sofia-sip/tests/Makefile.am @@ -13,7 +13,7 @@ EXTRA_DIST = check_sofia.h check_sofia.c suite_for_nua.c test_nua_SOURCES = test_nua.c test_nua_LDADD = $(check_LIBRARIES) \ - ${sofiabuilddir}/libsofia-sip-ua.la + ${sofiabuilddir}/libsofia-sip-ua.la -ldl check_LIBRARIES = libtestnua.a libtestproxy.a libtestnat.a @@ -47,7 +47,7 @@ check_sofia_LDADD = $(check_LIBRARIES) \ @CHECK_LIBS@ check_dlopen_sofia_CFLAGS = -I$(top_srcdir)/s2check @CHECK_CFLAGS@ -check_dlopen_sofia_LDADD = ${top_builddir}/s2check/libs2.a @CHECK_LIBS@ +check_dlopen_sofia_LDADD = ${top_builddir}/s2check/libs2.a @CHECK_LIBS@ -ldl CLEANFILES = tmp_sippasswd.?????? diff --git a/libs/sofia-sip/utils/sip-dig.c b/libs/sofia-sip/utils/sip-dig.c index b2c81e0bfd7..14d5cc3cfd7 100644 --- a/libs/sofia-sip/utils/sip-dig.c +++ b/libs/sofia-sip/utils/sip-dig.c @@ -206,7 +206,9 @@ int main(int argc, char *argv[]) int exitcode = 0; int o_sctp = 1, o_tls_sctp = 1, o_verbatim = 1; int family = 0, multiple = 0; + /* char const *dnsserver = NULL; + */ char const *string; url_t *uri = NULL; @@ -272,8 +274,10 @@ int main(int argc, char *argv[]) if (!family) dig->ip4 = 1, dig->ip6 = 2; + /* if (argv[1] && argv[1][0] == '@') dnsserver = argv++[1] + 1; + */ if (!argv[1]) usage(2); diff --git a/libs/sofia-sip/utils/sip-options.c b/libs/sofia-sip/utils/sip-options.c index d7eef497dd1..660d3bcf7a1 100644 --- a/libs/sofia-sip/utils/sip-options.c +++ b/libs/sofia-sip/utils/sip-options.c @@ -398,7 +398,8 @@ char *readfile(FILE *f) len += fread(buffer + len, 1, 8192, f); } - buffer[len] = '\0'; + if (buffer) + buffer[len] = '\0'; return buffer; } diff --git a/libs/spandsp/src/at_interpreter.c b/libs/spandsp/src/at_interpreter.c index 50f1df11d54..8b95c48fa2a 100644 --- a/libs/spandsp/src/at_interpreter.c +++ b/libs/spandsp/src/at_interpreter.c @@ -217,6 +217,8 @@ SPAN_DECLARE(void) at_set_at_rx_mode(at_state_t *s, int new_mode) SPAN_DECLARE(void) at_put_response(at_state_t *s, const char *t) { uint8_t buf[3]; + + if (!s) return; buf[0] = s->p.s_regs[3]; buf[1] = s->p.s_regs[4]; diff --git a/libs/spandsp/src/gsm0610_rpe.c b/libs/spandsp/src/gsm0610_rpe.c index bf27c6550bb..3bd683b5d3d 100644 --- a/libs/spandsp/src/gsm0610_rpe.c +++ b/libs/spandsp/src/gsm0610_rpe.c @@ -242,7 +242,6 @@ static void rpe_grid_selection(int16_t x[40], int16_t xM[13], int16_t *Mc_out) /* The signal x[0..39] is used to select the RPE grid which is represented by Mc. 
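/* Editor's aside: the sip-options.c fix above writes the terminating NUL
   only when the buffer was actually allocated. A self-contained variant of
   the same read-everything helper (hypothetical read_all(), not the
   utility's code), keeping that guard and adding explicit realloc and read
   error handling: */
#include <stdio.h>
#include <stdlib.h>

static char *read_all(FILE *f)
{
    char *buf = NULL, *tmp;
    size_t len = 0;

    while (!feof(f)) {
        tmp = realloc(buf, len + 8192 + 1);  /* +1 reserves space for NUL */
        if (!tmp) {
            free(buf);                       /* don't leak on failure */
            return NULL;
        }
        buf = tmp;
        len += fread(buf + len, 1, 8192, f);
        if (ferror(f))
            break;                           /* return what we have */
    }
    if (buf)                                 /* the guard the patch adds */
        buf[len] = '\0';
    return buf;
}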
*/ - EM = 0; Mc = 0; #undef STEP @@ -560,7 +559,7 @@ void gsm0610_rpe_encoding(gsm0610_state_t *s, int16_t *Mc, int16_t xMc[13]) { - int16_t x[40]; + int16_t x[40] = {0}; int16_t xM[13]; int16_t xMp[13]; int16_t mant; diff --git a/libs/spandsp/src/plc.c b/libs/spandsp/src/plc.c index bc744c62400..007c9604d05 100644 --- a/libs/spandsp/src/plc.c +++ b/libs/spandsp/src/plc.c @@ -85,7 +85,7 @@ static __inline__ void normalise_history(plc_state_t *s) if (s->buf_ptr == 0) return; vec_copyi16(tmp, s->history, s->buf_ptr); - vec_copyi16(s->history, &s->history[s->buf_ptr], PLC_HISTORY_LEN - s->buf_ptr); + vec_movei16(s->history, &s->history[s->buf_ptr], PLC_HISTORY_LEN - s->buf_ptr); vec_copyi16(&s->history[PLC_HISTORY_LEN - s->buf_ptr], tmp, s->buf_ptr); s->buf_ptr = 0; } diff --git a/libs/spandsp/src/sig_tone.c b/libs/spandsp/src/sig_tone.c index 046b2e8735f..b21cd9f57dc 100644 --- a/libs/spandsp/src/sig_tone.c +++ b/libs/spandsp/src/sig_tone.c @@ -394,13 +394,13 @@ SPAN_DECLARE(int) sig_tone_rx(sig_tone_rx_state_t *s, int16_t amp[], int len) #if defined(SPANDSP_USE_FIXED_POINT) int16_t x; int32_t v; - int16_t notched_signal[3]; + int16_t notched_signal[3] = {0}; int16_t bandpass_signal; int16_t signal; #else float x; float v; - float notched_signal[3]; + float notched_signal[3] = {0}; float bandpass_signal; float signal; #endif @@ -409,7 +409,7 @@ SPAN_DECLARE(int) sig_tone_rx(sig_tone_rx_state_t *s, int16_t amp[], int len) int k; int l; int m; - int32_t notch_power[3]; + int32_t notch_power[3] = {0}; int32_t flat_power; int immediate; diff --git a/libs/spandsp/src/super_tone_rx.c b/libs/spandsp/src/super_tone_rx.c index a6324f91bfd..48dc02e693e 100644 --- a/libs/spandsp/src/super_tone_rx.c +++ b/libs/spandsp/src/super_tone_rx.c @@ -147,7 +147,7 @@ static int test_cadence(super_tone_rx_segment_t *pattern, int rotation) { int i; - int j; + int j = 0; if (rotation >= 0) { @@ -170,7 +170,8 @@ static int test_cadence(super_tone_rx_segment_t *pattern, return 0; } } - j = (rotation + steps - 1)%steps; + if (steps) + j = (rotation + steps - 1)%steps; if (pattern[j].f1 != test[9].f1 || pattern[j].f2 != test[9].f2) return 0; if (pattern[j].max_duration < test[9].min_duration*SUPER_TONE_BINS) @@ -314,9 +315,9 @@ static void super_tone_chunk(super_tone_rx_state_t *s) int k1; int k2; #if defined(SPANDSP_USE_FIXED_POINT) - int32_t res[SUPER_TONE_BINS/2]; + int32_t res[SUPER_TONE_BINS/2] = {0}; #else - float res[SUPER_TONE_BINS/2]; + float res[SUPER_TONE_BINS/2] = {0}; #endif for (i = 0; i < s->desc->monitored_frequencies; i++) diff --git a/libs/spandsp/src/t42.c b/libs/spandsp/src/t42.c index 748fc7180ca..697f6231069 100644 --- a/libs/spandsp/src/t42.c +++ b/libs/spandsp/src/t42.c @@ -397,7 +397,7 @@ int set_illuminant_from_code(logging_state_t *logging, lab_params_t *lab, const { int i; int colour_temp; - float xyz[3]; + float xyz[3] = { 0 }; if (memcmp(code, "CT", 2) == 0) { diff --git a/libs/spandsp/src/t4_tx.c b/libs/spandsp/src/t4_tx.c index b10131cf978..47353b62ae7 100644 --- a/libs/spandsp/src/t4_tx.c +++ b/libs/spandsp/src/t4_tx.c @@ -944,7 +944,7 @@ static int read_tiff_t85_image(t4_tx_state_t *s) break; } if (result == T4_DECODE_MORE_DATA) - result = t85_decode_put(&t85, NULL, 0); + t85_decode_put(&t85, NULL, 0); len = t85_decode_get_compressed_image_size(&t85); span_log(&s->logging, SPAN_LOG_WARNING, "Compressed image is %d bytes, %d rows\n", len/8, s->tiff.image_length); @@ -1476,7 +1476,6 @@ static int make_header(t4_tx_state_t *s) static int header_row_read_handler(void *user_data, uint8_t 
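/* Editor's aside: the plc.c fix above swaps vec_copyi16() for vec_movei16()
   because source (&history[buf_ptr]) and destination (history) overlap --
   the classic memcpy-vs-memmove distinction, where copying into an
   overlapping region is undefined. Minimal demonstration of the same shift
   done safely: */
#include <stdio.h>
#include <string.h>

int main(void)
{
    short hist[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    size_t ptr = 3;
    size_t i;

    /* Shift hist[3..7] down to hist[0..4]: the regions overlap, so this
       must be memmove (or a move-aware vector primitive), not memcpy. */
    memmove(hist, &hist[ptr], (8 - ptr) * sizeof hist[0]);

    for (i = 0; i < 8 - ptr; i++)
        printf("%d ", hist[i]);  /* 3 4 5 6 7 */
    putchar('\n');
    return 0;
}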
buf[], size_t len) { - int x_repeats; int y_repeats; int pattern; int pos; @@ -1490,51 +1489,40 @@ static int header_row_read_handler(void *user_data, uint8_t buf[], size_t len) { default: case T4_RESOLUTION_100_100: - x_repeats = 1; y_repeats = 1; break; case T4_RESOLUTION_R8_STANDARD: case T4_RESOLUTION_200_100: - x_repeats = 2; y_repeats = 1; break; case T4_RESOLUTION_R8_FINE: case T4_RESOLUTION_200_200: - x_repeats = 2; y_repeats = 2; break; case T4_RESOLUTION_300_300: - x_repeats = 3; y_repeats = 3; break; case T4_RESOLUTION_R8_SUPERFINE: case T4_RESOLUTION_200_400: - x_repeats = 2; y_repeats = 4; break; case T4_RESOLUTION_R16_SUPERFINE: case T4_RESOLUTION_400_400: - x_repeats = 4; y_repeats = 4; break; case T4_RESOLUTION_400_800: - x_repeats = 4; y_repeats = 8; break; case T4_RESOLUTION_300_600: - x_repeats = 3; y_repeats = 6; break; case T4_RESOLUTION_600_600: - x_repeats = 6; y_repeats = 6; break; case T4_RESOLUTION_600_1200: - x_repeats = 6; y_repeats = 12; break; case T4_RESOLUTION_1200_1200: - x_repeats = 12; y_repeats = 12; break; } @@ -1543,10 +1531,8 @@ static int header_row_read_handler(void *user_data, uint8_t buf[], size_t len) case T4_SUPPORT_WIDTH_215MM: break; case T4_SUPPORT_WIDTH_255MM: - x_repeats *= 2; break; case T4_SUPPORT_WIDTH_303MM: - x_repeats *= 3; break; } if (s->header_overlays_image) @@ -1883,7 +1869,6 @@ SPAN_DECLARE(int) t4_tx_set_tx_image_format(t4_tx_state_t *s, break; } } - res = T4_IMAGE_FORMAT_NOSIZESUPPORT; s->row_squashing_ratio = 1; if (s->metadata.width_code >= 0 && (supported_image_sizes & s->metadata.width_code)) { diff --git a/libs/srtp/srtp/srtp.c b/libs/srtp/srtp/srtp.c index 44625015e7b..966757e7ade 100644 --- a/libs/srtp/srtp/srtp.c +++ b/libs/srtp/srtp/srtp.c @@ -1046,7 +1046,6 @@ srtp_err_status_t srtp_stream_init_keys(srtp_stream_ctx_t *srtp, } } else { /* Reuse main KDF. 
*/ - rtp_xtn_hdr_keylen = rtp_keylen; rtp_xtn_hdr_base_key_len = rtp_base_key_len; rtp_xtn_hdr_salt_len = rtp_salt_len; xtn_hdr_kdf = &kdf; @@ -2601,7 +2600,7 @@ srtp_err_status_t srtp_unprotect_mki(srtp_ctx_t *ctx, return status; /* now compute auth function over packet */ - status = srtp_auth_update(session_keys->rtp_auth, (uint8_t *)auth_start, + srtp_auth_update(session_keys->rtp_auth, (uint8_t *)auth_start, *pkt_octet_len - tag_len - mki_size); /* run auth func over ROC, then write tmp tag */ diff --git a/libs/win32/libvpx/include_x64/vp8_rtcd.h b/libs/win32/libvpx/include_x64/vp8_rtcd.h index 82f987ca71e..67944099099 100644 --- a/libs/win32/libvpx/include_x64/vp8_rtcd.h +++ b/libs/win32/libvpx/include_x64/vp8_rtcd.h @@ -27,56 +27,56 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_mmx +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_sse2 -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_mmx +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_sse2 -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void 
vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); #define vp8_blend_b vp8_blend_b_c -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); #define vp8_blend_mb_inner vp8_blend_mb_inner_c -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); #define vp8_blend_mb_outer vp8_blend_mb_outer_c int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem16x16 vp8_copy_mem16x16_sse2 -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); 
+void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x4 vp8_copy_mem8x4_mmx -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); #define vp8_copy_mem8x8 vp8_copy_mem8x8_mmx -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_dc_only_idct_add vp8_dc_only_idct_add_mmx int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -87,8 +87,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); #define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_sse2 -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); #define vp8_dequant_idct_add vp8_dequant_idct_add_mmx void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -99,8 +99,8 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); #define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_sse2 -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); #define vp8_dequantize_b vp8_dequantize_b_mmx int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -128,36 +128,36 @@ int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd 
*d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2 -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_bv vp8_loop_filter_bv_sse2 -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbh vp8_loop_filter_mbh_sse2 -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); #define vp8_loop_filter_mbv vp8_loop_filter_mbv_sse2 -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_sse2 -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_bv 
vp8_loop_filter_bvs_sse2 -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbh vp8_loop_filter_simple_horizontal_edge_sse2 -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); #define vp8_loop_filter_simple_mbv vp8_loop_filter_simple_vertical_edge_sse2 int vp8_mbblock_error_c(struct macroblock *mb, int dc); @@ -168,8 +168,8 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); #define vp8_mbuverror vp8_mbuverror_sse2 -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); #define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); @@ -185,40 +185,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); #define vp8_short_fdct8x4 vp8_short_fdct8x4_sse2 -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); #define vp8_short_idct4x4llm vp8_short_idct4x4llm_mmx -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_sse2 -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short 
*input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); #define vp8_short_walsh4x4 vp8_short_walsh4x4_sse2 -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); - -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char 
*dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); + +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); diff --git a/libs/win32/libvpx/include_x64/vp9_rtcd.h b/libs/win32/libvpx/include_x64/vp9_rtcd.h index 0d9acc4d37a..b58898b4666 100644 --- a/libs/win32/libvpx/include_x64/vp9_rtcd.h +++ b/libs/win32/libvpx/include_x64/vp9_rtcd.h @@ -31,6 +31,10 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); 
int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); @@ -45,11 +49,6 @@ int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_sit int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); - void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht16x16 vp9_fht16x16_sse2 @@ -66,8 +65,8 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_sse2 -void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); @@ -81,6 +80,7 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, 
uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); @@ -91,10 +91,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); - void vp9_rtcd(void); #ifdef RTCD_C @@ -105,22 +101,21 @@ static void setup_rtcd_internal(void) (void)flags; + vp9_apply_temporal_filter = vp9_apply_temporal_filter_c; + if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1; vp9_block_error = vp9_block_error_sse2; if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2; vp9_block_error_fp = vp9_block_error_fp_sse2; if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2; vp9_diamond_search_sad = vp9_diamond_search_sad_c; if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx; - vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2; - if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3; vp9_quantize_fp = vp9_quantize_fp_sse2; if (flags & HAS_SSSE3) vp9_quantize_fp = vp9_quantize_fp_ssse3; + if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2; vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_c; if (flags & HAS_SSSE3) vp9_quantize_fp_32x32 = vp9_quantize_fp_32x32_ssse3; vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c; if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3; - vp9_temporal_filter_apply = vp9_temporal_filter_apply_c; - if (flags & 
HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1; } #endif diff --git a/libs/win32/libvpx/include_x64/vpx_config.asm b/libs/win32/libvpx/include_x64/vpx_config.asm index 5f038c50b60..769a167071b 100644 --- a/libs/win32/libvpx/include_x64/vpx_config.asm +++ b/libs/win32/libvpx/include_x64/vpx_config.asm @@ -72,12 +72,15 @@ CONFIG_ENCODE_PERF_TESTS equ 0 CONFIG_MULTI_RES_ENCODING equ 0 CONFIG_TEMPORAL_DENOISING equ 1 CONFIG_VP9_TEMPORAL_DENOISING equ 0 +CONFIG_CONSISTENT_RECODE equ 0 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0 CONFIG_VP9_HIGHBITDEPTH equ 0 CONFIG_BETTER_HW_COMPATIBILITY equ 0 CONFIG_EXPERIMENTAL equ 0 CONFIG_SIZE_LIMIT equ 0 CONFIG_ALWAYS_ADJUST_BPM equ 0 -CONFIG_SPATIAL_SVC equ 0 +CONFIG_BITSTREAM_DEBUG equ 0 +CONFIG_MISMATCH_DEBUG equ 0 CONFIG_FP_MB_STATS equ 0 CONFIG_EMULATE_HARDWARE equ 0 +CONFIG_NON_GREEDY_MV equ 0 diff --git a/libs/win32/libvpx/include_x64/vpx_config.h b/libs/win32/libvpx/include_x64/vpx_config.h index 13db9d6d28b..2f2f39d4d38 100644 --- a/libs/win32/libvpx/include_x64/vpx_config.h +++ b/libs/win32/libvpx/include_x64/vpx_config.h @@ -84,13 +84,16 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 +#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 0 #define CONFIG_ALWAYS_ADJUST_BPM 0 -#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_BITSTREAM_DEBUG 0 +#define CONFIG_MISMATCH_DEBUG 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_NON_GREEDY_MV 0 #endif /* VPX_CONFIG_H */ diff --git a/libs/win32/libvpx/include_x64/vpx_dsp_rtcd.h b/libs/win32/libvpx/include_x64/vpx_dsp_rtcd.h index d9aedba713c..f2da7ec6b58 100644 --- a/libs/win32/libvpx/include_x64/vpx_dsp_rtcd.h +++ b/libs/win32/libvpx/include_x64/vpx_dsp_rtcd.h @@ -77,162 +77,162 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_sse2 -void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c -void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c -void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c -void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c -void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void 
vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c -void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c -void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c -void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c -void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d153_predictor_8x8_ssse3(uint8_t 
*dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_sse2 -void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); 
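The long run of predictor hunks here is a pure parameter rename, `y_stride` to `stride`, with unchanged semantics: `stride` is the distance in bytes between consecutive rows of `dst`, and `above`/`left` are the reconstructed neighbor pixels the directional predictors read from. As an illustration of how such a `(dst, stride, above, left)` signature is used, here is a sketch in the spirit of the 4x4 horizontal predictor (a simplified scalar version, not libvpx's code):

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left)
{
    (void)above;  /* the horizontal predictor reads only the left column */
    for (int r = 0; r < 4; ++r) {
        memset(dst, left[r], 4);  /* replicate the left neighbor across the row */
        dst += stride;            /* advance to the next picture row */
    }
}
```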
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_sse2 -void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_sse2 -void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c -void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void 
vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c -void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_sse2 -void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_sse2 -void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_sse2 -void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_sse2 -void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const 
uint8_t *left); +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_sse2 -void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_sse2 -void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_sse2 -void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_sse2 -void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_sse2 -void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_sse2 -void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_sse2 -void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const 
uint8_t *left); +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_sse2 -void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_sse2 -void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_sse2 -void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_sse2 -void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_sse2 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); @@ -274,36 +274,36 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct8x8_1 vpx_fdct8x8_1_sse2 -void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride); #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c -void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); -void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); #define vpx_get8x8var vpx_get8x8var_sse2 unsigned int vpx_get_mb_ss_c(const int16_t *); unsigned int vpx_get_mb_ss_sse2(const int16_t *); #define vpx_get_mb_ss vpx_get_mb_ss_sse2 -void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_16x16 vpx_h_predictor_16x16_sse2 -void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_32x32 vpx_h_predictor_32x32_sse2 -void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_4x4 vpx_h_predictor_4x4_sse2 -void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2 void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); @@ -311,12 +311,17 @@ void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int1 void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t 
*coeff); +void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); +RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); + void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff); -void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); @@ -439,8 +444,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 -void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit); #define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); @@ -451,21 +456,22 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); #define vpx_minmax_8x8 vpx_minmax_8x8_sse2 -unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_sse2 +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x16 vpx_mse8x16_sse2 -unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); -unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_sse2 void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -500,8 +506,8 @@ void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t * void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -516,8 +522,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); 
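The `x4d` hunks likewise only rename `ref_ptr[]` to `ref_array[]`; the shape of the call is what matters: one source block is compared against four candidate reference blocks in a single call, producing four SADs. A simplified scalar sketch of what a 16x16 `x4d` kernel computes (the SSE2/AVX2 builds above vectorize the same loop; this is illustrative, not libvpx's implementation):

```c
#include <stdint.h>
#include <stdlib.h>

static void sad16x16x4d(const uint8_t *src_ptr, int src_stride,
                        const uint8_t *const ref_array[4], int ref_stride,
                        uint32_t *sad_array)
{
    for (int i = 0; i < 4; ++i) {          /* one SAD per candidate reference */
        const uint8_t *src = src_ptr;
        const uint8_t *ref = ref_array[i];
        uint32_t sad = 0;
        for (int r = 0; r < 16; ++r) {
            for (int c = 0; c < 16; ++c)
                sad += abs(src[c] - ref[c]);  /* sum of absolute differences */
            src += src_stride;
            ref += ref_stride;
        }
        sad_array[i] = sad;
    }
}
```

Batching four references per call amortizes the cost of loading the source block, which is why motion search favors the `x4d` entry points over four separate SAD calls.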
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -533,8 +539,8 @@ void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *r void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -551,8 +557,8 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -565,10 +571,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, 
uint32_t *sad_array); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -580,8 +586,8 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -596,8 +602,8 @@ void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -612,8 +618,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void 
vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -626,8 +632,8 @@ unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -640,11 +646,10 @@ unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -658,8 +663,8 @@ void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, 
int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -674,8 +679,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -690,8 +695,8 @@ void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -722,139 +727,139 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t 
vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t 
(*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t 
(*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int 
x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t 
*src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, 
int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, 
int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); @@ -864,96 +869,99 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); #define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_sse2 -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_sse2 -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_sse2 -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_sse2 -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_sse2 -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void 
vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_16x16 vpx_v_predictor_16x16_sse2 -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_32x32 vpx_v_predictor_32x32_sse2 -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_4x4 vpx_v_predictor_4x4_sse2 -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_v_predictor_8x8 vpx_v_predictor_8x8_sse2 -unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x32 vpx_variance16x32_sse2 - -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance16x8 vpx_variance16x8_sse2 - -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance32x32_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_variance32x64 vpx_variance32x64_sse2 - -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x4 vpx_variance4x4_sse2 -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance4x8 vpx_variance4x8_sse2 -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_c(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance8x16 vpx_variance8x16_sse2 -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance8x4 vpx_variance8x4_sse2 -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_variance8x8 vpx_variance8x8_sse2 -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1024,6 +1032,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_get16x16var = vpx_get16x16var_avx2; vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; if (flags & HAS_SSSE3) vpx_hadamard_8x8 = vpx_hadamard_8x8_ssse3; vpx_idct32x32_135_add = vpx_idct32x32_135_add_sse2; @@ -1038,6 +1048,8 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_lpf_horizontal_16_dual = vpx_lpf_horizontal_16_dual_avx2; vpx_mse16x16 = vpx_mse16x16_sse2; if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; + vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_quantize_b = vpx_quantize_b_sse2; if (flags & HAS_SSSE3) vpx_quantize_b = vpx_quantize_b_ssse3; if 
(flags & HAS_AVX) vpx_quantize_b = vpx_quantize_b_avx; @@ -1082,7 +1094,6 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_sad64x64_avg = vpx_sad64x64_avg_avx2; vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; - if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; vpx_sad8x16x3 = vpx_sad8x16x3_c; if (flags & HAS_SSE3) vpx_sad8x16x3 = vpx_sad8x16x3_sse3; vpx_sad8x16x8 = vpx_sad8x16x8_c; @@ -1153,10 +1164,16 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSSE3) vpx_sub_pixel_variance8x8 = vpx_sub_pixel_variance8x8_ssse3; vpx_variance16x16 = vpx_variance16x16_sse2; if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; + vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; + vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = vpx_variance32x16_avx2; vpx_variance32x32 = vpx_variance32x32_sse2; if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; + vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance64x32 = vpx_variance64x32_sse2; if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2; vpx_variance64x64 = vpx_variance64x64_sse2;
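/*
 * Context for the generated headers in this diff: libvpx binds its SIMD
 * kernels through run-time CPU detection (RTCD). For every routine with
 * several x86 variants, the header declares each variant (_c, _sse2,
 * _ssse3, _avx2, ...) plus one RTCD_EXTERN function pointer, and
 * setup_rtcd_internal() walks the CPUID feature flags, upgrading that
 * pointer to the fastest variant the host supports; routines with a single
 * viable variant skip the pointer and are bound at compile time with a
 * #define alias instead. Below is a minimal self-contained sketch of that
 * pattern; my_op, x86_simd_caps, and the flag values are illustrative
 * stand-ins, not part of libvpx or of this diff.
 */
#include <stdint.h>
#include <stdio.h>

#define HAS_SSE2 0x01
#define HAS_AVX2 0x02

/* Stand-in for libvpx's CPUID probe; here it pretends only SSE2 exists. */
static int x86_simd_caps(void) { return HAS_SSE2; }

/* One portable fallback plus two mock "SIMD" variants of the same routine. */
static uint32_t my_op_c(const uint8_t *p)    { return p[0]; }
static uint32_t my_op_sse2(const uint8_t *p) { return p[0] + 1u; }
static uint32_t my_op_avx2(const uint8_t *p) { return p[0] + 2u; }

/* The RTCD_EXTERN-style dispatch pointer; callers always go through this. */
static uint32_t (*my_op)(const uint8_t *p);

/* Analogue of setup_rtcd_internal(): start at the C baseline and upgrade
 * while the CPU advertises a faster extension, mirroring lines such as
 * "if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2;"
 * in the hunk above. */
static void setup_rtcd(void)
{
    int flags = x86_simd_caps();
    my_op = my_op_c;
    if (flags & HAS_SSE2) my_op = my_op_sse2;
    if (flags & HAS_AVX2) my_op = my_op_avx2;
}

int main(void)
{
    uint8_t px = 40;
    setup_rtcd();
    printf("%u\n", (unsigned)my_op(&px)); /* prints 41: SSE2 stand-in chosen */
    return 0;
}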
diff --git a/libs/win32/libvpx/include_x86/vp8_rtcd.h b/libs/win32/libvpx/include_x86/vp8_rtcd.h index bd0776f7cde..55374e68df2 100644 --- a/libs/win32/libvpx/include_x86/vp8_rtcd.h +++ b/libs/win32/libvpx/include_x86/vp8_rtcd.h @@ -27,57 +27,57 @@ struct yv12_buffer_config; extern "C" { #endif -void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict4x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr,
int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_bilinear_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_bilinear_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); #define vp8_blend_b vp8_blend_b_c -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); #define vp8_blend_mb_inner vp8_blend_mb_inner_c -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); #define vp8_blend_mb_outer vp8_blend_mb_outer_c int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); RTCD_EXTERN int (*vp8_block_error)(short *coeff, short *dqcoeff); -void vp8_copy32xn_c(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse2(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); -void vp8_copy32xn_sse3(const unsigned char *src_ptr, int source_stride, unsigned char
*dst_ptr, int dst_stride, int n); -RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int source_stride, unsigned char *dst_ptr, int dst_stride, int n); +void vp8_copy32xn_c(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse2(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +void vp8_copy32xn_sse3(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); +RTCD_EXTERN void (*vp8_copy32xn)(const unsigned char *src_ptr, int src_stride, unsigned char *dst_ptr, int dst_stride, int height); -void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem16x16_sse2(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem16x16_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem16x16)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x4_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x4_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x4)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -void vp8_copy_mem8x8_mmx(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +void vp8_copy_mem8x8_mmx(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); +RTCD_EXTERN void (*vp8_copy_mem8x8)(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -void vp8_dc_only_idct_add_mmx(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_dc_only_idct_add)(short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y, int 
mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); @@ -87,9 +87,9 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, u int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); RTCD_EXTERN int (*vp8_denoiser_filter_uv)(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); -void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); -void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *output, int stride); -RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *dest, int stride); +void vp8_dequant_idct_add_mmx(short *input, short *dq, unsigned char *dest, int stride); +RTCD_EXTERN void (*vp8_dequant_idct_add)(short *input, short *dq, unsigned char *dest, int stride); void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); void vp8_dequant_idct_add_uv_block_sse2(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); @@ -99,9 +99,9 @@ void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int void vp8_dequant_idct_add_y_block_sse2(short *q, short *dq, unsigned char *dst, int stride, char *eobs); RTCD_EXTERN void (*vp8_dequant_idct_add_y_block)(short *q, short *dq, unsigned char *dst, int stride, char *eobs); -void vp8_dequantize_b_c(struct blockd*, short *dqc); -void vp8_dequantize_b_mmx(struct blockd*, short *dqc); -RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *dqc); +void vp8_dequantize_b_c(struct blockd*, short *DQC); +void vp8_dequantize_b_mmx(struct blockd*, short *DQC); +RTCD_EXTERN void (*vp8_dequantize_b)(struct blockd*, short *DQC); int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_diamond_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); @@ -128,37 +128,37 @@ int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd * int vp8_full_search_sadx8(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); RTCD_EXTERN int (*vp8_full_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void 
(*vp8_loop_filter_bh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_bv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbh_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbh)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_mbv_sse2(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); -RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); +RTCD_EXTERN void (*vp8_loop_filter_mbv)(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi); -void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bhs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void 
(*vp8_loop_filter_simple_bh)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_bvs_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_bv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_horizontal_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbh)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); -void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y, int ystride, const unsigned char *blimit); -RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +void vp8_loop_filter_simple_vertical_edge_sse2(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); +RTCD_EXTERN void (*vp8_loop_filter_simple_mbv)(unsigned char *y_ptr, int y_stride, const unsigned char *blimit); int vp8_mbblock_error_c(struct macroblock *mb, int dc); int vp8_mbblock_error_sse2(struct macroblock *mb, int dc); @@ -168,9 +168,9 @@ int vp8_mbuverror_c(struct macroblock *mb); int vp8_mbuverror_sse2(struct macroblock *mb); RTCD_EXTERN int (*vp8_mbuverror)(struct macroblock *mb); -int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, 
struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -185,40 +185,40 @@ void vp8_short_fdct8x4_c(short *input, short *output, int pitch); void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_fdct8x4)(short *input, short *output, int pitch); -void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); -RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); +RTCD_EXTERN void (*vp8_short_idct4x4llm)(short *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride); -void vp8_short_inv_walsh4x4_c(short *input, short *output); -void vp8_short_inv_walsh4x4_sse2(short *input, short *output); -RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *output); +void vp8_short_inv_walsh4x4_c(short *input, short *mb_dqcoeff); +void vp8_short_inv_walsh4x4_sse2(short *input, short *mb_dqcoeff); +RTCD_EXTERN void (*vp8_short_inv_walsh4x4)(short *input, short *mb_dqcoeff); -void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +void vp8_short_inv_walsh4x4_1_c(short *input, short *mb_dqcoeff); #define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c void vp8_short_walsh4x4_c(short *input, short *output, int pitch); void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch); RTCD_EXTERN void (*vp8_short_walsh4x4)(short *input, short *output, int pitch); -void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict16x16_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict16x16)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int 
dst_pitch); -void vp8_sixtap_predict4x4_mmx(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict4x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict4x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x4_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x4)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); -void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_sse2(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -void vp8_sixtap_predict8x8_ssse3(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); -RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_c(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); +RTCD_EXTERN void (*vp8_sixtap_predict8x8)(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); void vp8_temporal_filter_apply_c(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); void vp8_temporal_filter_apply_sse2(unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int 
block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count); @@ -238,9 +238,9 @@ static void setup_rtcd_internal(void) if (flags & HAS_SSE2) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict16x16 = vp8_bilinear_predict16x16_ssse3; vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict4x4 = vp8_bilinear_predict4x4_sse2; vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_c; - if (flags & HAS_MMX) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_mmx; + if (flags & HAS_SSE2) vp8_bilinear_predict8x4 = vp8_bilinear_predict8x4_sse2; vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_c; if (flags & HAS_SSE2) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_sse2; if (flags & HAS_SSSE3) vp8_bilinear_predict8x8 = vp8_bilinear_predict8x8_ssse3; diff --git a/libs/win32/libvpx/include_x86/vp9_rtcd.h b/libs/win32/libvpx/include_x86/vp9_rtcd.h index 0bd53965a14..654e791d807 100644 --- a/libs/win32/libvpx/include_x86/vp9_rtcd.h +++ b/libs/win32/libvpx/include_x86/vp9_rtcd.h @@ -31,6 +31,10 @@ struct yv12_buffer_config; extern "C" { #endif +void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); +RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count); + int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); @@ -45,11 +49,6 @@ int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_sit int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct 
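For orientation: the RTCD (run-time CPU detection) pattern used throughout these generated headers resolves each dispatched symbol to the best SIMD variant the host CPU supports, once, at startup. Below is a minimal sketch of that mechanism; the names my_fn, my_setup_rtcd and MY_HAS_SSE2 are illustrative only and do not appear in this diff.

#include <stdio.h>

/* Hypothetical CPU-feature flag; libvpx derives the real flags from CPUID. */
#define MY_HAS_SSE2 0x1

static void my_fn_c(void)    { puts("plain C fallback"); }
static void my_fn_sse2(void) { puts("SSE2 variant"); }

/* The RTCD_EXTERN-style function pointer that all callers go through. */
static void (*my_fn)(void);

static void my_setup_rtcd(int flags)
{
    my_fn = my_fn_c;                             /* always-safe default */
    if (flags & MY_HAS_SSE2) my_fn = my_fn_sse2; /* upgrade when supported */
}

int main(void)
{
    my_setup_rtcd(MY_HAS_SSE2); /* in libvpx this runs inside setup_rtcd_internal() */
    my_fn();                    /* dispatches to the SSE2 variant */
    return 0;
}

This is exactly what the hunks above do when they swap a HAS_MMX guard for HAS_SSE2: the 4x4 and 8x4 bilinear predictors now upgrade to SSE2 code paths instead of the retired MMX ones.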
diff --git a/libs/win32/libvpx/include_x86/vp9_rtcd.h b/libs/win32/libvpx/include_x86/vp9_rtcd.h
index 0bd53965a14..654e791d807 100644
--- a/libs/win32/libvpx/include_x86/vp9_rtcd.h
+++ b/libs/win32/libvpx/include_x86/vp9_rtcd.h
@@ -31,6 +31,10 @@ struct yv12_buffer_config;
 extern "C" {
 #endif

+void vp9_apply_temporal_filter_c(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+void vp9_apply_temporal_filter_sse4_1(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+RTCD_EXTERN void (*vp9_apply_temporal_filter)(const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count);
+
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
@@ -45,11 +49,6 @@ int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_sit
 int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);

-void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 RTCD_EXTERN void (*vp9_fht16x16)(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -66,9 +65,9 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vp9_fwht4x4)(const int16_t *input, tran_low_t *output, int stride);

-void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);

 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
@@ -80,6 +79,7 @@ RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, in
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 RTCD_EXTERN void (*vp9_quantize_fp)(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);

 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
@@ -89,10 +89,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y
 void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 RTCD_EXTERN void (*vp9_scale_and_extend_frame)(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-void vp9_temporal_filter_apply_sse4_1(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-RTCD_EXTERN void (*vp9_temporal_filter_apply)(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-
 void vp9_rtcd(void);

 #ifdef RTCD_C
@@ -103,6 +99,8 @@ static void setup_rtcd_internal(void)
 (void)flags;

+ vp9_apply_temporal_filter = vp9_apply_temporal_filter_c;
+ if (flags & HAS_SSE4_1) vp9_apply_temporal_filter = vp9_apply_temporal_filter_sse4_1;
 vp9_block_error = vp9_block_error_c;
 if (flags & HAS_SSE2) vp9_block_error = vp9_block_error_sse2;
 if (flags & HAS_AVX2) vp9_block_error = vp9_block_error_avx2;
@@ -111,9 +109,6 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_AVX2) vp9_block_error_fp = vp9_block_error_fp_avx2;
 vp9_diamond_search_sad = vp9_diamond_search_sad_c;
 if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
- vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
- if (flags & HAS_SSE2) vp9_fdct8x8_quant = vp9_fdct8x8_quant_sse2;
- if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
 vp9_fht16x16 = vp9_fht16x16_c;
 if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
 vp9_fht4x4 = vp9_fht4x4_c;
@@ -130,10 +125,9 @@ static void setup_rtcd_internal(void)
 if (flags & HAS_SSE2) vp9_iht8x8_64_add = vp9_iht8x8_64_add_sse2;
 vp9_quantize_fp = vp9_quantize_fp_c;
 if (flags & HAS_SSE2) vp9_quantize_fp = vp9_quantize_fp_sse2;
+ if (flags & HAS_AVX2) vp9_quantize_fp = vp9_quantize_fp_avx2;
 vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_c;
 if (flags & HAS_SSSE3) vp9_scale_and_extend_frame = vp9_scale_and_extend_frame_ssse3;
- vp9_temporal_filter_apply = vp9_temporal_filter_apply_c;
- if (flags & HAS_SSE4_1) vp9_temporal_filter_apply = vp9_temporal_filter_apply_sse4_1;
 }
 #endif
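The vp9 header follows the same scheme: vp9_rtcd(void) must run before any dispatched symbol is used so that setup_rtcd_internal() can assign the pointers (note how the hunks above retire vp9_fdct8x8_quant and vp9_temporal_filter_apply, and wire up the new vp9_apply_temporal_filter instead). A sketch of the usual once-guard around such an init, with hypothetical my_* names and a deliberately single-threaded guard; real libvpx uses a proper once() primitive:

#include <stdio.h>

static void my_setup_rtcd_internal(void)
{
    puts("function pointers assigned here, exactly once");
}

static void my_once(void (*func)(void))
{
    static int done; /* not thread-safe; illustration only */
    if (!done) { done = 1; func(); }
}

void my_vp9_rtcd(void) { my_once(my_setup_rtcd_internal); }

int main(void)
{
    my_vp9_rtcd();
    my_vp9_rtcd(); /* second call is a no-op */
    return 0;
}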
diff --git a/libs/win32/libvpx/include_x86/vpx_config.asm b/libs/win32/libvpx/include_x86/vpx_config.asm
index a61d838f49e..17ae2df81a6 100644
--- a/libs/win32/libvpx/include_x86/vpx_config.asm
+++ b/libs/win32/libvpx/include_x86/vpx_config.asm
@@ -72,12 +72,15 @@ CONFIG_ENCODE_PERF_TESTS equ 0
 CONFIG_MULTI_RES_ENCODING equ 0
 CONFIG_TEMPORAL_DENOISING equ 1
 CONFIG_VP9_TEMPORAL_DENOISING equ 0
+CONFIG_CONSISTENT_RECODE equ 0
 CONFIG_COEFFICIENT_RANGE_CHECKING equ 0
 CONFIG_VP9_HIGHBITDEPTH equ 0
 CONFIG_BETTER_HW_COMPATIBILITY equ 0
 CONFIG_EXPERIMENTAL equ 0
 CONFIG_SIZE_LIMIT equ 0
 CONFIG_ALWAYS_ADJUST_BPM equ 0
-CONFIG_SPATIAL_SVC equ 0
+CONFIG_BITSTREAM_DEBUG equ 0
+CONFIG_MISMATCH_DEBUG equ 0
 CONFIG_FP_MB_STATS equ 0
 CONFIG_EMULATE_HARDWARE equ 0
+CONFIG_NON_GREEDY_MV equ 0
diff --git a/libs/win32/libvpx/include_x86/vpx_config.h b/libs/win32/libvpx/include_x86/vpx_config.h
index 2193ae1bf28..eea77b31334 100644
--- a/libs/win32/libvpx/include_x86/vpx_config.h
+++ b/libs/win32/libvpx/include_x86/vpx_config.h
@@ -84,13 +84,16 @@
 #define CONFIG_MULTI_RES_ENCODING 0
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
+#define CONFIG_CONSISTENT_RECODE 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
 #define CONFIG_VP9_HIGHBITDEPTH 0
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 0
 #define CONFIG_ALWAYS_ADJUST_BPM 0
-#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_BITSTREAM_DEBUG 0
+#define CONFIG_MISMATCH_DEBUG 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_NON_GREEDY_MV 0
 #endif /* VPX_CONFIG_H */
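The two config files describe one and the same build, so every flag appears twice: once as an equ for the assembler and once as a #define for C (the new CONFIG_NON_GREEDY_MV, CONFIG_BITSTREAM_DEBUG and CONFIG_MISMATCH_DEBUG entries above are added to both in lockstep). Consumers simply gate code on the macro at compile time; an illustrative use, where the guarded variable is hypothetical:

#include "vpx_config.h"

/* Compiled in or out depending on the flag fixed at configure time. */
#if CONFIG_TEMPORAL_DENOISING
static const int my_denoiser_available = 1;
#else
static const int my_denoiser_available = 0;
#endif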
diff --git a/libs/win32/libvpx/include_x86/vpx_dsp_rtcd.h b/libs/win32/libvpx/include_x86/vpx_dsp_rtcd.h
index fddc2be61b3..837a40a7cf4 100644
--- a/libs/win32/libvpx/include_x86/vpx_dsp_rtcd.h
+++ b/libs/win32/libvpx/include_x86/vpx_dsp_rtcd.h
@@ -77,163 +77,163 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
 void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 RTCD_EXTERN void (*vpx_convolve_copy)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);

-void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c

-void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c

-void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c

-void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c

-void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c

-void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c

-void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_c

-void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c

-void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d153_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d153_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d207_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d207_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d45_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c

-void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_d63_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c

-void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_128_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_left_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_dc_top_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride);
@@ -273,48 +273,53 @@ void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
 RTCD_EXTERN void (*vpx_fdct8x8_1)(const int16_t *input, tran_low_t *output, int stride);

-void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get16x16var_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get16x16var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);

-unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride);
 #define vpx_get4x4sse_cs vpx_get4x4sse_cs_c

-void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-void vpx_get8x8var_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+RTCD_EXTERN void (*vpx_get8x8var)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);

 unsigned int vpx_get_mb_ss_c(const int16_t *);
 unsigned int vpx_get_mb_ss_sse2(const int16_t *);
 RTCD_EXTERN unsigned int (*vpx_get_mb_ss)(const int16_t *);

-void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

-void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vpx_h_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);

 void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 void vpx_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 RTCD_EXTERN void (*vpx_hadamard_16x16)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);

+void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+RTCD_EXTERN void (*vpx_hadamard_32x32)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
+
 void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);
 RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff);

-void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c

 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -437,9 +442,9 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, co
 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
 RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);

-void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
-RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols,int flimit);
+void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, int pitch, int rows, int cols,int flimit);
+RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols,int flimit);

 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit);
 void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit);
@@ -449,22 +454,23 @@ void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *mi
 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
 RTCD_EXTERN void (*vpx_minmax_8x8)(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);

-unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);

-unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);

-unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_mse8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);

-unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
-RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_mse8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_mse8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); @@ -498,9 +504,9 @@ void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t * void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad16x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad16x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); void vpx_sad16x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -514,9 +520,9 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad16x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad16x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -531,9 +537,9 @@ void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *r void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad16x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad16x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); void vpx_sad16x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -549,9 +555,9 @@ unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x16_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x16_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad32x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -563,10 +569,10 @@ unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); 
-RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad32x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -578,9 +584,9 @@ unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad32x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad32x64_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad32x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -594,9 +600,9 @@ void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad4x4x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad4x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const 
ref_array[], int ref_stride, uint32_t *sad_array); void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); void vpx_sad4x4x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -610,9 +616,9 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad4x8_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad4x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -624,9 +630,9 @@ unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad64x32_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x32_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad64x32x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -638,11 +644,10 @@ unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_sad64x64_avg_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad64x64_avg)(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad64x64x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -656,9 +661,9 @@ void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x16x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad8x16x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); void vpx_sad8x16x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -672,9 +677,9 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); RTCD_EXTERN unsigned int (*vpx_sad8x4_avg)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void 
vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad8x4x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -688,9 +693,9 @@ void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); RTCD_EXTERN void (*vpx_sad8x8x3)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); -RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); +RTCD_EXTERN void (*vpx_sad8x8x4d)(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array); void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); void vpx_sad8x8x8_sse4_1(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); @@ -720,139 +725,139 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c -uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t 
vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); - -uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t 
(*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); - -uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t 
vpx_sub_pixel_avg_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x16)(const uint8_t *src_ptr, int 
src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_sub_pixel_avg_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_avg_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance16x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance32x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance4x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance4x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x32_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x32)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance64x64)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x16)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x4_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x4)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); 
+RTCD_EXTERN uint32_t (*vpx_sub_pixel_variance8x8)(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); void vpx_subtract_block_sse2(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); @@ -862,96 +867,99 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size); RTCD_EXTERN uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride, int size); -void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); - -unsigned int 
vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int 
vpx_variance4x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); - -void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, 
const uint8_t *left); +void vpx_tm_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_tm_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_16x16)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_32x32)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_4x4)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_v_predictor_8x8)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance16x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance32x64)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance4x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x32)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance64x64)(const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); @@ -1104,6 +1112,9 @@ static void setup_rtcd_internal(void) vpx_hadamard_16x16 = vpx_hadamard_16x16_c; if (flags & HAS_SSE2) vpx_hadamard_16x16 = vpx_hadamard_16x16_sse2; if (flags & HAS_AVX2) vpx_hadamard_16x16 = vpx_hadamard_16x16_avx2; + vpx_hadamard_32x32 = vpx_hadamard_32x32_c; + if (flags & HAS_SSE2) vpx_hadamard_32x32 = vpx_hadamard_32x32_sse2; + if (flags & HAS_AVX2) vpx_hadamard_32x32 = vpx_hadamard_32x32_avx2; vpx_hadamard_8x8 = vpx_hadamard_8x8_c; if (flags & HAS_SSE2) vpx_hadamard_8x8 = vpx_hadamard_8x8_sse2; vpx_idct16x16_10_add = vpx_idct16x16_10_add_c; @@ -1178,6 +1189,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_mse16x16 = vpx_mse16x16_avx2; vpx_mse16x8 = vpx_mse16x8_c; if (flags & HAS_SSE2) vpx_mse16x8 = vpx_mse16x8_sse2; + if (flags & HAS_AVX2) vpx_mse16x8 = vpx_mse16x8_avx2; vpx_mse8x16 = vpx_mse8x16_c; if (flags & HAS_SSE2) vpx_mse8x16 = vpx_mse8x16_sse2; vpx_mse8x8 = vpx_mse8x8_c; @@ -1279,7 +1291,6 @@ static void setup_rtcd_internal(void) vpx_sad64x64x4d = vpx_sad64x64x4d_c; if (flags & HAS_SSE2) vpx_sad64x64x4d = vpx_sad64x64x4d_sse2; if (flags & HAS_AVX2) vpx_sad64x64x4d = vpx_sad64x64x4d_avx2; - if (flags & HAS_AVX512) vpx_sad64x64x4d = vpx_sad64x64x4d_avx512; vpx_sad8x16 = vpx_sad8x16_c; if (flags & HAS_SSE2) vpx_sad8x16 = vpx_sad8x16_sse2; vpx_sad8x16_avg = vpx_sad8x16_avg_c; @@ -1418,8 +1429,10 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance16x16 = vpx_variance16x16_avx2; vpx_variance16x32 = vpx_variance16x32_c; if (flags & HAS_SSE2) vpx_variance16x32 = vpx_variance16x32_sse2; + if (flags & HAS_AVX2) vpx_variance16x32 = vpx_variance16x32_avx2; vpx_variance16x8 = vpx_variance16x8_c; if (flags & HAS_SSE2) vpx_variance16x8 = vpx_variance16x8_sse2; + if (flags & HAS_AVX2) vpx_variance16x8 = vpx_variance16x8_avx2; vpx_variance32x16 = vpx_variance32x16_c; if (flags & HAS_SSE2) vpx_variance32x16 = vpx_variance32x16_sse2; if (flags & HAS_AVX2) vpx_variance32x16 = 
vpx_variance32x16_avx2; @@ -1428,6 +1441,7 @@ static void setup_rtcd_internal(void) if (flags & HAS_AVX2) vpx_variance32x32 = vpx_variance32x32_avx2; vpx_variance32x64 = vpx_variance32x64_c; if (flags & HAS_SSE2) vpx_variance32x64 = vpx_variance32x64_sse2; + if (flags & HAS_AVX2) vpx_variance32x64 = vpx_variance32x64_avx2; vpx_variance4x4 = vpx_variance4x4_c; if (flags & HAS_SSE2) vpx_variance4x4 = vpx_variance4x4_sse2; vpx_variance4x8 = vpx_variance4x8_c; diff --git a/libs/win32/libvpx/libvpx.2017.vcxproj b/libs/win32/libvpx/libvpx.2017.vcxproj index aae4ba1f75a..0597ef2d65b 100644 --- a/libs/win32/libvpx/libvpx.2017.vcxproj +++ b/libs/win32/libvpx/libvpx.2017.vcxproj @@ -1,4 +1,4 @@ - + @@ -153,20 +153,6 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_ports_emms_asm.obj - $(IntDir)vpx_ports_emms_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_ports_emms_asm.obj - $(IntDir)vpx_ports_emms_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_ports_emms_asm.obj - $(IntDir)vpx_ports_emms_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_ports_emms_asm.obj - $(IntDir)vpx_ports_emms_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -188,130 +174,21 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_intrapred_ssse3_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" 
"%(FullPath)" -o $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - $(IntDir)vpx_dsp_x86_add_noise_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - $(IntDir)vpx_dsp_x86_deblock_sse2_asm.obj - + + $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ + $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - + $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - 
yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_8t_ssse3.asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_vpx_subpixel_bilinear_ssse3_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ /arch:AVX @@ -319,20 +196,6 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - $(IntDir)vpx_dsp_x86_vpx_convolve_copy_sse2_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -349,14 +212,6 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_fwd_txfm_ssse3_x86_64_asm.obj - $(IntDir)vpx_dsp_x86_fwd_txfm_ssse3_x86_64_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_fwd_txfm_ssse3_x86_64_asm.obj - $(IntDir)vpx_dsp_x86_fwd_txfm_ssse3_x86_64_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ /arch:AVX @@ -367,20 +222,6 @@ 
$(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - $(IntDir)vpx_dsp_x86_inv_wht_sse2_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -407,14 +248,6 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ /arch:AVX - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_avg_ssse3_x86_64_asm.obj - $(IntDir)vpx_dsp_x86_avg_ssse3_x86_64_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_avg_ssse3_x86_64_asm.obj - $(IntDir)vpx_dsp_x86_avg_ssse3_x86_64_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -427,54 +260,9 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse3_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - $(IntDir)vpx_dsp_x86_sad_ssse3_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f 
win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse4_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ /arch:AVX @@ -483,51 +271,12 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ /arch:AVX - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad4d_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - $(IntDir)vpx_dsp_x86_sad_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subtract_sse2_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ + + $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ + $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -535,33 +284,7 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ /arch:AVX - - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - /arch:AVX - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" 
-I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_ssim_opt_x86_64_asm.obj - $(IntDir)vpx_dsp_x86_ssim_opt_x86_64_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_ssim_opt_x86_64_asm.obj - $(IntDir)vpx_dsp_x86_ssim_opt_x86_64_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - $(IntDir)vpx_dsp_x86_subpel_variance_sse2_asm.obj - - + $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -576,9 +299,6 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -645,9 +365,6 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ @@ -660,202 +377,15 @@ $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - $(IntDir)vp8_common_x86_dequantize_mmx_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - $(IntDir)vp8_common_x86_idctllm_mmx_asm.obj - - - 
Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_recon_mmx_asm.obj - $(IntDir)vp8_common_x86_recon_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_recon_mmx_asm.obj - $(IntDir)vp8_common_x86_recon_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_recon_mmx_asm.obj - $(IntDir)vp8_common_x86_recon_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_recon_mmx_asm.obj - $(IntDir)vp8_common_x86_recon_mmx_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - $(IntDir)vp8_common_x86_subpixel_mmx_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_copy_sse2_asm.obj - $(IntDir)vp8_common_x86_copy_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_copy_sse2_asm.obj - $(IntDir)vp8_common_x86_copy_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_copy_sse2_asm.obj - $(IntDir)vp8_common_x86_copy_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_copy_sse2_asm.obj - $(IntDir)vp8_common_x86_copy_sse2_asm.obj - $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\ - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - Assembling %(Filename)%(Extension) - yasm -Xvc -f win64 -I"./include_x64" -I"../../libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - $(IntDir)vp8_common_x86_idctllm_sse2_asm.obj - - - Assembling %(Filename)%(Extension) - yasm -Xvc -g cv8 -f win32 -I".\include_x86" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)vp8_common_x86_recon_sse2_asm.obj - $(IntDir)vp8_common_x86_recon_sse2_asm.obj - Assembling 
[Diff to the libvpx Visual Studio project under libs/win32/libvpx; the XML markup of
these hunks did not survive extraction, so only the element text remains. The visible
text shows the per-file yasm <CustomBuild> rules, each a Message/Command/Outputs
triple per configuration of the form

    Assembling %(Filename)%(Extension)
    yasm -Xvc [-g cv8] -f win32|win64 -I".\include_x86"|-I"./include_x64" -I"..\..\libvpx" "%(FullPath)" -o $(IntDir)<module>_<source>_asm.obj

being removed for the old vp8_common/vp8_encoder source layout and re-added for the
libvpx v1.8.1 layout, together with new rules for the vpx_dsp assembly sources
(intrapred, add_noise, deblock, vpx_subpixel_8t and vpx_subpixel_bilinear for
SSE2/SSSE3, vpx_convolve_copy, fwd_txfm, inv_wht, avg, sad and sad4d for
SSE2/SSE3/SSSE3/SSE4, subtract, ssim_opt, subpel_variance), updated vp9_encoder
dct/quantize rules, rules for the vpx_ports sources (emms_mmx,
float_control_word), per-directory <ObjectFileName> values of

    $(IntDir)$([System.String]::Copy(%(RelativeDir)).Replace('\','_'))\

and /arch:AVX AdditionalOptions for the AVX sources.]
diff --git a/libs/win32/libvpx/vpx_version.h b/libs/win32/libvpx/vpx_version.h
index 5cff3b429f4..83f6172674e 100644
--- a/libs/win32/libvpx/vpx_version.h
+++ b/libs/win32/libvpx/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 0
+#define VERSION_MINOR 8
+#define VERSION_PATCH 1
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.0"
-#define VERSION_STRING " v1.6.0"
+#define VERSION_STRING_NOSP "v1.8.1"
+#define VERSION_STRING " v1.8.1"
diff --git a/src/cJSON.c b/src/cJSON.c
index 8cd284ebb65..6d0e4946fd4 100644
--- a/src/cJSON.c
+++ b/src/cJSON.c
@@ -1,16 +1,13 @@
 /*
-  Copyright (c) 2009 Dave Gamble
-
+  Copyright (c) 2009-2017 Dave Gamble and cJSON contributors
   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:
-
   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.
-
   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,29 +20,72 @@
 /* cJSON */
 /* JSON parser in C. */
+/* disable warnings about old C89 functions in MSVC */
+#if !defined(_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER)
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+
+#ifdef __GNUC__
+#pragma GCC visibility push(default)
+#endif
+#if defined(_MSC_VER)
+#pragma warning (push)
+/* disable warning about single line comments in system headers */
+#pragma warning (disable : 4001)
+#endif
+
 #include <string.h>
 #include <stdio.h>
 #include <math.h>
 #include <stdlib.h>
-#include <float.h>
 #include <limits.h>
 #include <ctype.h>
+
+#ifdef ENABLE_LOCALES
+#include <locale.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning (pop)
+#endif
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
+
 #include "switch_cJSON.h"

 /* define our own boolean type */
-typedef int cjbool;
-#define true ((cjbool)1)
-#define false ((cjbool)0)
+#ifdef true
+#undef true
+#endif
+#define true ((cJSON_bool)1)
+
+#ifdef false
+#undef false
+#endif
+#define false ((cJSON_bool)0)

-static const unsigned char *global_ep = NULL;
+typedef struct {
+    const unsigned char *json;
+    size_t position;
+} error;
+static error global_error = { NULL, 0 };

 CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void)
 {
-    return (const char*) global_ep;
+    return (const char*) (global_error.json + global_error.position);
+}
+
+CJSON_PUBLIC(char *) cJSON_GetStringValue(cJSON *item) {
+    if (!cJSON_IsString(item)) {
+        return NULL;
+    }
+
+    return item->valuestring;
 }

 /* This is a safeguard to prevent copy-pasters from using incompatible C and header files */
-#if (CJSON_VERSION_MAJOR != 1) || (CJSON_VERSION_MINOR != 3) || (CJSON_VERSION_PATCH != 0)
+#if (CJSON_VERSION_MAJOR != 1) || (CJSON_VERSION_MINOR != 7) || (CJSON_VERSION_PATCH != 12)
     #error cJSON.h and cJSON.c have different versions. Make sure that both have the same.
 #endif
@@ -57,49 +97,79 @@ CJSON_PUBLIC(const char*) cJSON_Version(void)
 {
     return version;
 }

-/* case insensitive strcmp */
-static int cJSON_strcasecmp(const unsigned char *s1, const unsigned char *s2)
+/* Case insensitive string comparison, doesn't consider two NULL pointers equal though */
+static int case_insensitive_strcmp(const unsigned char *string1, const unsigned char *string2)
 {
-    if (!s1)
+    if ((string1 == NULL) || (string2 == NULL))
     {
-        return (s1 == s2) ? 0 : 1; /* both NULL? */
+        return 1;
     }
-    if (!s2)
+
+    if (string1 == string2)
     {
-        return 1;
+        return 0;
     }
-    for(; tolower(*s1) == tolower(*s2); ++s1, ++s2)
+
+    for(; tolower(*string1) == tolower(*string2); (void)string1++, string2++)
     {
-        if (*s1 == '\0')
+        if (*string1 == '\0')
        {
             return 0;
         }
     }

-    return tolower(*s1) - tolower(*s2);
+    return tolower(*string1) - tolower(*string2);
 }

 typedef struct internal_hooks
 {
-    void *(*allocate)(size_t size);
-    void (*deallocate)(void *pointer);
-    void *(*reallocate)(void *pointer, size_t size);
+    void *(CJSON_CDECL *allocate)(size_t size);
+    void (CJSON_CDECL *deallocate)(void *pointer);
+    void *(CJSON_CDECL *reallocate)(void *pointer, size_t size);
 } internal_hooks;

-static internal_hooks global_hooks = { malloc, free, realloc };
+#if defined(_MSC_VER)
+/* work around MSVC error C2322: '...' address of dllimport '...' is not static */
+static void * CJSON_CDECL internal_malloc(size_t size)
+{
+    return malloc(size);
+}
+static void CJSON_CDECL internal_free(void *pointer)
+{
+    free(pointer);
+}
+static void * CJSON_CDECL internal_realloc(void *pointer, size_t size)
+{
+    return realloc(pointer, size);
+}
+#else
+#define internal_malloc malloc
+#define internal_free free
+#define internal_realloc realloc
+#endif
+
+/* strlen of character literals resolved at compile time */
+#define static_strlen(string_literal) (sizeof(string_literal) - sizeof(""))

-static unsigned char* cJSON_strdup(const unsigned char* str, const internal_hooks * const hooks)
+static internal_hooks global_hooks = { internal_malloc, internal_free, internal_realloc };
+
+static unsigned char* cJSON_strdup(const unsigned char* string, const internal_hooks * const hooks)
 {
-    size_t len = 0;
+    size_t length = 0;
     unsigned char *copy = NULL;

-    const unsigned char *s = str ? str : (unsigned char *)"";
-    len = strlen((const char*)s) + 1;
-    if (!(copy = (unsigned char*)hooks->allocate(len)))
+    if (string == NULL)
+    {
+        return NULL;
+    }
+
+    length = strlen((const char*)string) + sizeof("");
+    copy = (unsigned char*)hooks->allocate(length);
+    if (copy == NULL)
     {
         return NULL;
     }
-    memcpy(copy, s, len);
+    memcpy(copy, string, length);

     return copy;
 }
@@ -148,44 +218,110 @@ static cJSON *cJSON_New_Item(const internal_hooks * const hooks)
 }

 /* Delete a cJSON structure. */
-CJSON_PUBLIC(void) cJSON_Delete(cJSON *c)
+CJSON_PUBLIC(void) cJSON_Delete(cJSON *item)
 {
     cJSON *next = NULL;
-    while (c)
+    while (item != NULL)
     {
-        next = c->next;
-        if (!(c->type & cJSON_IsReference) && c->child)
+        next = item->next;
+        if (!(item->type & cJSON_IsReference) && (item->child != NULL))
         {
-            cJSON_Delete(c->child);
+            cJSON_Delete(item->child);
         }
-        if (!(c->type & cJSON_IsReference) && c->valuestring)
+        if (!(item->type & cJSON_IsReference) && (item->valuestring != NULL))
         {
-            global_hooks.deallocate(c->valuestring);
+            global_hooks.deallocate(item->valuestring);
         }
-        if (!(c->type & cJSON_StringIsConst) && c->string)
+        if (!(item->type & cJSON_StringIsConst) && (item->string != NULL))
         {
-            global_hooks.deallocate(c->string);
+            global_hooks.deallocate(item->string);
         }
-        global_hooks.deallocate(c);
-        c = next;
+        global_hooks.deallocate(item);
+        item = next;
     }
 }

+/* get the decimal point character of the current locale */
+static unsigned char get_decimal_point(void)
+{
+#ifdef ENABLE_LOCALES
+    struct lconv *lconv = localeconv();
+    return (unsigned char) lconv->decimal_point[0];
+#else
+    return '.';
+#endif
+}
+
+typedef struct
+{
+    const unsigned char *content;
+    size_t length;
+    size_t offset;
+    size_t depth; /* How deeply nested (in arrays/objects) is the input at the current offset. */
+    internal_hooks hooks;
+} parse_buffer;
+
+/* check if the given size is left to read in a given parse buffer (starting with 1) */
+#define can_read(buffer, size) ((buffer != NULL) && (((buffer)->offset + size) <= (buffer)->length))
+/* check if the buffer can be accessed at the given index (starting with 0) */
+#define can_access_at_index(buffer, index) ((buffer != NULL) && (((buffer)->offset + index) < (buffer)->length))
+#define cannot_access_at_index(buffer, index) (!can_access_at_index(buffer, index))
+/* get a pointer to the buffer at the position */
+#define buffer_at_offset(buffer) ((buffer)->content + (buffer)->offset)
+
 /* Parse the input text to generate a number, and populate the result into item. */
-static const unsigned char *parse_number(cJSON * const item, const unsigned char * const input)
+static cJSON_bool parse_number(cJSON * const item, parse_buffer * const input_buffer)
 {
     double number = 0;
     unsigned char *after_end = NULL;
+    unsigned char number_c_string[64];
+    unsigned char decimal_point = get_decimal_point();
+    size_t i = 0;

-    if (input == NULL)
+    if ((input_buffer == NULL) || (input_buffer->content == NULL))
     {
-        return NULL;
+        return false;
+    }
+
+    /* copy the number into a temporary buffer and replace '.' with the decimal point
+     * of the current locale (for strtod)
+     * This also takes care of '\0' not necessarily being available for marking the end of the input */
+    for (i = 0; (i < (sizeof(number_c_string) - 1)) && can_access_at_index(input_buffer, i); i++)
+    {
+        switch (buffer_at_offset(input_buffer)[i])
+        {
+            case '0':
+            case '1':
+            case '2':
+            case '3':
+            case '4':
+            case '5':
+            case '6':
+            case '7':
+            case '8':
+            case '9':
+            case '+':
+            case '-':
+            case 'e':
+            case 'E':
+                number_c_string[i] = buffer_at_offset(input_buffer)[i];
+                break;
+
+            case '.':
+                number_c_string[i] = decimal_point;
+                break;
+
+            default:
+                goto loop_end;
+        }
     }
+loop_end:
+    number_c_string[i] = '\0';

-    number = strtod((const char*)input, (char**)&after_end);
-    if (input == after_end)
+    number = strtod((const char*)number_c_string, (char**)&after_end);
+    if (number_c_string == after_end)
     {
-        return NULL; /* parse_error */
+        return false; /* parse_error */
     }

     item->valuedouble = number;
@@ -195,7 +331,7 @@ static const unsigned char *parse_number(cJSON * const item, const unsigned char
     {
         item->valueint = INT_MAX;
     }
-    else if (number <= INT_MIN)
+    else if (number <= (double)INT_MIN)
     {
         item->valueint = INT_MIN;
     }
@@ -206,7 +342,8 @@ static const unsigned char *parse_number(cJSON * const item, const unsigned char

     item->type = cJSON_Number;

-    return after_end;
+    input_buffer->offset += (size_t)(after_end - number_c_string);
+    return true;
 }

 /* don't ask me, but the original cJSON_SetNumberValue returns an integer or double */
@@ -216,13 +353,13 @@ CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number)
     {
         object->valueint = INT_MAX;
     }
-    else if (number <= INT_MIN)
+    else if (number <= (double)INT_MIN)
     {
         object->valueint = INT_MIN;
     }
     else
     {
-        object->valueint = cJSON_Number;
+        object->valueint = (int)number;
     }

     return object->valuedouble = number;
@@ -233,11 +370,14 @@ typedef struct
     unsigned char *buffer;
     size_t length;
     size_t offset;
-    cjbool noalloc;
+    size_t depth; /* current nesting depth (for formatted printing) */
+    cJSON_bool noalloc;
+    cJSON_bool format; /* is this print a formatted print */
+    internal_hooks hooks;
 } printbuffer;

 /* realloc printbuffer if necessary to have at least "needed" bytes more */
-static unsigned char* ensure(printbuffer * const p, size_t needed, const internal_hooks * const hooks)
+static unsigned char* ensure(printbuffer * const p, size_t needed)
 {
     unsigned char *newbuffer = NULL;
     size_t newsize = 0;
@@ -247,13 +387,19 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna
         return NULL;
     }

+    if ((p->length > 0) && (p->offset >= p->length))
+    {
+        /* make sure that offset is valid */
+        return NULL;
+    }
+
     if (needed > INT_MAX)
     {
         /* sizes bigger than INT_MAX are currently not supported */
         return NULL;
     }

-    needed += p->offset;
+    needed += p->offset + 1;
     if (needed <= p->length)
     {
         return p->buffer + p->offset;
@@ -264,8 +410,7 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna
     }

     /* calculate new buffer size */
-    newsize = needed * 2;
-    if (newsize > INT_MAX)
+    if (needed > (INT_MAX / 2))
     {
         /* overflow of int, use INT_MAX if possible */
         if (needed <= INT_MAX)
@@ -277,19 +422,31 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna
             return NULL;
         }
     }
+    else
+    {
+        newsize = needed * 2;
+    }

-    if (hooks->reallocate != NULL)
+    if (p->hooks.reallocate != NULL)
     {
         /* reallocate with realloc if available */
-        newbuffer = (unsigned char*)hooks->reallocate(p->buffer, newsize);
+        newbuffer = (unsigned char*)p->hooks.reallocate(p->buffer, newsize);
+        if (newbuffer == NULL)
+        {
+            p->hooks.deallocate(p->buffer);
+            p->length = 0;
+            p->buffer = NULL;
+
+            return NULL;
+        }
     }
     else
     {
         /* otherwise reallocate manually */
-        newbuffer = (unsigned char*)hooks->allocate(newsize);
+        newbuffer = (unsigned char*)p->hooks.allocate(newsize);
         if (!newbuffer)
         {
-            hooks->deallocate(p->buffer);
+            p->hooks.deallocate(p->buffer);
             p->length = 0;
             p->buffer = NULL;

@@ -299,7 +456,7 @@ static unsigned char* ensure(printbuffer * const p, size_t needed, const interna
         {
             memcpy(newbuffer, p->buffer, p->offset + 1);
         }
-        hooks->deallocate(p->buffer);
+        p->hooks.deallocate(p->buffer);
     }
     p->length = newsize;
     p->buffer = newbuffer;
@@ -321,54 +478,69 @@ static void update_offset(printbuffer * const buffer)
 }

 /* Render the number nicely from the given item into a string. */
-static unsigned char *print_number(const cJSON * const item, printbuffer * const output_buffer, const internal_hooks * const hooks)
+static cJSON_bool print_number(const cJSON * const item, printbuffer * const output_buffer)
 {
     unsigned char *output_pointer = NULL;
     double d = item->valuedouble;
+    int length = 0;
+    size_t i = 0;
+    unsigned char number_buffer[26]; /* temporary buffer to print the number into */
+    unsigned char decimal_point = get_decimal_point();
+    double test;

     if (output_buffer == NULL)
     {
-        return NULL;
+        return false;
     }

-    /* value is an int */
-    if ((fabs(((double)item->valueint) - d) <= DBL_EPSILON) && (d <= INT_MAX) && (d >= INT_MIN))
+    /* This checks for NaN and Infinity */
+    if ((d * 0) != 0)
+    {
+        length = sprintf((char*)number_buffer, "null");
+    }
+    else
     {
-        /* 2^64+1 can be represented in 21 chars. */
-        output_pointer = ensure(output_buffer, 21, hooks);
-        if (output_pointer != NULL)
+        /* Try 15 decimal places of precision to avoid nonsignificant nonzero digits */
+        length = sprintf((char*)number_buffer, "%1.15g", d);
+
+        /* Check whether the original double can be recovered */
+        if ((sscanf((char*)number_buffer, "%lg", &test) != 1) || ((double)test != d))
         {
-            sprintf((char*)output_pointer, "%d", item->valueint);
+            /* If not, print with 17 decimal places of precision */
+            length = sprintf((char*)number_buffer, "%1.17g", d);
         }
     }
-    /* value is a floating point number */
-    else
+
+    /* sprintf failed or buffer overrun occurred */
+    if ((length < 0) || (length > (int)(sizeof(number_buffer) - 1)))
+    {
+        return false;
+    }
+
+    /* reserve appropriate space in the output */
+    output_pointer = ensure(output_buffer, (size_t)length + sizeof(""));
+    if (output_pointer == NULL)
+    {
+        return false;
+    }
+
+    /* copy the printed number to the output and replace locale
+     * dependent decimal point with '.' */
+    for (i = 0; i < ((size_t)length); i++)
     {
-        /* This is a nice tradeoff. */
-        output_pointer = ensure(output_buffer, 64, hooks);
-        if (output_pointer != NULL)
+        if (number_buffer[i] == decimal_point)
         {
-            /* This checks for NaN and Infinity */
-            if ((d * 0) != 0)
-            {
-                sprintf((char*)output_pointer, "null");
-            }
-            else if ((fabs(floor(d) - d) <= DBL_EPSILON) && (fabs(d) < 1.0e60))
-            {
-                sprintf((char*)output_pointer, "%.0f", d);
-            }
-            else if ((fabs(d) < 1.0e-6) || (fabs(d) > 1.0e9))
-            {
-                sprintf((char*)output_pointer, "%e", d);
-            }
-            else
-            {
-                sprintf((char*)output_pointer, "%f", d);
-            }
+            output_pointer[i] = '.';
+            continue;
         }
+
+        output_pointer[i] = number_buffer[i];
     }
+    output_pointer[i] = '\0';
+
+    output_buffer->offset += (size_t)length;

-    return output_pointer;
+    return true;
 }

 /* parse 4 digit hexadecimal number */
@@ -409,28 +581,19 @@ static unsigned parse_hex4(const unsigned char * const input)

 /* converts a UTF-16 literal to UTF-8
  * A literal can be one or two sequences of the form \uXXXX */
-static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer, const unsigned char **error_pointer)
+static unsigned char utf16_literal_to_utf8(const unsigned char * const input_pointer, const unsigned char * const input_end, unsigned char **output_pointer)
 {
-    /* first bytes of UTF8 encoding for a given length in bytes */
-    static const unsigned char firstByteMark[5] =
-    {
-        0x00, /* should never happen */
-        0x00, /* 0xxxxxxx */
-        0xC0, /* 110xxxxx */
-        0xE0, /* 1110xxxx */
-        0xF0 /* 11110xxx */
-    };
-
     long unsigned int codepoint = 0;
     unsigned int first_code = 0;
     const unsigned char *first_sequence = input_pointer;
     unsigned char utf8_length = 0;
+    unsigned char utf8_position = 0;
     unsigned char sequence_length = 0;
+    unsigned char first_byte_mark = 0;

     if ((input_end - first_sequence) < 6)
     {
         /* input ends unexpectedly */
-        *error_pointer = first_sequence;
         goto fail;
     }

@@ -438,9 +601,8 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
     first_code = parse_hex4(first_sequence + 2);

     /* check that the code is valid */
-    if (((first_code >= 0xDC00) && (first_code <= 0xDFFF)) || (first_code == 0))
+    if (((first_code >= 0xDC00) && (first_code <= 0xDFFF)))
     {
-        *error_pointer = first_sequence;
         goto fail;
     }

@@ -454,14 +616,12 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
         if ((input_end - second_sequence) < 6)
         {
             /* input ends unexpectedly */
-            *error_pointer = first_sequence;
             goto fail;
         }

         if ((second_sequence[0] != '\\') || (second_sequence[1] != 'u'))
         {
             /* missing second half of the surrogate pair */
-            *error_pointer = first_sequence;
             goto fail;
         }

@@ -471,7 +631,6 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
         if ((second_code < 0xDC00) || (second_code > 0xDFFF))
         {
             /* invalid second half of the surrogate pair */
-            *error_pointer = first_sequence;
             goto fail;
         }

@@ -497,47 +656,43 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
     {
         /* two bytes, encoding 110xxxxx 10xxxxxx */
         utf8_length = 2;
+        first_byte_mark = 0xC0; /* 11000000 */
     }
     else if (codepoint < 0x10000)
     {
         /* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
         utf8_length = 3;
+        first_byte_mark = 0xE0; /* 11100000 */
     }
     else if (codepoint <= 0x10FFFF)
     {
         /* four bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx 10xxxxxx */
         utf8_length = 4;
+        first_byte_mark = 0xF0; /* 11110000 */
     }
     else
     {
         /* invalid unicode codepoint */
-        *error_pointer = first_sequence;
         goto fail;
     }

     /* encode as utf8 */
-    switch (utf8_length)
-    {
-        case 4:
-            /* 10xxxxxx */
-            (*output_pointer)[3] = (unsigned char)((codepoint | 0x80) & 0xBF);
-            codepoint >>= 6;
-        case 3:
-            /* 10xxxxxx */
-            (*output_pointer)[2] = (unsigned char)((codepoint | 0x80) & 0xBF);
-            codepoint >>= 6;
-        case 2:
-            (*output_pointer)[1] = (unsigned char)((codepoint | 0x80) & 0xBF);
-            codepoint >>= 6;
-        case 1:
-            /* depending on the length in bytes this determines the
-               encoding of the first UTF8 byte */
-            (*output_pointer)[0] = (unsigned char)((codepoint | firstByteMark[utf8_length]) & 0xFF);
-            break;
-        default:
-            *error_pointer = first_sequence;
-            goto fail;
+    for (utf8_position = (unsigned char)(utf8_length - 1); utf8_position > 0; utf8_position--)
+    {
+        /* 10xxxxxx */
+        (*output_pointer)[utf8_position] = (unsigned char)((codepoint | 0x80) & 0xBF);
+        codepoint >>= 6;
+    }
+    /* encode first byte */
+    if (utf8_length > 1)
+    {
+        (*output_pointer)[0] = (unsigned char)((codepoint | first_byte_mark) & 0xFF);
     }
+    else
+    {
+        (*output_pointer)[0] = (unsigned char)(codepoint & 0x7F);
+    }

     *output_pointer += utf8_length;

     return sequence_length;
@@ -547,17 +702,16 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
 }

 /* Parse the input text into an unescaped cinput, and populate item. */
-static const unsigned char *parse_string(cJSON * const item, const unsigned char * const input, const unsigned char ** const error_pointer, const internal_hooks * const hooks)
+static cJSON_bool parse_string(cJSON * const item, parse_buffer * const input_buffer)
 {
-    const unsigned char *input_pointer = input + 1;
-    const unsigned char *input_end = input + 1;
+    const unsigned char *input_pointer = buffer_at_offset(input_buffer) + 1;
+    const unsigned char *input_end = buffer_at_offset(input_buffer) + 1;
     unsigned char *output_pointer = NULL;
     unsigned char *output = NULL;

     /* not a string */
-    if (*input != '\"')
+    if (buffer_at_offset(input_buffer)[0] != '\"')
     {
-        *error_pointer = input;
         goto fail;
     }

@@ -565,12 +719,12 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char
         /* calculate approximate size of the output (overestimate) */
         size_t allocation_length = 0;
         size_t skipped_bytes = 0;
-        while ((*input_end != '\"') && (*input_end != '\0'))
+        while (((size_t)(input_end - input_buffer->content) < input_buffer->length) && (*input_end != '\"'))
         {
             /* is escape sequence */
             if (input_end[0] == '\\')
             {
-                if (input_end[1] == '\0')
+                if ((size_t)(input_end + 1 - input_buffer->content) >= input_buffer->length)
                 {
                     /* prevent buffer overflow when last input character is a backslash */
                     goto fail;
@@ -580,14 +734,14 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char
             }
             input_end++;
         }
-        if (*input_end == '\0')
+        if (((size_t)(input_end - input_buffer->content) >= input_buffer->length) || (*input_end != '\"'))
         {
             goto fail; /* string ended unexpectedly */
         }

         /* This is at most how much we need for the output */
-        allocation_length = (size_t) (input_end - input) - skipped_bytes;
-        output = (unsigned char*)hooks->allocate(allocation_length + sizeof('\0'));
+        allocation_length = (size_t) (input_end - buffer_at_offset(input_buffer)) - skipped_bytes;
+        output = (unsigned char*)input_buffer->hooks.allocate(allocation_length + sizeof(""));
         if (output == NULL)
         {
             goto fail; /* allocation failure */
@@ -606,6 +760,11 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char
         else
         {
             unsigned char sequence_length = 2;
+            if ((input_end - input_pointer) < 1)
+            {
+                goto fail;
+            }
+
             switch (input_pointer[1])
             {
                 case 'b':
@@ -631,7 +790,7 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char

                 /* UTF-16 literal */
                 case 'u':
-                    sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer, error_pointer);
+                    sequence_length = utf16_literal_to_utf8(input_pointer, input_end, &output_pointer);
                     if (sequence_length == 0)
                     {
                         /* failed to convert UTF16-literal to UTF-8 */
@@ -640,7 +799,6 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char
                     break;

                 default:
-                    *error_pointer = input_pointer;
                     goto fail;
             }
             input_pointer += sequence_length;
@@ -653,19 +811,27 @@ static const unsigned char *parse_string(cJSON * const item, const unsigned char
     item->type = cJSON_String;
     item->valuestring = (char*)output;

-    return input_end + 1;
+    input_buffer->offset = (size_t) (input_end - input_buffer->content);
+    input_buffer->offset++;
+
+    return true;

 fail:
     if (output != NULL)
     {
-        hooks->deallocate(output);
+        input_buffer->hooks.deallocate(output);
     }

-    return NULL;
+    if (input_pointer != NULL)
+    {
+        input_buffer->offset = (size_t)(input_pointer - input_buffer->content);
+    }
+
+    return false;
 }

 /* Render the cstring provided to an escaped version that can be printed. */
-static unsigned char *print_string_ptr(const unsigned char * const input, printbuffer * const output_buffer, const internal_hooks * const hooks)
+static cJSON_bool print_string_ptr(const unsigned char * const input, printbuffer * const output_buffer)
 {
     const unsigned char *input_pointer = NULL;
     unsigned char *output = NULL;
@@ -676,42 +842,52 @@ static unsigned char *print_string_ptr(const unsigned char * const input, printb

     if (output_buffer == NULL)
     {
-        return NULL;
+        return false;
     }

     /* empty string */
     if (input == NULL)
     {
-        output = ensure(output_buffer, sizeof("\"\""), hooks);
+        output = ensure(output_buffer, sizeof("\"\""));
         if (output == NULL)
         {
-            return NULL;
+            return false;
         }
         strcpy((char*)output, "\"\"");

-        return output;
+        return true;
     }

     /* set "flag" to 1 if something needs to be escaped */
     for (input_pointer = input; *input_pointer; input_pointer++)
     {
-        if (strchr("\"\\\b\f\n\r\t", *input_pointer))
-        {
-            /* one character escape sequence */
-            escape_characters++;
-        }
-        else if (*input_pointer < 32)
-        {
-            /* UTF-16 escape sequence uXXXX */
-            escape_characters += 5;
+        switch (*input_pointer)
+        {
+            case '\"':
+            case '\\':
+            case '\b':
+            case '\f':
+            case '\n':
+            case '\r':
+            case '\t':
+                /* one character escape sequence */
+                escape_characters++;
+                break;
+            default:
+                if (*input_pointer < 32)
+                {
+                    /* UTF-16 escape sequence uXXXX */
+                    escape_characters += 5;
+                }
+                break;
         }
     }
     output_length = (size_t)(input_pointer - input) + escape_characters;

-    output = ensure(output_buffer, output_length + sizeof("\"\""), hooks);
+    output = ensure(output_buffer, output_length + sizeof("\"\""));
     if (output == NULL)
     {
-        return NULL;
+        return false;
     }

     /* no characters have to be escaped */
@@ -722,13 +898,13 @@ static unsigned char *print_string_ptr(const unsigned char * const input, printb
         output[output_length + 1] = '\"';
         output[output_length + 2] = '\0';

-        return output;
+        return true;
     }

     output[0] = '\"';
     output_pointer = output + 1;
     /* copy the string */
-    for (input_pointer = input; *input_pointer != '\0'; input_pointer++, output_pointer++)
+    for (input_pointer = input; *input_pointer != '\0'; (void)input_pointer++, output_pointer++)
     {
         if ((*input_pointer > 31) && (*input_pointer != '\"') && (*input_pointer != '\\'))
         {
@@ -773,72 +949,138 @@ static unsigned char
*print_string_ptr(const unsigned char * const input, printb output[output_length + 1] = '\"'; output[output_length + 2] = '\0'; - return output; + return true; } /* Invoke print_string_ptr (which is useful) on an item. */ -static unsigned char *print_string(const cJSON * const item, printbuffer * const p, const internal_hooks * const hooks) +static cJSON_bool print_string(const cJSON * const item, printbuffer * const p) { - return print_string_ptr((unsigned char*)item->valuestring, p, hooks); + return print_string_ptr((unsigned char*)item->valuestring, p); } /* Predeclare these prototypes. */ -static const unsigned char *parse_value(cJSON * const item, const unsigned char * const input, const unsigned char ** const ep, const internal_hooks * const hooks); -static unsigned char *print_value(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks); -static const unsigned char *parse_array(cJSON * const item, const unsigned char *input, const unsigned char ** const ep, const internal_hooks * const hooks); -static unsigned char *print_array(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks); -static const unsigned char *parse_object(cJSON * const item, const unsigned char *input, const unsigned char ** const ep, const internal_hooks * const hooks); -static unsigned char *print_object(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks); +static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_value(const cJSON * const item, printbuffer * const output_buffer); +static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_array(const cJSON * const item, printbuffer * const output_buffer); +static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer); +static cJSON_bool print_object(const cJSON * const item, printbuffer * const output_buffer); /* Utility to jump whitespace and cr/lf */ -static const unsigned char *skip_whitespace(const unsigned char *in) +static parse_buffer *buffer_skip_whitespace(parse_buffer * const buffer) { - while (in && *in && (*in <= 32)) + if ((buffer == NULL) || (buffer->content == NULL)) { - in++; + return NULL; + } + + while (can_access_at_index(buffer, 0) && (buffer_at_offset(buffer)[0] <= 32)) + { + buffer->offset++; } - return in; + if (buffer->offset == buffer->length) + { + buffer->offset--; + } + + return buffer; } -/* Parse an object - create a new root, and populate. */ -CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cjbool require_null_terminated) +/* skip the UTF-8 BOM (byte order mark) if it is at the beginning of a buffer */ +static parse_buffer *skip_utf8_bom(parse_buffer * const buffer) { - const unsigned char *end = NULL; - /* use global error pointer if no specific one was given */ - const unsigned char **ep = return_parse_end ? 
(const unsigned char**)return_parse_end : &global_ep; - cJSON *c = cJSON_New_Item(&global_hooks); - *ep = NULL; - if (!c) /* memory fail */ + if ((buffer == NULL) || (buffer->content == NULL) || (buffer->offset != 0)) { return NULL; } - end = parse_value(c, skip_whitespace((const unsigned char*)value), ep, &global_hooks); - if (!end) + if (can_access_at_index(buffer, 4) && (strncmp((const char*)buffer_at_offset(buffer), "\xEF\xBB\xBF", 3) == 0)) + { + buffer->offset += 3; + } + + return buffer; +} + +/* Parse an object - create a new root, and populate. */ +CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated) +{ + parse_buffer buffer = { 0, 0, 0, 0, { 0, 0, 0 } }; + cJSON *item = NULL; + + /* reset error position */ + global_error.json = NULL; + global_error.position = 0; + + if (value == NULL) + { + goto fail; + } + + buffer.content = (const unsigned char*)value; + buffer.length = strlen((const char*)value) + sizeof(""); + buffer.offset = 0; + buffer.hooks = global_hooks; + + item = cJSON_New_Item(&global_hooks); + if (item == NULL) /* memory fail */ + { + goto fail; + } + + if (!parse_value(item, buffer_skip_whitespace(skip_utf8_bom(&buffer)))) { /* parse failure. ep is set. */ - cJSON_Delete(c); - return NULL; + goto fail; } /* if we require null-terminated JSON without appended garbage, skip and then check for a null terminator */ if (require_null_terminated) { - end = skip_whitespace(end); - if (*end) + buffer_skip_whitespace(&buffer); + if ((buffer.offset >= buffer.length) || buffer_at_offset(&buffer)[0] != '\0') { - cJSON_Delete(c); - *ep = end; - return NULL; + goto fail; } } if (return_parse_end) { - *return_parse_end = (const char*)end; + *return_parse_end = (const char*)buffer_at_offset(&buffer); + } + + return item; + +fail: + if (item != NULL) + { + cJSON_Delete(item); + } + + if (value != NULL) + { + error local_error; + local_error.json = (const unsigned char*)value; + local_error.position = 0; + + if (buffer.offset < buffer.length) + { + local_error.position = buffer.offset; + } + else if (buffer.length > 0) + { + local_error.position = buffer.length - 1; + } + + if (return_parse_end != NULL) + { + *return_parse_end = (const char*)local_error.json + local_error.position; + } + + global_error = local_error; } - return c; + return NULL; } /* Default options for cJSON_Parse */ @@ -847,42 +1089,55 @@ CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value) return cJSON_ParseWithOpts(value, 0, 0); } -#ifndef min -#define min(a, b) ((a < b) ? a : b) -#endif +#define cjson_min(a, b) ((a < b) ? 
a : b) -static unsigned char *print(const cJSON * const item, cjbool format, const internal_hooks * const hooks) +static unsigned char *print(const cJSON * const item, cJSON_bool format, const internal_hooks * const hooks) { + static const size_t default_buffer_size = 256; printbuffer buffer[1]; unsigned char *printed = NULL; memset(buffer, 0, sizeof(buffer)); /* create buffer */ - buffer->buffer = (unsigned char*) hooks->allocate(256); + buffer->buffer = (unsigned char*) hooks->allocate(default_buffer_size); + buffer->length = default_buffer_size; + buffer->format = format; + buffer->hooks = *hooks; if (buffer->buffer == NULL) { goto fail; } /* print the value */ - if (print_value(item, 0, format, buffer, hooks) == NULL) + if (!print_value(item, buffer)) { goto fail; } update_offset(buffer); - /* copy the buffer over to a new one */ - printed = (unsigned char*) hooks->allocate(buffer->offset + 1); - if (printed == NULL) + /* check if reallocate is available */ + if (hooks->reallocate != NULL) { - goto fail; + printed = (unsigned char*) hooks->reallocate(buffer->buffer, buffer->offset + 1); + if (printed == NULL) { + goto fail; + } + buffer->buffer = NULL; } - strncpy((char*)printed, (char*)buffer->buffer, min(buffer->length, buffer->offset + 1)); - printed[buffer->offset] = '\0'; /* just to be sure */ + else /* otherwise copy the JSON over to a new buffer */ + { + printed = (unsigned char*) hooks->allocate(buffer->offset + 1); + if (printed == NULL) + { + goto fail; + } + memcpy(printed, buffer->buffer, cjson_min(buffer->length, buffer->offset + 1)); + printed[buffer->offset] = '\0'; /* just to be sure */ - /* free the buffer */ - hooks->deallocate(buffer->buffer); + /* free the buffer */ + hooks->deallocate(buffer->buffer); + } return printed; @@ -911,9 +1166,9 @@ CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item) return (char*)print(item, false, &global_hooks); } -CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cjbool fmt) +CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt) { - printbuffer p; + printbuffer p = { 0, 0, 0, 0, 0, 0, { 0, 0, 0 } }; if (prebuffer < 0) { @@ -929,15 +1184,23 @@ CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cjboo p.length = (size_t)prebuffer; p.offset = 0; p.noalloc = false; + p.format = fmt; + p.hooks = global_hooks; + + if (!print_value(item, &p)) + { + global_hooks.deallocate(p.buffer); + return NULL; + } - return (char*)print_value(item, 0, fmt, &p, &global_hooks); + return (char*)p.buffer; } -CJSON_PUBLIC(int) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const cjbool fmt) +CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const cJSON_bool fmt) { - printbuffer p; + printbuffer p = { 0, 0, 0, 0, 0, 0, { 0, 0, 0 } }; - if (len < 0) + if ((len < 0) || (buf == NULL)) { return false; } @@ -946,164 +1209,181 @@ CJSON_PUBLIC(int) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, p.length = (size_t)len; p.offset = 0; p.noalloc = true; - return print_value(item, 0, fmt, &p, &global_hooks) != NULL; + p.format = fmt; + p.hooks = global_hooks; + + return print_value(item, &p); } /* Parser core - when encountering text, process appropriately. 
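The refactored parser threads one parse_buffer through all of the functions below instead of a bare character pointer, so every read can be checked against the input length. A trimmed-down sketch of that idiom, with illustrative names (the real struct also carries depth and the allocator hooks):

    typedef struct
    {
        const unsigned char *content;
        size_t length;
        size_t offset;
    } buffer_sketch;

    // true when the byte `index` positions past the current offset exists
    static int sketch_can_access(const buffer_sketch *buffer, size_t index)
    {
        return (buffer != NULL) && ((buffer->offset + index) < buffer->length);
    }

The can_read and can_access_at_index checks used below boil down to comparisons of this shape, which is what keeps parse_value from reading past a length-limited input.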
*/ -static const unsigned char *parse_value(cJSON * const item, const unsigned char * const input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_value(cJSON * const item, parse_buffer * const input_buffer) { - if (input == NULL) + if ((input_buffer == NULL) || (input_buffer->content == NULL)) { - return NULL; /* no input */ + return false; /* no input */ } /* parse the different types of values */ /* null */ - if (!strncmp((const char*)input, "null", 4)) + if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "null", 4) == 0)) { item->type = cJSON_NULL; - return input + 4; + input_buffer->offset += 4; + return true; } /* false */ - if (!strncmp((const char*)input, "false", 5)) + if (can_read(input_buffer, 5) && (strncmp((const char*)buffer_at_offset(input_buffer), "false", 5) == 0)) { item->type = cJSON_False; - return input + 5; + input_buffer->offset += 5; + return true; } /* true */ - if (!strncmp((const char*)input, "true", 4)) + if (can_read(input_buffer, 4) && (strncmp((const char*)buffer_at_offset(input_buffer), "true", 4) == 0)) { item->type = cJSON_True; item->valueint = 1; - return input + 4; + input_buffer->offset += 4; + return true; } /* string */ - if (*input == '\"') + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '\"')) { - return parse_string(item, input, error_pointer, hooks); + return parse_string(item, input_buffer); } /* number */ - if ((*input == '-') || ((*input >= '0') && (*input <= '9'))) + if (can_access_at_index(input_buffer, 0) && ((buffer_at_offset(input_buffer)[0] == '-') || ((buffer_at_offset(input_buffer)[0] >= '0') && (buffer_at_offset(input_buffer)[0] <= '9')))) { - return parse_number(item, input); + return parse_number(item, input_buffer); } /* array */ - if (*input == '[') + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '[')) { - return parse_array(item, input, error_pointer, hooks); + return parse_array(item, input_buffer); } /* object */ - if (*input == '{') + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '{')) { - return parse_object(item, input, error_pointer, hooks); + return parse_object(item, input_buffer); } - /* failure. */ - *error_pointer = input; - return NULL; + return false; } /* Render a value to text. 
*/ -static unsigned char *print_value(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_value(const cJSON * const item, printbuffer * const output_buffer) { unsigned char *output = NULL; if ((item == NULL) || (output_buffer == NULL)) { - return NULL; + return false; } switch ((item->type) & 0xFF) { case cJSON_NULL: - output = ensure(output_buffer, 5, hooks); - if (output != NULL) + output = ensure(output_buffer, 5); + if (output == NULL) { - strcpy((char*)output, "null"); + return false; } - break; + strcpy((char*)output, "null"); + return true; + case cJSON_False: - output = ensure(output_buffer, 6, hooks); - if (output != NULL) + output = ensure(output_buffer, 6); + if (output == NULL) { - strcpy((char*)output, "false"); + return false; } - break; + strcpy((char*)output, "false"); + return true; + case cJSON_True: - output = ensure(output_buffer, 5, hooks); - if (output != NULL) + output = ensure(output_buffer, 5); + if (output == NULL) { - strcpy((char*)output, "true"); + return false; } - break; + strcpy((char*)output, "true"); + return true; + case cJSON_Number: - output = print_number(item, output_buffer, hooks); - break; + return print_number(item, output_buffer); + case cJSON_Raw: { size_t raw_length = 0; if (item->valuestring == NULL) { - if (!output_buffer->noalloc) - { - hooks->deallocate(output_buffer->buffer); - } - output = NULL; - break; + return false; } - raw_length = strlen(item->valuestring) + sizeof('\0'); - output = ensure(output_buffer, raw_length, hooks); - if (output != NULL) + raw_length = strlen(item->valuestring) + sizeof(""); + output = ensure(output_buffer, raw_length); + if (output == NULL) { - memcpy(output, item->valuestring, raw_length); + return false; } - break; + memcpy(output, item->valuestring, raw_length); + return true; } + case cJSON_String: - output = print_string(item, output_buffer, hooks); - break; + return print_string(item, output_buffer); + case cJSON_Array: - output = print_array(item, depth, format, output_buffer, hooks); - break; + return print_array(item, output_buffer); + case cJSON_Object: - output = print_object(item, depth, format, output_buffer, hooks); - break; + return print_object(item, output_buffer); + default: - output = NULL; - break; + return false; } - - return output; } /* Build an array from input text. 
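parse_array below and parse_object further down both refuse to recurse once input_buffer->depth reaches CJSON_NESTING_LIMIT, which bounds stack usage on hostile input. A test-style sketch of the effect, assuming the upstream default limit of 1000 and a test body with <assert.h>:

    // 1001 nested arrays: parsing fails cleanly instead of recursing
    // without bound
    enum { DEPTH = 1001 };
    char deep[2 * DEPTH + 1];
    size_t i;
    for (i = 0; i < DEPTH; i++)
    {
        deep[i] = '[';
        deep[2 * DEPTH - 1 - i] = ']';
    }
    deep[2 * DEPTH] = '\0';
    assert(cJSON_Parse(deep) == NULL);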
*/ -static const unsigned char *parse_array(cJSON * const item, const unsigned char *input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_array(cJSON * const item, parse_buffer * const input_buffer) { cJSON *head = NULL; /* head of the linked list */ cJSON *current_item = NULL; - if (*input != '[') + if (input_buffer->depth >= CJSON_NESTING_LIMIT) + { + return false; /* too deeply nested */ + } + input_buffer->depth++; + + if (buffer_at_offset(input_buffer)[0] != '[') { /* not an array */ - *error_pointer = input; goto fail; } - input = skip_whitespace(input + 1); - if (*input == ']') + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ']')) { /* empty array */ goto success; } + /* check if we skipped to the end of the buffer */ + if (cannot_access_at_index(input_buffer, 0)) + { + input_buffer->offset--; + goto fail; + } + /* step back to character in front of the first element */ - input--; + input_buffer->offset--; /* loop through the comma separated array elements */ do { /* allocate next item */ - cJSON *new_item = cJSON_New_Item(hooks); + cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks)); if (new_item == NULL) { goto fail; /* allocation failure */ } @@ -1124,27 +1404,30 @@ static const unsigned char *parse_array(cJSON * const item, const unsigned char } /* parse next value */ - input = skip_whitespace(input + 1); - input = parse_value(current_item, input, error_pointer, hooks); - input = skip_whitespace(input); - if (input == NULL) + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_value(current_item, input_buffer)) { goto fail; /* failed to parse value */ } + buffer_skip_whitespace(input_buffer); } - while (*input == ','); + while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ',')); - if (*input != ']') + if (cannot_access_at_index(input_buffer, 0) || buffer_at_offset(input_buffer)[0] != ']') { - *error_pointer = input; goto fail; /* expected end of array */ } success: + input_buffer->depth--; + item->type = cJSON_Array; item->child = head; - return input + 1; + input_buffer->offset++; + + return true; fail: if (head != NULL) @@ -1152,52 +1435,50 @@ static const unsigned char *parse_array(cJSON * const item, const unsigned char cJSON_Delete(head); } - return NULL; + return false; } /* Render an array to text */ -static unsigned char *print_array(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_array(const cJSON * const item, printbuffer * const output_buffer) { - unsigned char *output = NULL; unsigned char *output_pointer = NULL; size_t length = 0; cJSON *current_element = item->child; - size_t output_offset = 0; if (output_buffer == NULL) { - return NULL; + return false; } /* Compose the output array.
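With format and depth now carried in the printbuffer itself, the print helpers lose their depth/format/hooks parameters and the public printers differ only in the flag they set. A small usage sketch (the compound literal is C99):

    cJSON *numbers = cJSON_CreateIntArray((int[]){ 1, 2, 3 }, 3);
    char *flat = cJSON_PrintUnformatted(numbers);  // "[1,2,3]"
    char *spaced = cJSON_Print(numbers);           // "[1, 2, 3]"
    cJSON_free(flat);
    cJSON_free(spaced);
    cJSON_Delete(numbers);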
*/ /* opening square bracket */ - output_offset = output_buffer->offset; - output_pointer = ensure(output_buffer, 1, hooks); + output_pointer = ensure(output_buffer, 1); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer = '['; output_buffer->offset++; + output_buffer->depth++; while (current_element != NULL) { - if (print_value(current_element, depth + 1, format, output_buffer, hooks) == NULL) + if (!print_value(current_element, output_buffer)) { - return NULL; + return false; } update_offset(output_buffer); if (current_element->next) { - length = format ? 2 : 1; - output_pointer = ensure(output_buffer, length + 1, hooks); + length = (size_t) (output_buffer->format ? 2 : 1); + output_pointer = ensure(output_buffer, length + 1); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = ','; - if(format) + if(output_buffer->format) { *output_pointer++ = ' '; } @@ -1207,43 +1488,56 @@ static unsigned char *print_array(const cJSON * const item, const size_t depth, current_element = current_element->next; } - output_pointer = ensure(output_buffer, 2, hooks); + output_pointer = ensure(output_buffer, 2); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = ']'; *output_pointer = '\0'; - output = output_buffer->buffer + output_offset; + output_buffer->depth--; - return output; + return true; } /* Build an object from the text. */ -static const unsigned char *parse_object(cJSON * const item, const unsigned char *input, const unsigned char ** const error_pointer, const internal_hooks * const hooks) +static cJSON_bool parse_object(cJSON * const item, parse_buffer * const input_buffer) { cJSON *head = NULL; /* linked list head */ cJSON *current_item = NULL; - if (*input != '{') + if (input_buffer->depth >= CJSON_NESTING_LIMIT) + { + return false; /* too deeply nested */ + } + input_buffer->depth++; + + if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '{')) { - *error_pointer = input; goto fail; /* not an object */ } - input = skip_whitespace(input + 1); - if (*input == '}') + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == '}')) { goto success; /* empty object */ } + /* check if we skipped to the end of the buffer */ + if (cannot_access_at_index(input_buffer, 0)) + { + input_buffer->offset--; + goto fail; + } + /* step back to character in front of the first element */ - input--; + input_buffer->offset--; /* loop through the comma separated array elements */ do { /* allocate next item */ - cJSON *new_item = cJSON_New_Item(hooks); + cJSON *new_item = cJSON_New_Item(&(input_buffer->hooks)); if (new_item == NULL) { goto fail; /* allocation failure */ } @@ -1264,46 +1558,47 @@ static const unsigned char *parse_object(cJSON * const item, const unsigned char } /* parse the name of the child */ - input = skip_whitespace(input + 1); - input = parse_string(current_item, input, error_pointer, hooks); - input = skip_whitespace(input); - if (input == NULL) + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_string(current_item, input_buffer)) { - goto fail; /* faile to parse name */ + goto fail; /* failed to parse name */ } + buffer_skip_whitespace(input_buffer); /* swap valuestring and string, because we parsed the name */ current_item->string = current_item->valuestring; current_item->valuestring = NULL; - if (*input != ':') + if (cannot_access_at_index(input_buffer, 0)
|| (buffer_at_offset(input_buffer)[0] != ':')) { - *error_pointer = input; goto fail; /* invalid object */ } /* parse the value */ - input = skip_whitespace(input + 1); - input = parse_value(current_item, input, error_pointer, hooks); - input = skip_whitespace(input); - if (input == NULL) + input_buffer->offset++; + buffer_skip_whitespace(input_buffer); + if (!parse_value(current_item, input_buffer)) { goto fail; /* failed to parse value */ } + buffer_skip_whitespace(input_buffer); } - while (*input == ','); + while (can_access_at_index(input_buffer, 0) && (buffer_at_offset(input_buffer)[0] == ',')); - if (*input != '}') + if (cannot_access_at_index(input_buffer, 0) || (buffer_at_offset(input_buffer)[0] != '}')) { - *error_pointer = input; goto fail; /* expected end of object */ } success: + input_buffer->depth--; + item->type = cJSON_Object; item->child = head; - return input + 1; + input_buffer->offset++; + return true; fail: if (head != NULL) @@ -1311,34 +1606,32 @@ static const unsigned char *parse_object(cJSON * const item, const unsigned char cJSON_Delete(head); } - return NULL; + return false; } /* Render an object to text. */ -static unsigned char *print_object(const cJSON * const item, const size_t depth, const cjbool format, printbuffer * const output_buffer, const internal_hooks * const hooks) +static cJSON_bool print_object(const cJSON * const item, printbuffer * const output_buffer) { - unsigned char *output = NULL; unsigned char *output_pointer = NULL; size_t length = 0; - size_t output_offset = 0; cJSON *current_item = item->child; if (output_buffer == NULL) { - return NULL; + return false; } /* Compose the output: */ - output_offset = output_buffer->offset; - length = format ? 2 : 1; /* fmt: {\n */ - output_pointer = ensure(output_buffer, length + 1, hooks); + length = (size_t) (output_buffer->format ? 2 : 1); /* fmt: {\n */ + output_pointer = ensure(output_buffer, length + 1); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = '{'; - if (format) + output_buffer->depth++; + if (output_buffer->format) { *output_pointer++ = '\n'; } @@ -1346,61 +1639,61 @@ static unsigned char *print_object(const cJSON * const item, const size_t depth, while (current_item) { - if (format) + if (output_buffer->format) { size_t i; - output_pointer = ensure(output_buffer, depth + 1, hooks); + output_pointer = ensure(output_buffer, output_buffer->depth); if (output_pointer == NULL) { - return NULL; + return false; } - for (i = 0; i < depth + 1; i++) + for (i = 0; i < output_buffer->depth; i++) { *output_pointer++ = '\t'; } - output_buffer->offset += depth + 1; + output_buffer->offset += output_buffer->depth; } /* print key */ - if (print_string_ptr((unsigned char*)current_item->string, output_buffer, hooks) == NULL) + if (!print_string_ptr((unsigned char*)current_item->string, output_buffer)) { - return NULL; + return false; } update_offset(output_buffer); - length = format ? 2 : 1; - output_pointer = ensure(output_buffer, length, hooks); + length = (size_t) (output_buffer->format ? 
2 : 1); + output_pointer = ensure(output_buffer, length); if (output_pointer == NULL) { - return NULL; + return false; } *output_pointer++ = ':'; - if (format) + if (output_buffer->format) { *output_pointer++ = '\t'; } output_buffer->offset += length; /* print value */ - if (!print_value(current_item, depth + 1, format, output_buffer, hooks)) + if (!print_value(current_item, output_buffer)) { - return NULL; + return false; } update_offset(output_buffer); /* print comma if not last */ - length = (size_t) (format ? 1 : 0) + (current_item->next ? 1 : 0); - output_pointer = ensure(output_buffer, length + 1, hooks); + length = ((size_t)(output_buffer->format ? 1 : 0) + (size_t)(current_item->next ? 1 : 0)); + output_pointer = ensure(output_buffer, length + 1); if (output_pointer == NULL) { - return NULL; + return false; } if (current_item->next) { *output_pointer++ = ','; } - if (format) + if (output_buffer->format) { *output_pointer++ = '\n'; } @@ -1410,83 +1703,122 @@ static unsigned char *print_object(const cJSON * const item, const size_t depth, current_item = current_item->next; } - output_pointer = ensure(output_buffer, format ? (depth + 2) : 2, hooks); + output_pointer = ensure(output_buffer, output_buffer->format ? (output_buffer->depth + 1) : 2); if (output_pointer == NULL) { - return NULL; + return false; } - if (format) + if (output_buffer->format) { size_t i; - for (i = 0; i < (depth); i++) + for (i = 0; i < (output_buffer->depth - 1); i++) { *output_pointer++ = '\t'; } } *output_pointer++ = '}'; *output_pointer = '\0'; - output = (output_buffer->buffer) + output_offset; + output_buffer->depth--; - return output; + return true; } /* Get Array size/item / object item. */ CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array) { - cJSON *c = array->child; - size_t i = 0; - while(c) + cJSON *child = NULL; + size_t size = 0; + + if (array == NULL) { - i++; - c = c->next; + return 0; + } + + child = array->child; + + while(child != NULL) + { + size++; + child = child->next; } /* FIXME: Can overflow here. Cannot be fixed without breaking the API */ - return (int)i; + return (int)size; } -CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int item) +static cJSON* get_array_item(const cJSON *array, size_t index) { - cJSON *c = array ? array->child : NULL; - while (c && item > 0) + cJSON *current_child = NULL; + + if (array == NULL) { - item--; - c = c->next; + return NULL; } - return c; + current_child = array->child; + while ((current_child != NULL) && (index > 0)) + { + index--; + current_child = current_child->next; + } + + return current_child; } -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON *object, const char *string) +CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index) { - cJSON *c = object ? 
object->child : NULL; - while (c && cJSON_strcasecmp((unsigned char*)c->string, (const unsigned char*)string)) + if (index < 0) { - c = c->next; + return NULL; } - return c; + + return get_array_item(array, (size_t)index); } -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string) +static cJSON *get_object_item(const cJSON * const object, const char * const name, const cJSON_bool case_sensitive) { cJSON *current_element = NULL; - if ((object == NULL) || (string == NULL)) + if ((object == NULL) || (name == NULL)) { return NULL; } current_element = object->child; - while ((current_element != NULL) && (strcmp(string, current_element->string) != 0)) + if (case_sensitive) { - current_element = current_element->next; + while ((current_element != NULL) && (current_element->string != NULL) && (strcmp(name, current_element->string) != 0)) + { + current_element = current_element->next; + } + } + else + { + while ((current_element != NULL) && (case_insensitive_strcmp((const unsigned char*)name, (const unsigned char*)(current_element->string)) != 0)) + { + current_element = current_element->next; + } + } + + if ((current_element == NULL) || (current_element->string == NULL)) { + return NULL; } return current_element; } -CJSON_PUBLIC(cjbool) cJSON_HasObjectItem(const cJSON *object, const char *string) +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string) +{ + return get_object_item(object, string, false); +} + +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string) +{ + return get_object_item(object, string, true); +} + +CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string) { return cJSON_GetObjectItem(object, string) ? 1 : 0; } @@ -1501,26 +1833,32 @@ static void suffix_object(cJSON *prev, cJSON *item) /* Utility for handling references. */ static cJSON *create_reference(const cJSON *item, const internal_hooks * const hooks) { - cJSON *ref = cJSON_New_Item(hooks); - if (!ref) + cJSON *reference = NULL; + if (item == NULL) + { + return NULL; + } + + reference = cJSON_New_Item(hooks); + if (reference == NULL) { return NULL; } - memcpy(ref, item, sizeof(cJSON)); - ref->string = NULL; - ref->type |= cJSON_IsReference; - ref->next = ref->prev = NULL; - return ref; + + memcpy(reference, item, sizeof(cJSON)); + reference->string = NULL; + reference->type |= cJSON_IsReference; + reference->next = reference->prev = NULL; + return reference; } -/* Add item to array/object. */ -CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) +static cJSON_bool add_item_to_array(cJSON *array, cJSON *item) { cJSON *child = NULL; if ((item == NULL) || (array == NULL)) { - return; + return false; } child = array->child; @@ -1539,80 +1877,238 @@ CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) } suffix_object(child, item); } + + return true; +} + +/* Add item to array/object. 
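add_item_to_array above appends to the child list; the add_item_to_object helper that follows wraps it after fixing up the key, duplicating the string by default or borrowing the caller's pointer when constant_key is set. Combined with the cJSON_Add...ToObject convenience constructors further down, typical object building looks like this sketch (illustrative keys and values):

    cJSON *root = cJSON_CreateObject();
    cJSON_AddStringToObject(root, "module", "mod_test");  // key is duplicated
    cJSON_AddNumberToObject(root, "port", 5060);
    // the CS variant stores the key pointer itself, so it must outlive root
    cJSON_AddItemToObjectCS(root, "enabled", cJSON_CreateTrue());
    cJSON_Delete(root);

The convenience constructors return the newly created item (NULL on failure), so success can be checked without a second lookup.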
*/ +CJSON_PUBLIC(void) cJSON_AddItemToArray(cJSON *array, cJSON *item) +{ + add_item_to_array(array, item); +} + +#if defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) + #pragma GCC diagnostic push +#endif +#ifdef __GNUC__ +#pragma GCC diagnostic ignored "-Wcast-qual" +#endif +/* helper function to cast away const */ +static void* cast_away_const(const void* string) +{ + return (void*)string; +} +#if defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) + #pragma GCC diagnostic pop +#endif + + +static cJSON_bool add_item_to_object(cJSON * const object, const char * const string, cJSON * const item, const internal_hooks * const hooks, const cJSON_bool constant_key) +{ + char *new_key = NULL; + int new_type = cJSON_Invalid; + + if ((object == NULL) || (string == NULL) || (item == NULL)) + { + return false; + } + + if (constant_key) + { + new_key = (char*)cast_away_const(string); + new_type = item->type | cJSON_StringIsConst; + } + else + { + new_key = (char*)cJSON_strdup((const unsigned char*)string, hooks); + if (new_key == NULL) + { + return false; + } + + new_type = item->type & ~cJSON_StringIsConst; + } + + if (!(item->type & cJSON_StringIsConst) && (item->string != NULL)) + { + hooks->deallocate(item->string); + } + + item->string = new_key; + item->type = new_type; + + return add_item_to_array(object, item); } CJSON_PUBLIC(void) cJSON_AddItemToObject(cJSON *object, const char *string, cJSON *item) { - /* call cJSON_AddItemToObjectCS for code reuse */ - cJSON_AddItemToObjectCS(object, (char*)cJSON_strdup((const unsigned char*)string, &global_hooks), item); - /* remove cJSON_StringIsConst flag */ - item->type &= ~cJSON_StringIsConst; + add_item_to_object(object, string, item, &global_hooks, false); } /* Add an item to an object with constant string as key */ CJSON_PUBLIC(void) cJSON_AddItemToObjectCS(cJSON *object, const char *string, cJSON *item) { - if (!item) + add_item_to_object(object, string, item, &global_hooks, true); +} + +CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) +{ + if (array == NULL) { return; } - if (!(item->type & cJSON_StringIsConst) && item->string) + + add_item_to_array(array, create_reference(item, &global_hooks)); +} + +CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item) +{ + if ((object == NULL) || (string == NULL)) { - global_hooks.deallocate(item->string); + return; } -#ifdef __GNUC__ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wcast-qual" -#endif - item->string = (char*)string; -#ifdef __GNUC__ -#pragma GCC diagnostic pop -#endif - item->type |= cJSON_StringIsConst; - cJSON_AddItemToArray(object, item); + + add_item_to_object(object, string, create_reference(item, &global_hooks), &global_hooks, false); } -CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item) +CJSON_PUBLIC(cJSON*) cJSON_AddNullToObject(cJSON * const object, const char * const name) { - cJSON_AddItemToArray(array, create_reference(item, &global_hooks)); + cJSON *null = cJSON_CreateNull(); + if (add_item_to_object(object, name, null, &global_hooks, false)) + { + return null; + } + + cJSON_Delete(null); + return NULL; } -CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item) +CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name) { - cJSON_AddItemToObject(object, string, create_reference(item, 
&global_hooks)); + cJSON *true_item = cJSON_CreateTrue(); + if (add_item_to_object(object, name, true_item, &global_hooks, false)) + { + return true_item; + } + + cJSON_Delete(true_item); + return NULL; } -static cJSON *DetachItemFromArray(cJSON *array, size_t which) +CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name) { - cJSON *c = array->child; - while (c && (which > 0)) + cJSON *false_item = cJSON_CreateFalse(); + if (add_item_to_object(object, name, false_item, &global_hooks, false)) { - c = c->next; - which--; + return false_item; } - if (!c) + + cJSON_Delete(false_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean) +{ + cJSON *bool_item = cJSON_CreateBool(boolean); + if (add_item_to_object(object, name, bool_item, &global_hooks, false)) + { + return bool_item; + } + + cJSON_Delete(bool_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number) +{ + cJSON *number_item = cJSON_CreateNumber(number); + if (add_item_to_object(object, name, number_item, &global_hooks, false)) + { + return number_item; + } + + cJSON_Delete(number_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string) +{ + cJSON *string_item = cJSON_CreateString(string); + if (add_item_to_object(object, name, string_item, &global_hooks, false)) + { + return string_item; + } + + cJSON_Delete(string_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw) +{ + cJSON *raw_item = cJSON_CreateRaw(raw); + if (add_item_to_object(object, name, raw_item, &global_hooks, false)) + { + return raw_item; + } + + cJSON_Delete(raw_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name) +{ + cJSON *object_item = cJSON_CreateObject(); + if (add_item_to_object(object, name, object_item, &global_hooks, false)) + { + return object_item; + } + + cJSON_Delete(object_item); + return NULL; +} + +CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name) +{ + cJSON *array = cJSON_CreateArray(); + if (add_item_to_object(object, name, array, &global_hooks, false)) + { + return array; + } + + cJSON_Delete(array); + return NULL; +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item) +{ + if ((parent == NULL) || (item == NULL)) { - /* item doesn't exist */ return NULL; } - if (c->prev) + + if (item->prev != NULL) { /* not the first element */ - c->prev->next = c->next; + item->prev->next = item->next; } - if (c->next) + if (item->next != NULL) { - c->next->prev = c->prev; + /* not the last element */ + item->next->prev = item->prev; } - if (c==array->child) + + if (item == parent->child) { - array->child = c->next; + /* first element */ + parent->child = item->next; } /* make sure the detached item doesn't point anywhere anymore */ - c->prev = c->next = NULL; + item->prev = NULL; + item->next = NULL; - return c; + return item; } + CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which) { if (which < 0) @@ -1620,7 +2116,7 @@ CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which) return NULL; } - return DetachItemFromArray(array, (size_t)which); + return cJSON_DetachItemViaPointer(array, get_array_item(array, 
(size_t)which)); } CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which) @@ -1630,19 +2126,16 @@ CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which) CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string) { - size_t i = 0; - cJSON *c = object->child; - while (c && cJSON_strcasecmp((unsigned char*)c->string, (const unsigned char*)string)) - { - i++; - c = c->next; - } - if (c) - { - return DetachItemFromArray(object, i); - } + cJSON *to_detach = cJSON_GetObjectItem(object, string); - return NULL; + return cJSON_DetachItemViaPointer(object, to_detach); +} + +CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string) +{ + cJSON *to_detach = cJSON_GetObjectItemCaseSensitive(object, string); + + return cJSON_DetachItemViaPointer(object, to_detach); } CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string) @@ -1650,24 +2143,32 @@ CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string) cJSON_Delete(cJSON_DetachItemFromObject(object, string)); } +CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string) +{ + cJSON_Delete(cJSON_DetachItemFromObjectCaseSensitive(object, string)); +} + /* Replace array/object items with new ones. */ CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem) { - cJSON *c = array->child; - while (c && (which > 0)) + cJSON *after_inserted = NULL; + + if (which < 0) { - c = c->next; - which--; + return; } - if (!c) + + after_inserted = get_array_item(array, (size_t)which); + if (after_inserted == NULL) { - cJSON_AddItemToArray(array, newitem); + add_item_to_array(array, newitem); return; } - newitem->next = c; - newitem->prev = c->prev; - c->prev = newitem; - if (c == array->child) + + newitem->next = after_inserted; + newitem->prev = after_inserted->prev; + after_inserted->prev = newitem; + if (after_inserted == array->child) { array->child = newitem; } @@ -1677,35 +2178,41 @@ CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newit } } -static void ReplaceItemInArray(cJSON *array, size_t which, cJSON *newitem) +CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * replacement) { - cJSON *c = array->child; - while (c && (which > 0)) + if ((parent == NULL) || (replacement == NULL) || (item == NULL)) { - c = c->next; - which--; + return false; } - if (!c) + + if (replacement == item) { - return; + return true; } - newitem->next = c->next; - newitem->prev = c->prev; - if (newitem->next) + + replacement->next = item->next; + replacement->prev = item->prev; + + if (replacement->next != NULL) { - newitem->next->prev = newitem; + replacement->next->prev = replacement; } - if (c == array->child) + if (replacement->prev != NULL) { - array->child = newitem; + replacement->prev->next = replacement; } - else + if (parent->child == item) { - newitem->prev->next = newitem; + parent->child = replacement; } - c->next = c->prev = NULL; - cJSON_Delete(c); + + item->next = NULL; + item->prev = NULL; + cJSON_Delete(item); + + return true; } + CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem) { if (which < 0) @@ -1713,29 +2220,37 @@ CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newi return; } - ReplaceItemInArray(array, (size_t)which, newitem); + cJSON_ReplaceItemViaPointer(array, get_array_item(array, (size_t)which), newitem); } 
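/* The ViaPointer operations above splice around a node the caller already
   holds, so the index- and key-based wrappers now do their linear search
   exactly once. A usage sketch:

       cJSON *array = cJSON_Parse("[1, 2, 3]");
       cJSON *second = cJSON_GetArrayItem(array, 1);
       cJSON_ReplaceItemViaPointer(array, second, cJSON_CreateString("two"));
       // array is now [1,"two",3]; the replaced item was freed by the call
       cJSON_Delete(array);
*/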
-CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem) +static cJSON_bool replace_item_in_object(cJSON *object, const char *string, cJSON *replacement, cJSON_bool case_sensitive) { - size_t i = 0; - cJSON *c = object->child; - while(c && cJSON_strcasecmp((unsigned char*)c->string, (const unsigned char*)string)) + if ((replacement == NULL) || (string == NULL)) { - i++; - c = c->next; + return false; } - if(c) - { - /* free the old string if not const */ - if (!(newitem->type & cJSON_StringIsConst) && newitem->string) - { - global_hooks.deallocate(newitem->string); - } - newitem->string = (char*)cJSON_strdup((const unsigned char*)string, &global_hooks); - ReplaceItemInArray(object, i, newitem); + /* replace the name in the replacement */ + if (!(replacement->type & cJSON_StringIsConst) && (replacement->string != NULL)) + { + cJSON_free(replacement->string); } + replacement->string = (char*)cJSON_strdup((const unsigned char*)string, &global_hooks); + replacement->type &= ~cJSON_StringIsConst; + + cJSON_ReplaceItemViaPointer(object, get_object_item(object, string, case_sensitive), replacement); + + return true; +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object, const char *string, cJSON *newitem) +{ + replace_item_in_object(object, string, newitem, false); +} + +CJSON_PUBLIC(void) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object, const char *string, cJSON *newitem) +{ + replace_item_in_object(object, string, newitem, true); } /* Create basic types: */ @@ -1772,7 +2287,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void) return item; } -CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cjbool b) +CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool b) { cJSON *item = cJSON_New_Item(&global_hooks); if(item) @@ -1796,7 +2311,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num) { item->valueint = INT_MAX; } - else if (num <= INT_MIN) + else if (num <= (double)INT_MIN) { item->valueint = INT_MIN; } @@ -1826,6 +2341,39 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string) return item; } +CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) + { + item->type = cJSON_String | cJSON_IsReference; + item->valuestring = (char*)cast_away_const(string); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child) +{ + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) { + item->type = cJSON_Object | cJSON_IsReference; + item->child = (cJSON*)cast_away_const(child); + } + + return item; +} + +CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child) { + cJSON *item = cJSON_New_Item(&global_hooks); + if (item != NULL) { + item->type = cJSON_Array | cJSON_IsReference; + item->child = (cJSON*)cast_away_const(child); + } + + return item; +} + CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw) { cJSON *item = cJSON_New_Item(&global_hooks); @@ -1873,7 +2421,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (numbers == NULL)) { return NULL; } @@ -1908,7 +2456,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (numbers == NULL)) { return NULL; } @@ -1917,7 +2465,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count) for(i = 0; a && (i < (size_t)count); i++) { - n = 
cJSON_CreateNumber(numbers[i]); + n = cJSON_CreateNumber((double)numbers[i]); if(!n) { cJSON_Delete(a); @@ -1944,7 +2492,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateDoubleArray(const double *numbers, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (numbers == NULL)) { return NULL; } @@ -1980,7 +2528,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char **strings, int count) cJSON *p = NULL; cJSON *a = NULL; - if (count < 0) + if ((count < 0) || (strings == NULL)) { return NULL; } @@ -2010,7 +2558,7 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateStringArray(const char **strings, int count) } /* Duplication */ -CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cjbool recurse) +CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse) { cJSON *newitem = NULL; cJSON *child = NULL; @@ -2089,66 +2637,334 @@ CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cjbool recurse) return NULL; } -CJSON_PUBLIC(void) cJSON_Minify(char *json) +static void skip_oneline_comment(char **input) { - unsigned char *into = (unsigned char*)json; - while (*json) + *input += static_strlen("//"); + + for (; (*input)[0] != '\0'; ++(*input)) { - if (*json == ' ') - { - json++; + if ((*input)[0] == '\n') { + *input += static_strlen("\n"); + return; } - else if (*json == '\t') + } +} + +static void skip_multiline_comment(char **input) +{ + *input += static_strlen("/*"); + + for (; (*input)[0] != '\0'; ++(*input)) + { + if (((*input)[0] == '*') && ((*input)[1] == '/')) { - /* Whitespace characters. */ - json++; + *input += static_strlen("*/"); + return; } - else if (*json == '\r') - { - json++; + } +} + +static void minify_string(char **input, char **output) { + (*output)[0] = (*input)[0]; + *input += static_strlen("\""); + *output += static_strlen("\""); + + + for (; (*input)[0] != '\0'; (void)++(*input), ++(*output)) { + (*output)[0] = (*input)[0]; + + if ((*input)[0] == '\"') { + (*output)[0] = '\"'; + *input += static_strlen("\""); + *output += static_strlen("\""); + return; + } else if (((*input)[0] == '\\') && ((*input)[1] == '\"')) { + (*output)[1] = (*input)[1]; + *input += static_strlen("\""); + *output += static_strlen("\""); } - else if (*json=='\n') + } +} + +CJSON_PUBLIC(void) cJSON_Minify(char *json) +{ + char *into = json; + + if (json == NULL) + { + return; + } + + while (json[0] != '\0') + { + switch (json[0]) { - json++; + case ' ': + case '\t': + case '\r': + case '\n': + json++; + break; + + case '/': + if (json[1] == '/') + { + skip_oneline_comment(&json); + } + else if (json[1] == '*') + { + skip_multiline_comment(&json); + } else { + json++; + } + break; + + case '\"': + minify_string(&json, (char**)&into); + break; + + default: + into[0] = json[0]; + json++; + into++; } - else if ((*json == '/') && (json[1] == '/')) - { - /* double-slash comments, to end of line. */ - while (*json && (*json != '\n')) + } + + /* and null-terminate. 
*/ + *into = '\0'; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Invalid; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_False; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xff) == cJSON_True; +} + + +CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & (cJSON_True | cJSON_False)) != 0; +} +CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_NULL; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Number; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_String; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Array; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Object; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item) +{ + if (item == NULL) + { + return false; + } + + return (item->type & 0xFF) == cJSON_Raw; +} + +CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive) +{ + if ((a == NULL) || (b == NULL) || ((a->type & 0xFF) != (b->type & 0xFF)) || cJSON_IsInvalid(a)) + { + return false; + } + + /* check if type is valid */ + switch (a->type & 0xFF) + { + case cJSON_False: + case cJSON_True: + case cJSON_NULL: + case cJSON_Number: + case cJSON_String: + case cJSON_Raw: + case cJSON_Array: + case cJSON_Object: + break; + + default: + return false; + } + + /* identical objects are equal */ + if (a == b) + { + return true; + } + + switch (a->type & 0xFF) + { + /* in these cases an equal type is enough */ + case cJSON_False: + case cJSON_True: + case cJSON_NULL: + return true; + + case cJSON_Number: + if (a->valuedouble == b->valuedouble) { - json++; + return true; } - } - else if ((*json == '/') && (json[1] == '*')) - { - /* multiline comments. */ - while (*json && !((*json == '*') && (json[1] == '/'))) + return false; + + case cJSON_String: + case cJSON_Raw: + if ((a->valuestring == NULL) || (b->valuestring == NULL)) { - json++; + return false; } - json += 2; - } - else if (*json == '\"') + if (strcmp(a->valuestring, b->valuestring) == 0) + { + return true; + } + + return false; + + case cJSON_Array: { - /* string literals, which are \" sensitive.
*/ - *into++ = (unsigned char)*json++; - while (*json && (*json != '\"')) + cJSON *a_element = a->child; + cJSON *b_element = b->child; + + for (; (a_element != NULL) && (b_element != NULL);) { - if (*json == '\\') + if (!cJSON_Compare(a_element, b_element, case_sensitive)) { - *into++ = (unsigned char)*json++; + return false; } - *into++ = (unsigned char)*json++; + + a_element = a_element->next; + b_element = b_element->next; + } + + /* one of the arrays is longer than the other */ + if (a_element != b_element) { + return false; } - *into++ = (unsigned char)*json++; + + return true; } - else + + case cJSON_Object: { - /* All other characters. */ - *into++ = (unsigned char)*json++; + cJSON *a_element = NULL; + cJSON *b_element = NULL; + cJSON_ArrayForEach(a_element, a) + { + /* TODO This has O(n^2) runtime, which is horrible! */ + b_element = get_object_item(b, a_element->string, case_sensitive); + if (b_element == NULL) + { + return false; + } + + if (!cJSON_Compare(a_element, b_element, case_sensitive)) + { + return false; + } + } + + /* doing this twice, once on a and b to prevent true comparison if a subset of b + * TODO: Do this the proper way, this is just a fix for now */ + cJSON_ArrayForEach(b_element, b) + { + a_element = get_object_item(a, b_element->string, case_sensitive); + if (a_element == NULL) + { + return false; + } + + if (!cJSON_Compare(b_element, a_element, case_sensitive)) + { + return false; + } + } + + return true; } + + default: + return false; } +} - /* and null-terminate. */ - *into = '\0'; +CJSON_PUBLIC(void *) cJSON_malloc(size_t size) +{ + return global_hooks.allocate(size); +} + +CJSON_PUBLIC(void) cJSON_free(void *object) +{ + global_hooks.deallocate(object); } diff --git a/src/cJSON_Utils.c b/src/cJSON_Utils.c index 240daba53f6..65465708d35 100644 --- a/src/cJSON_Utils.c +++ b/src/cJSON_Utils.c @@ -1,125 +1,194 @@ +/* + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+*/ + +/* disable warnings about old C89 functions in MSVC */ +#if !defined(_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) +#define _CRT_SECURE_NO_DEPRECATE +#endif + +#ifdef __GNUCC__ +#pragma GCC visibility push(default) +#endif +#if defined(_MSC_VER) +#pragma warning (push) +/* disable warning about single line comments in system headers */ +#pragma warning (disable : 4001) +#endif + #include #include #include #include #include +#if defined(_MSC_VER) +#pragma warning (pop) +#endif +#ifdef __GNUCC__ +#pragma GCC visibility pop +#endif + #include "switch_cJSON_Utils.h" -static unsigned char* cJSONUtils_strdup(const unsigned char* str) +/* define our own boolean type */ +#ifdef true +#undef true +#endif +#define true ((cJSON_bool)1) + +#ifdef false +#undef false +#endif +#define false ((cJSON_bool)0) + +static unsigned char* cJSONUtils_strdup(const unsigned char* const string) { - size_t len = 0; + size_t length = 0; unsigned char *copy = NULL; - len = strlen((const char*)str) + 1; - if (!(copy = (unsigned char*)malloc(len))) + length = strlen((const char*)string) + sizeof(""); + copy = (unsigned char*) cJSON_malloc(length); + if (copy == NULL) { return NULL; } - memcpy(copy, str, len); + memcpy(copy, string, length); return copy; } -static int cJSONUtils_strcasecmp(const unsigned char *s1, const unsigned char *s2) +/* string comparison which doesn't consider NULL pointers equal */ +static int compare_strings(const unsigned char *string1, const unsigned char *string2, const cJSON_bool case_sensitive) { - if (!s1) + if ((string1 == NULL) || (string2 == NULL)) { - return (s1 == s2) ? 0 : 1; /* both NULL? */ + return 1; } - if (!s2) + + if (string1 == string2) { - return 1; + return 0; } - for(; tolower(*s1) == tolower(*s2); ++s1, ++s2) + + if (case_sensitive) { - if(*s1 == 0) + return strcmp((const char*)string1, (const char*)string2); + } + + for(; tolower(*string1) == tolower(*string2); (void)string1++, string2++) + { + if (*string1 == '\0') { return 0; } } - return tolower(*s1) - tolower(*s2); + return tolower(*string1) - tolower(*string2); } -/* JSON Pointer implementation: */ -static int cJSONUtils_Pstrcasecmp(const unsigned char *a, const unsigned char *e) +/* Compare the next path element of two JSON pointers, two NULL pointers are considered unequal: */ +static cJSON_bool compare_pointers(const unsigned char *name, const unsigned char *pointer, const cJSON_bool case_sensitive) { - if (!a || !e) + if ((name == NULL) || (pointer == NULL)) { - return (a == e) ? 0 : 1; /* both NULL? 
*/
+        return false;
     }
-    for (; *a && *e && (*e != '/'); a++, e++) /* compare until next '/' */
+
+    for (; (*name != '\0') && (*pointer != '\0') && (*pointer != '/'); (void)name++, pointer++) /* compare until next '/' */
     {
-        if (*e == '~')
+        if (*pointer == '~')
         {
             /* check for escaped '~' (~0) and '/' (~1) */
-            if (!((e[1] == '0') && (*a == '~')) && !((e[1] == '1') && (*a == '/')))
+            if (((pointer[1] != '0') || (*name != '~')) && ((pointer[1] != '1') || (*name != '/')))
             {
-                /* invalid escape sequence or wrong character in *a */
-                return 1;
+                /* invalid escape sequence or wrong character in *name */
+                return false;
             }
             else
             {
-                e++;
+                pointer++;
             }
         }
-        else if (tolower(*a) != tolower(*e))
+        else if ((!case_sensitive && (tolower(*name) != tolower(*pointer))) || (case_sensitive && (*name != *pointer)))
         {
-            return 1;
+            return false;
         }
     }
-    if (((*e != 0) && (*e != '/')) != (*a != 0))
+    if (((*pointer != 0) && (*pointer != '/')) != (*name != 0))
     {
         /* one string has ended, the other not */
-        return 1;
+        return false;
     }
 
-    return 0;
+    return true;
 }
 
-static size_t cJSONUtils_PointerEncodedstrlen(const unsigned char *s)
+/* calculate the length of a string if encoded as JSON pointer with ~0 and ~1 escape sequences */
+static size_t pointer_encoded_length(const unsigned char *string)
 {
-    size_t l = 0;
-    for (; *s; s++, l++)
+    size_t length;
+    for (length = 0; *string != '\0'; (void)string++, length++)
     {
-        if ((*s == '~') || (*s == '/'))
+        /* character needs to be escaped? */
+        if ((*string == '~') || (*string == '/'))
         {
-            l++;
+            length++;
         }
     }
-    return l;
+    return length;
 }
 
-static void cJSONUtils_PointerEncodedstrcpy(unsigned char *d, const unsigned char *s)
+/* copy a string while escaping '~' and '/' with ~0 and ~1 JSON pointer escape codes */
+static void encode_string_as_pointer(unsigned char *destination, const unsigned char *source)
 {
-    for (; *s; s++)
+    for (; source[0] != '\0'; (void)source++, destination++)
     {
-        if (*s == '/')
+        if (source[0] == '/')
         {
-            *d++ = '~';
-            *d++ = '1';
+            destination[0] = '~';
+            destination[1] = '1';
+            destination++;
         }
-        else if (*s == '~')
+        else if (source[0] == '~')
         {
-            *d++ = '~';
-            *d++ = '0';
+            destination[0] = '~';
+            destination[1] = '0';
+            destination++;
         }
         else
         {
-            *d++ = *s;
+            destination[0] = source[0];
         }
     }
-    *d = '\0';
+    destination[0] = '\0';
 }
 
-CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *target)
+CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(const cJSON * const object, const cJSON * const target)
 {
-    int type = object->type;
-    size_t c = 0;
-    cJSON *obj = 0;
+    size_t child_index = 0;
+    cJSON *current_child = 0;
+
+    if ((object == NULL) || (target == NULL))
+    {
+        return NULL;
+    }
 
     if (object == target)
     {
@@ -127,42 +196,44 @@ CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *ta
         return (char*)cJSONUtils_strdup((const unsigned char*)"");
     }
 
-    /* recursively search all children of the object */
-    for (obj = object->child; obj; obj = obj->next, c++)
+    /* recursively search all children of the object or array */
+    for (current_child = object->child; current_child != NULL; (void)(current_child = current_child->next), child_index++)
     {
-        unsigned char *found = (unsigned char*)cJSONUtils_FindPointerFromObjectTo(obj, target);
-        if (found)
+        unsigned char *target_pointer = (unsigned char*)cJSONUtils_FindPointerFromObjectTo(current_child, target);
+        /* found the target?
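+         * For illustration (an aside added here, not from upstream cJSON): in a
+         * document like {"a": {"b": [10, 20]}} where the target is the number 20,
+         * this recursion returns the JSON pointer "/a/b/1", prepending one path
+         * segment per level on the way back up and escaping '~' and '/' in object
+         * keys as ~0 and ~1.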
*/ + if (target_pointer != NULL) { - if ((type & 0xFF) == cJSON_Array) + if (cJSON_IsArray(object)) { /* reserve enough memory for a 64 bit integer + '/' and '\0' */ - unsigned char *ret = (unsigned char*)malloc(strlen((char*)found) + 23); + unsigned char *full_pointer = (unsigned char*)cJSON_malloc(strlen((char*)target_pointer) + 20 + sizeof("/")); /* check if conversion to unsigned long is valid * This should be eliminated at compile time by dead code elimination * if size_t is an alias of unsigned long, or if it is bigger */ - if (c > ULONG_MAX) + if (child_index > ULONG_MAX) { - free(found); + cJSON_free(target_pointer); return NULL; } - sprintf((char*)ret, "/%lu%s", (unsigned long)c, found); /* / */ - free(found); + sprintf((char*)full_pointer, "/%lu%s", (unsigned long)child_index, target_pointer); /* / */ + cJSON_free(target_pointer); - return (char*)ret; + return (char*)full_pointer; } - else if ((type & 0xFF) == cJSON_Object) + + if (cJSON_IsObject(object)) { - unsigned char *ret = (unsigned char*)malloc(strlen((char*)found) + cJSONUtils_PointerEncodedstrlen((unsigned char*)obj->string) + 2); - *ret = '/'; - cJSONUtils_PointerEncodedstrcpy(ret + 1, (unsigned char*)obj->string); - strcat((char*)ret, (char*)found); - free(found); + unsigned char *full_pointer = (unsigned char*)cJSON_malloc(strlen((char*)target_pointer) + pointer_encoded_length((unsigned char*)current_child->string) + 2); + full_pointer[0] = '/'; + encode_string_as_pointer(full_pointer + 1, (unsigned char*)current_child->string); + strcat((char*)full_pointer, (char*)target_pointer); + cJSON_free(target_pointer); - return (char*)ret; + return (char*)full_pointer; } /* reached leaf of the tree, found nothing */ - free(found); + cJSON_free(target_pointer); return NULL; } } @@ -171,392 +242,881 @@ CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *ta return NULL; } -CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON *object, const char *pointer) +/* non broken version of cJSON_GetArrayItem */ +static cJSON *get_array_item(const cJSON *array, size_t item) +{ + cJSON *child = array ? 
array->child : NULL;
+    while ((child != NULL) && (item > 0))
+    {
+        item--;
+        child = child->next;
+    }
+
+    return child;
+}
+
+static cJSON_bool decode_array_index_from_pointer(const unsigned char * const pointer, size_t * const index)
 {
+    size_t parsed_index = 0;
+    size_t position = 0;
+
+    if ((pointer[0] == '0') && ((pointer[1] != '\0') && (pointer[1] != '/')))
+    {
+        /* leading zeroes are not permitted */
+        return 0;
+    }
+
+    for (position = 0; (pointer[position] >= '0') && (pointer[position] <= '9'); position++)
+    {
+        parsed_index = (10 * parsed_index) + (size_t)(pointer[position] - '0');
+    }
+
+    if ((pointer[position] != '\0') && (pointer[position] != '/'))
+    {
+        return 0;
+    }
+
+    *index = parsed_index;
+
+    return 1;
+}
+
+static cJSON *get_item_from_pointer(cJSON * const object, const char * pointer, const cJSON_bool case_sensitive)
+{
+    cJSON *current_element = object;
+
+    if (pointer == NULL)
+    {
+        return NULL;
+    }
+
     /* follow path of the pointer */
-    while ((*pointer++ == '/') && object)
+    while ((pointer[0] == '/') && (current_element != NULL))
     {
-        if ((object->type & 0xFF) == cJSON_Array)
+        pointer++;
+        if (cJSON_IsArray(current_element))
         {
-            size_t which = 0;
-            /* parse array index */
-            while ((*pointer >= '0') && (*pointer <= '9'))
-            {
-                which = (10 * which) + (size_t)(*pointer++ - '0');
-            }
-            if (*pointer && (*pointer != '/'))
-            {
-                /* not end of string or new path token */
-                return NULL;
-            }
-            if (which > INT_MAX)
+            size_t index = 0;
+            if (!decode_array_index_from_pointer((const unsigned char*)pointer, &index))
             {
                 return NULL;
             }
-            object = cJSON_GetArrayItem(object, (int)which);
+
+            current_element = get_array_item(current_element, index);
         }
-        else if ((object->type & 0xFF) == cJSON_Object)
+        else if (cJSON_IsObject(current_element))
         {
-            object = object->child;
+            current_element = current_element->child;
             /* GetObjectItem. */
-            while (object && cJSONUtils_Pstrcasecmp((unsigned char*)object->string, (const unsigned char*)pointer))
+            while ((current_element != NULL) && !compare_pointers((unsigned char*)current_element->string, (const unsigned char*)pointer, case_sensitive))
             {
-                object = object->next;
-            }
-            /* skip to the next path token or end of string */
-            while (*pointer && (*pointer != '/'))
-            {
-                pointer++;
+                current_element = current_element->next;
             }
         }
         else
         {
             return NULL;
         }
+
+        /* skip to the next path token or end of string */
+        while ((pointer[0] != '\0') && (pointer[0] != '/'))
+        {
+            pointer++;
+        }
     }
 
-    return object;
+    return current_element;
+}
+
+CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON * const object, const char *pointer)
+{
+    return get_item_from_pointer(object, pointer, false);
+}
+
+CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointerCaseSensitive(cJSON * const object, const char *pointer)
+{
+    return get_item_from_pointer(object, pointer, true);
 }
 
 /* JSON Patch implementation. */
-static void cJSONUtils_InplaceDecodePointerString(unsigned char *string)
+static void decode_pointer_inplace(unsigned char *string)
 {
-    unsigned char *s2 = string;
+    unsigned char *decoded_string = string;
 
     if (string == NULL)
     {
         return;
     }
 
-    for (; *string; s2++, string++)
+    for (; *string; (void)decoded_string++, string++)
     {
-        *s2 = (*string != '~')
-            ? (*string)
-            : ((*(++string) == '0')
-                    ? '~'
-                    : '/');
+        if (string[0] == '~')
+        {
+            if (string[1] == '0')
+            {
+                decoded_string[0] = '~';
+            }
+            else if (string[1] == '1')
+            {
+                decoded_string[0] = '/';
+            }
+            else
+            {
+                /* invalid escape sequence */
+                return;
+            }
+
+            string++;
+        }
+        else
+        {
+            decoded_string[0] = *string;
+        }
     }
-    *s2 = '\0';
+    decoded_string[0] = '\0';
 }
 
-static cJSON *cJSONUtils_PatchDetach(cJSON *object, const unsigned char *path)
+/* non-broken cJSON_DetachItemFromArray */
+static cJSON *detach_item_from_array(cJSON *array, size_t which)
 {
-    unsigned char *parentptr = NULL;
-    unsigned char *childptr = NULL;
+    cJSON *c = array->child;
+    while (c && (which > 0))
+    {
+        c = c->next;
+        which--;
+    }
+    if (!c)
+    {
+        /* item doesn't exist */
+        return NULL;
+    }
+    if (c->prev)
+    {
+        /* not the first element */
+        c->prev->next = c->next;
+    }
+    if (c->next)
+    {
+        c->next->prev = c->prev;
+    }
+    if (c==array->child)
+    {
+        array->child = c->next;
+    }
+    /* make sure the detached item doesn't point anywhere anymore */
+    c->prev = c->next = NULL;
+
+    return c;
+}
+
+/* detach an item at the given path */
+static cJSON *detach_path(cJSON *object, const unsigned char *path, const cJSON_bool case_sensitive)
+{
+    unsigned char *parent_pointer = NULL;
+    unsigned char *child_pointer = NULL;
     cJSON *parent = NULL;
-    cJSON *ret = NULL;
+    cJSON *detached_item = NULL;
 
     /* copy path and split it in parent and child */
-    parentptr = cJSONUtils_strdup(path);
-    if (parentptr == NULL) {
-        return NULL;
+    parent_pointer = cJSONUtils_strdup(path);
+    if (parent_pointer == NULL) {
+        goto cleanup;
     }
 
-    childptr = (unsigned char*)strrchr((char*)parentptr, '/'); /* last '/' */
-    if (childptr == NULL)
+    child_pointer = (unsigned char*)strrchr((char*)parent_pointer, '/'); /* last '/' */
+    if (child_pointer == NULL)
     {
-        free(parentptr);
-        return NULL;
+        goto cleanup;
     }
     /* split strings */
-    *childptr++ = '\0';
+    child_pointer[0] = '\0';
+    child_pointer++;
 
-    parent = cJSONUtils_GetPointer(object, (char*)parentptr);
-    cJSONUtils_InplaceDecodePointerString(childptr);
+    parent = get_item_from_pointer(object, (char*)parent_pointer, case_sensitive);
+    decode_pointer_inplace(child_pointer);
 
-    if (!parent)
+    if (cJSON_IsArray(parent))
+    {
+        size_t index = 0;
+        if (!decode_array_index_from_pointer(child_pointer, &index))
+        {
+            goto cleanup;
+        }
+        detached_item = detach_item_from_array(parent, index);
+    }
+    else if (cJSON_IsObject(parent))
+    {
+        detached_item = cJSON_DetachItemFromObject(parent, (char*)child_pointer);
+    }
+    else
     {
         /* Couldn't find object to remove child from. */
-        ret = NULL;
+        goto cleanup;
+    }
+
+cleanup:
+    if (parent_pointer != NULL)
+    {
+        cJSON_free(parent_pointer);
+    }
+
+    return detached_item;
+}
+
+/* sort lists using mergesort */
+static cJSON *sort_list(cJSON *list, const cJSON_bool case_sensitive)
+{
+    cJSON *first = list;
+    cJSON *second = list;
+    cJSON *current_item = list;
+    cJSON *result = list;
+    cJSON *result_tail = NULL;
+
+    if ((list == NULL) || (list->next == NULL))
+    {
+        /* One entry is sorted already. */
+        return result;
+    }
+
+    while ((current_item != NULL) && (current_item->next != NULL) && (compare_strings((unsigned char*)current_item->string, (unsigned char*)current_item->next->string, case_sensitive) < 0))
+    {
+        /* Test for list sorted. */
+        current_item = current_item->next;
     }
-    else if ((parent->type & 0xFF) == cJSON_Array)
+    if ((current_item == NULL) || (current_item->next == NULL))
     {
-        ret = cJSON_DetachItemFromArray(parent, atoi((char*)childptr));
+        /* Leave sorted lists unmodified.
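+         *
+         * The walk below is the classic fast/slow pointer split: "second" advances
+         * one node per iteration while "current_item" advances two, so when
+         * current_item runs off the end, second points at the middle of the child
+         * list. The two halves are then sorted recursively and merged by key.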
*/ + return result; } - else if ((parent->type & 0xFF) == cJSON_Object) + + /* reset pointer to the beginning */ + current_item = list; + while (current_item != NULL) { - ret = cJSON_DetachItemFromObject(parent, (char*)childptr); + /* Walk two pointers to find the middle. */ + second = second->next; + current_item = current_item->next; + /* advances current_item two steps at a time */ + if (current_item != NULL) + { + current_item = current_item->next; + } + } + if ((second != NULL) && (second->prev != NULL)) + { + /* Split the lists */ + second->prev->next = NULL; + second->prev = NULL; } - free(parentptr); - /* return the detachted item */ - return ret; + /* Recursively sort the sub-lists. */ + first = sort_list(first, case_sensitive); + second = sort_list(second, case_sensitive); + result = NULL; + + /* Merge the sub-lists */ + while ((first != NULL) && (second != NULL)) + { + cJSON *smaller = NULL; + if (compare_strings((unsigned char*)first->string, (unsigned char*)second->string, case_sensitive) < 0) + { + smaller = first; + } + else + { + smaller = second; + } + + if (result == NULL) + { + /* start merged list with the smaller element */ + result_tail = smaller; + result = smaller; + } + else + { + /* add smaller element to the list */ + result_tail->next = smaller; + smaller->prev = result_tail; + result_tail = smaller; + } + + if (first == smaller) + { + first = first->next; + } + else + { + second = second->next; + } + } + + if (first != NULL) + { + /* Append rest of first list. */ + if (result == NULL) + { + return first; + } + result_tail->next = first; + first->prev = result_tail; + } + if (second != NULL) + { + /* Append rest of second list */ + if (result == NULL) + { + return second; + } + result_tail->next = second; + second->prev = result_tail; + } + + return result; +} + +static void sort_object(cJSON * const object, const cJSON_bool case_sensitive) +{ + if (object == NULL) + { + return; + } + object->child = sort_list(object->child, case_sensitive); } -static int cJSONUtils_Compare(cJSON *a, cJSON *b) +static cJSON_bool compare_json(cJSON *a, cJSON *b, const cJSON_bool case_sensitive) { if ((a == NULL) || (b == NULL) || ((a->type & 0xFF) != (b->type & 0xFF))) { /* mismatched type. */ - return -1; + return false; } switch (a->type & 0xFF) { case cJSON_Number: /* numeric mismatch. */ - return ((a->valueint != b->valueint) || (a->valuedouble != b->valuedouble)) ? -2 : 0; + if ((a->valueint != b->valueint) || (a->valuedouble != b->valuedouble)) + { + return false; + } + else + { + return true; + } + case cJSON_String: /* string mismatch. */ - return (strcmp(a->valuestring, b->valuestring) != 0) ? -3 : 0; + if (strcmp(a->valuestring, b->valuestring) != 0) + { + return false; + } + else + { + return true; + } + case cJSON_Array: - for (a = a->child, b = b->child; a && b; a = a->next, b = b->next) + for ((void)(a = a->child), b = b->child; (a != NULL) && (b != NULL); (void)(a = a->next), b = b->next) { - int err = cJSONUtils_Compare(a, b); - if (err) + cJSON_bool identical = compare_json(a, b, case_sensitive); + if (!identical) { - return err; + return false; } } + /* array size mismatch? (one of both children is not NULL) */ - return (a || b) ? 
-4 : 0; + if ((a != NULL) || (b != NULL)) + { + return false; + } + else + { + return true; + } + case cJSON_Object: - cJSONUtils_SortObject(a); - cJSONUtils_SortObject(b); - a = a->child; - b = b->child; - while (a && b) + sort_object(a, case_sensitive); + sort_object(b, case_sensitive); + for ((void)(a = a->child), b = b->child; (a != NULL) && (b != NULL); (void)(a = a->next), b = b->next) { - int err = 0; + cJSON_bool identical = false; /* compare object keys */ - if (cJSONUtils_strcasecmp((unsigned char*)a->string, (unsigned char*)b->string)) + if (compare_strings((unsigned char*)a->string, (unsigned char*)b->string, case_sensitive)) { /* missing member */ - return -6; + return false; } - err = cJSONUtils_Compare(a, b); - if (err) + identical = compare_json(a, b, case_sensitive); + if (!identical) { - return err; + return false; } - a = a->next; - b = b->next; } + /* object length mismatch (one of both children is not null) */ - return (a || b) ? -5 : 0; + if ((a != NULL) || (b != NULL)) + { + return false; + } + else + { + return true; + } default: break; } + /* null, true or false */ - return 0; + return true; } -static int cJSONUtils_ApplyPatch(cJSON *object, cJSON *patch) +/* non broken version of cJSON_InsertItemInArray */ +static cJSON_bool insert_item_in_array(cJSON *array, size_t which, cJSON *newitem) { - cJSON *op = NULL; - cJSON *path = NULL; - cJSON *value = NULL; - cJSON *parent = NULL; - int opcode = 0; - unsigned char *parentptr = NULL; - unsigned char *childptr = NULL; + cJSON *child = array->child; + while (child && (which > 0)) + { + child = child->next; + which--; + } + if (which > 0) + { + /* item is after the end of the array */ + return 0; + } + if (child == NULL) + { + cJSON_AddItemToArray(array, newitem); + return 1; + } + + /* insert into the linked list */ + newitem->next = child; + newitem->prev = child->prev; + child->prev = newitem; - op = cJSON_GetObjectItem(patch, "op"); - path = cJSON_GetObjectItem(patch, "path"); - if (!op || !path) + /* was it at the beginning */ + if (child == array->child) { - /* malformed patch. 
*/ - return 2; + array->child = newitem; + } + else + { + newitem->prev->next = newitem; + } + + return 1; +} + +static cJSON *get_object_item(const cJSON * const object, const char* name, const cJSON_bool case_sensitive) +{ + if (case_sensitive) + { + return cJSON_GetObjectItemCaseSensitive(object, name); } - /* decode operation */ - if (!strcmp(op->valuestring, "add")) + return cJSON_GetObjectItem(object, name); +} + +enum patch_operation { INVALID, ADD, REMOVE, REPLACE, MOVE, COPY, TEST }; + +static enum patch_operation decode_patch_operation(const cJSON * const patch, const cJSON_bool case_sensitive) +{ + cJSON *operation = get_object_item(patch, "op", case_sensitive); + if (!cJSON_IsString(operation)) { - opcode = 0; + return INVALID; } - else if (!strcmp(op->valuestring, "remove")) + + if (strcmp(operation->valuestring, "add") == 0) { - opcode = 1; + return ADD; } - else if (!strcmp(op->valuestring, "replace")) + + if (strcmp(operation->valuestring, "remove") == 0) { - opcode = 2; + return REMOVE; } - else if (!strcmp(op->valuestring, "move")) + + if (strcmp(operation->valuestring, "replace") == 0) { - opcode = 3; + return REPLACE; } - else if (!strcmp(op->valuestring, "copy")) + + if (strcmp(operation->valuestring, "move") == 0) { - opcode = 4; + return MOVE; } - else if (!strcmp(op->valuestring, "test")) + + if (strcmp(operation->valuestring, "copy") == 0) + { + return COPY; + } + + if (strcmp(operation->valuestring, "test") == 0) + { + return TEST; + } + + return INVALID; +} + +/* overwrite and existing item with another one and free resources on the way */ +static void overwrite_item(cJSON * const root, const cJSON replacement) +{ + if (root == NULL) + { + return; + } + + if (root->string != NULL) + { + cJSON_free(root->string); + } + if (root->valuestring != NULL) + { + cJSON_free(root->valuestring); + } + if (root->child != NULL) + { + cJSON_Delete(root->child); + } + + memcpy(root, &replacement, sizeof(cJSON)); +} + +static int apply_patch(cJSON *object, const cJSON *patch, const cJSON_bool case_sensitive) +{ + cJSON *path = NULL; + cJSON *value = NULL; + cJSON *parent = NULL; + enum patch_operation opcode = INVALID; + unsigned char *parent_pointer = NULL; + unsigned char *child_pointer = NULL; + int status = 0; + + path = get_object_item(patch, "path", case_sensitive); + if (!cJSON_IsString(path)) + { + /* malformed patch. */ + status = 2; + goto cleanup; + } + + opcode = decode_patch_operation(patch, case_sensitive); + if (opcode == INVALID) + { + status = 3; + goto cleanup; + } + else if (opcode == TEST) { /* compare value: {...} with the given path */ - return cJSONUtils_Compare(cJSONUtils_GetPointer(object, path->valuestring), cJSON_GetObjectItem(patch, "value")); + status = !compare_json(get_item_from_pointer(object, path->valuestring, case_sensitive), get_object_item(patch, "value", case_sensitive), case_sensitive); + goto cleanup; } - else + + /* special case for replacing the root */ + if (path->valuestring[0] == '\0') { - /* unknown opcode. */ - return 3; + if (opcode == REMOVE) + { + static const cJSON invalid = { NULL, NULL, NULL, cJSON_Invalid, NULL, 0, 0, NULL}; + + overwrite_item(object, invalid); + + status = 0; + goto cleanup; + } + + if ((opcode == REPLACE) || (opcode == ADD)) + { + value = get_object_item(patch, "value", case_sensitive); + if (value == NULL) + { + /* missing "value" for add/replace. */ + status = 7; + goto cleanup; + } + + value = cJSON_Duplicate(value, 1); + if (value == NULL) + { + /* out of memory for add/replace. 
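+             *
+             * This branch handles the RFC 6902 special case of an empty path (""),
+             * which addresses the whole document: for example, the hypothetical
+             * patch {"op": "replace", "path": "", "value": {"x": 1}} swaps the
+             * entire root for {"x": 1} via overwrite_item() below.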
*/ + status = 8; + goto cleanup; + } + + overwrite_item(object, *value); + + /* delete the duplicated value */ + cJSON_free(value); + value = NULL; + + /* the string "value" isn't needed */ + if (object->string != NULL) + { + cJSON_free(object->string); + object->string = NULL; + } + + status = 0; + goto cleanup; + } } - /* Remove/Replace */ - if ((opcode == 1) || (opcode == 2)) + if ((opcode == REMOVE) || (opcode == REPLACE)) { /* Get rid of old. */ - cJSON_Delete(cJSONUtils_PatchDetach(object, (unsigned char*)path->valuestring)); - if (opcode == 1) + cJSON *old_item = detach_path(object, (unsigned char*)path->valuestring, case_sensitive); + if (old_item == NULL) { - /* For Remove, this is job done. */ - return 0; + status = 13; + goto cleanup; + } + cJSON_Delete(old_item); + if (opcode == REMOVE) + { + /* For Remove, this job is done. */ + status = 0; + goto cleanup; } } /* Copy/Move uses "from". */ - if ((opcode == 3) || (opcode == 4)) + if ((opcode == MOVE) || (opcode == COPY)) { - cJSON *from = cJSON_GetObjectItem(patch, "from"); - if (!from) + cJSON *from = get_object_item(patch, "from", case_sensitive); + if (from == NULL) { /* missing "from" for copy/move. */ - return 4; + status = 4; + goto cleanup; } - if (opcode == 3) + if (opcode == MOVE) { - /* move */ - value = cJSONUtils_PatchDetach(object, (unsigned char*)from->valuestring); + value = detach_path(object, (unsigned char*)from->valuestring, case_sensitive); } - if (opcode == 4) + if (opcode == COPY) { - /* copy */ - value = cJSONUtils_GetPointer(object, from->valuestring); + value = get_item_from_pointer(object, from->valuestring, case_sensitive); } - if (!value) + if (value == NULL) { /* missing "from" for copy/move. */ - return 5; + status = 5; + goto cleanup; } - if (opcode == 4) + if (opcode == COPY) { value = cJSON_Duplicate(value, 1); } - if (!value) + if (value == NULL) { /* out of memory for copy/move. */ - return 6; + status = 6; + goto cleanup; } } else /* Add/Replace uses "value". */ { - value = cJSON_GetObjectItem(patch, "value"); - if (!value) + value = get_object_item(patch, "value", case_sensitive); + if (value == NULL) { /* missing "value" for add/replace. */ - return 7; + status = 7; + goto cleanup; } value = cJSON_Duplicate(value, 1); - if (!value) + if (value == NULL) { /* out of memory for add/replace. */ - return 8; + status = 8; + goto cleanup; } } /* Now, just add "value" to "path". */ /* split pointer in parent and child */ - parentptr = cJSONUtils_strdup((unsigned char*)path->valuestring); - childptr = (unsigned char*)strrchr((char*)parentptr, '/'); - if (childptr) + parent_pointer = cJSONUtils_strdup((unsigned char*)path->valuestring); + child_pointer = (unsigned char*)strrchr((char*)parent_pointer, '/'); + if (child_pointer != NULL) { - *childptr++ = '\0'; + child_pointer[0] = '\0'; + child_pointer++; } - parent = cJSONUtils_GetPointer(object, (char*)parentptr); - cJSONUtils_InplaceDecodePointerString(childptr); + parent = get_item_from_pointer(object, (char*)parent_pointer, case_sensitive); + decode_pointer_inplace(child_pointer); /* add, remove, replace, move, copy, test. */ - if (!parent) + if ((parent == NULL) || (child_pointer == NULL)) { /* Couldn't find object to add to. 
*/ - free(parentptr); - cJSON_Delete(value); - return 9; + status = 9; + goto cleanup; } - else if ((parent->type & 0xFF) == cJSON_Array) + else if (cJSON_IsArray(parent)) { - if (!strcmp((char*)childptr, "-")) + if (strcmp((char*)child_pointer, "-") == 0) { cJSON_AddItemToArray(parent, value); + value = NULL; + } + else + { + size_t index = 0; + if (!decode_array_index_from_pointer(child_pointer, &index)) + { + status = 11; + goto cleanup; + } + + if (!insert_item_in_array(parent, index, value)) + { + status = 10; + goto cleanup; + } + value = NULL; + } + } + else if (cJSON_IsObject(parent)) + { + if (case_sensitive) + { + cJSON_DeleteItemFromObjectCaseSensitive(parent, (char*)child_pointer); } else { - cJSON_InsertItemInArray(parent, atoi((char*)childptr), value); + cJSON_DeleteItemFromObject(parent, (char*)child_pointer); } + cJSON_AddItemToObject(parent, (char*)child_pointer, value); + value = NULL; } - else if ((parent->type & 0xFF) == cJSON_Object) + else /* parent is not an object */ { - cJSON_DeleteItemFromObject(parent, (char*)childptr); - cJSON_AddItemToObject(parent, (char*)childptr, value); + /* Couldn't find object to add to. */ + status = 9; + goto cleanup; } - else + +cleanup: + if (value != NULL) { cJSON_Delete(value); } - free(parentptr); + if (parent_pointer != NULL) + { + cJSON_free(parent_pointer); + } - return 0; + return status; } -CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON *object, cJSON *patches) +CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON * const object, const cJSON * const patches) { - int err = 0; + const cJSON *current_patch = NULL; + int status = 0; - if (patches == NULL) + if (!cJSON_IsArray(patches)) { + /* malformed patches. */ return 1; } - if ((patches->type & 0xFF) != cJSON_Array) + if (patches != NULL) + { + current_patch = patches->child; + } + + while (current_patch != NULL) + { + status = apply_patch(object, current_patch, false); + if (status != 0) + { + return status; + } + current_patch = current_patch->next; + } + + return 0; +} + +CJSON_PUBLIC(int) cJSONUtils_ApplyPatchesCaseSensitive(cJSON * const object, const cJSON * const patches) +{ + const cJSON *current_patch = NULL; + int status = 0; + + if (!cJSON_IsArray(patches)) { /* malformed patches. 
*/ return 1; } - if (patches) + + if (patches != NULL) { - patches = patches->child; + current_patch = patches->child; } - while (patches) + + while (current_patch != NULL) { - if ((err = cJSONUtils_ApplyPatch(object, patches))) + status = apply_patch(object, current_patch, true); + if (status != 0) { - return err; + return status; } - patches = patches->next; + current_patch = current_patch->next; } return 0; } -static void cJSONUtils_GeneratePatch(cJSON *patches, const unsigned char *op, const unsigned char *path, const unsigned char *suffix, cJSON *val) +static void compose_patch(cJSON * const patches, const unsigned char * const operation, const unsigned char * const path, const unsigned char *suffix, const cJSON * const value) { - cJSON *patch = cJSON_CreateObject(); - cJSON_AddItemToObject(patch, "op", cJSON_CreateString((const char*)op)); - if (suffix) + cJSON *patch = NULL; + + if ((patches == NULL) || (operation == NULL) || (path == NULL)) + { + return; + } + + patch = cJSON_CreateObject(); + if (patch == NULL) { - unsigned char *newpath = (unsigned char*)malloc(strlen((const char*)path) + cJSONUtils_PointerEncodedstrlen(suffix) + 2); - cJSONUtils_PointerEncodedstrcpy(newpath + sprintf((char*)newpath, "%s/", (const char*)path), suffix); - cJSON_AddItemToObject(patch, "path", cJSON_CreateString((const char*)newpath)); - free(newpath); + return; } - else + cJSON_AddItemToObject(patch, "op", cJSON_CreateString((const char*)operation)); + + if (suffix == NULL) { cJSON_AddItemToObject(patch, "path", cJSON_CreateString((const char*)path)); } - if (val) + else { - cJSON_AddItemToObject(patch, "value", cJSON_Duplicate(val, 1)); + size_t suffix_length = pointer_encoded_length(suffix); + size_t path_length = strlen((const char*)path); + unsigned char *full_path = (unsigned char*)cJSON_malloc(path_length + suffix_length + sizeof("/")); + + sprintf((char*)full_path, "%s/", (const char*)path); + encode_string_as_pointer(full_path + path_length + 1, suffix); + + cJSON_AddItemToObject(patch, "path", cJSON_CreateString((const char*)full_path)); + cJSON_free(full_path); + } + + if (value != NULL) + { + cJSON_AddItemToObject(patch, "value", cJSON_Duplicate(value, 1)); } cJSON_AddItemToArray(patches, patch); } -CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON *array, const char *op, const char *path, cJSON *val) +CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON * const array, const char * const operation, const char * const path, const cJSON * const value) { - cJSONUtils_GeneratePatch(array, (const unsigned char*)op, (const unsigned char*)path, 0, val); + compose_patch(array, (const unsigned char*)operation, (const unsigned char*)path, NULL, value); } -static void cJSONUtils_CompareToPatch(cJSON *patches, const unsigned char *path, cJSON *from, cJSON *to) +static void create_patches(cJSON * const patches, const unsigned char * const path, cJSON * const from, cJSON * const to, const cJSON_bool case_sensitive) { if ((from == NULL) || (to == NULL)) { @@ -565,102 +1125,127 @@ static void cJSONUtils_CompareToPatch(cJSON *patches, const unsigned char *path, if ((from->type & 0xFF) != (to->type & 0xFF)) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"replace", path, 0, to); + compose_patch(patches, (const unsigned char*)"replace", path, 0, to); return; } - switch ((from->type & 0xFF)) + switch (from->type & 0xFF) { case cJSON_Number: if ((from->valueint != to->valueint) || (from->valuedouble != to->valuedouble)) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"replace", path, 
0, to); + compose_patch(patches, (const unsigned char*)"replace", path, NULL, to); } return; case cJSON_String: if (strcmp(from->valuestring, to->valuestring) != 0) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"replace", path, 0, to); + compose_patch(patches, (const unsigned char*)"replace", path, NULL, to); } return; case cJSON_Array: { - size_t c = 0; - unsigned char *newpath = (unsigned char*)malloc(strlen((const char*)path) + 23); /* Allow space for 64bit int. */ - /* generate patches for all array elements that exist in "from" and "to" */ - for (c = 0, from = from->child, to = to->child; from && to; from = from->next, to = to->next, c++) + size_t index = 0; + cJSON *from_child = from->child; + cJSON *to_child = to->child; + unsigned char *new_path = (unsigned char*)cJSON_malloc(strlen((const char*)path) + 20 + sizeof("/")); /* Allow space for 64bit int. log10(2^64) = 20 */ + + /* generate patches for all array elements that exist in both "from" and "to" */ + for (index = 0; (from_child != NULL) && (to_child != NULL); (void)(from_child = from_child->next), (void)(to_child = to_child->next), index++) { /* check if conversion to unsigned long is valid * This should be eliminated at compile time by dead code elimination * if size_t is an alias of unsigned long, or if it is bigger */ - if (c > ULONG_MAX) + if (index > ULONG_MAX) { - free(newpath); + cJSON_free(new_path); return; } - sprintf((char*)newpath, "%s/%lu", path, (unsigned long)c); /* path of the current array element */ - cJSONUtils_CompareToPatch(patches, newpath, from, to); + sprintf((char*)new_path, "%s/%lu", path, (unsigned long)index); /* path of the current array element */ + create_patches(patches, new_path, from_child, to_child, case_sensitive); } + /* remove leftover elements from 'from' that are not in 'to' */ - for (; from; from = from->next, c++) + for (; (from_child != NULL); (void)(from_child = from_child->next)) { /* check if conversion to unsigned long is valid * This should be eliminated at compile time by dead code elimination * if size_t is an alias of unsigned long, or if it is bigger */ - if (c > ULONG_MAX) + if (index > ULONG_MAX) { - free(newpath); + cJSON_free(new_path); return; } - sprintf((char*)newpath, "%lu", (unsigned long)c); - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"remove", path, newpath, 0); + sprintf((char*)new_path, "%lu", (unsigned long)index); + compose_patch(patches, (const unsigned char*)"remove", path, new_path, NULL); } /* add new elements in 'to' that were not in 'from' */ - for (; to; to = to->next, c++) + for (; (to_child != NULL); (void)(to_child = to_child->next), index++) { - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"add", path, (const unsigned char*)"-", to); + compose_patch(patches, (const unsigned char*)"add", path, (const unsigned char*)"-", to_child); } - free(newpath); + cJSON_free(new_path); return; } case cJSON_Object: { - cJSON *a = NULL; - cJSON *b = NULL; - cJSONUtils_SortObject(from); - cJSONUtils_SortObject(to); + cJSON *from_child = NULL; + cJSON *to_child = NULL; + sort_object(from, case_sensitive); + sort_object(to, case_sensitive); - a = from->child; - b = to->child; + from_child = from->child; + to_child = to->child; /* for all object values in the object with more of them */ - while (a || b) + while ((from_child != NULL) || (to_child != NULL)) { - int diff = (!a) ? 1 : ((!b) ? 
-1 : cJSONUtils_strcasecmp((unsigned char*)a->string, (unsigned char*)b->string)); - if (!diff) + int diff; + if (from_child == NULL) + { + diff = 1; + } + else if (to_child == NULL) + { + diff = -1; + } + else + { + diff = compare_strings((unsigned char*)from_child->string, (unsigned char*)to_child->string, case_sensitive); + } + + if (diff == 0) { /* both object keys are the same */ - unsigned char *newpath = (unsigned char*)malloc(strlen((const char*)path) + cJSONUtils_PointerEncodedstrlen((unsigned char*)a->string) + 2); - cJSONUtils_PointerEncodedstrcpy(newpath + sprintf((char*)newpath, "%s/", path), (unsigned char*)a->string); + size_t path_length = strlen((const char*)path); + size_t from_child_name_length = pointer_encoded_length((unsigned char*)from_child->string); + unsigned char *new_path = (unsigned char*)cJSON_malloc(path_length + from_child_name_length + sizeof("/")); + + sprintf((char*)new_path, "%s/", path); + encode_string_as_pointer(new_path + path_length + 1, (unsigned char*)from_child->string); + /* create a patch for the element */ - cJSONUtils_CompareToPatch(patches, newpath, a, b); - free(newpath); - a = a->next; - b = b->next; + create_patches(patches, new_path, from_child, to_child, case_sensitive); + cJSON_free(new_path); + + from_child = from_child->next; + to_child = to_child->next; } else if (diff < 0) { /* object element doesn't exist in 'to' --> remove it */ - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"remove", path, (unsigned char*)a->string, 0); - a = a->next; + compose_patch(patches, (const unsigned char*)"remove", path, (unsigned char*)from_child->string, NULL); + + from_child = from_child->next; } else { /* object element doesn't exist in 'from' --> add it */ - cJSONUtils_GeneratePatch(patches, (const unsigned char*)"add", path, (unsigned char*)b->string, b); - b = b->next; + compose_patch(patches, (const unsigned char*)"add", path, (unsigned char*)to_child->string, to_child); + + to_child = to_child->next; } } return; @@ -671,211 +1256,199 @@ static void cJSONUtils_CompareToPatch(cJSON *patches, const unsigned char *path, } } -CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON *from, cJSON *to) +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON * const from, cJSON * const to) { - cJSON *patches = cJSON_CreateArray(); - cJSONUtils_CompareToPatch(patches, (const unsigned char*)"", from, to); + cJSON *patches = NULL; + + if ((from == NULL) || (to == NULL)) + { + return NULL; + } + + patches = cJSON_CreateArray(); + create_patches(patches, (const unsigned char*)"", from, to, false); return patches; } -/* sort lists using mergesort */ -static cJSON *cJSONUtils_SortList(cJSON *list) +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatchesCaseSensitive(cJSON * const from, cJSON * const to) { - cJSON *first = list; - cJSON *second = list; - cJSON *ptr = list; + cJSON *patches = NULL; - if (!list || !list->next) + if ((from == NULL) || (to == NULL)) { - /* One entry is sorted already. */ - return list; + return NULL; } - while (ptr && ptr->next && (cJSONUtils_strcasecmp((unsigned char*)ptr->string, (unsigned char*)ptr->next->string) < 0)) - { - /* Test for list sorted. */ - ptr = ptr->next; - } - if (!ptr || !ptr->next) - { - /* Leave sorted lists unmodified. 
*/ - return list; - } + patches = cJSON_CreateArray(); + create_patches(patches, (const unsigned char*)"", from, to, true); - /* reset ptr to the beginning */ - ptr = list; - while (ptr) + return patches; +} + +CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON * const object) +{ + sort_object(object, false); +} + +CJSON_PUBLIC(void) cJSONUtils_SortObjectCaseSensitive(cJSON * const object) +{ + sort_object(object, true); +} + +static cJSON *merge_patch(cJSON *target, const cJSON * const patch, const cJSON_bool case_sensitive) +{ + cJSON *patch_child = NULL; + + if (!cJSON_IsObject(patch)) { - /* Walk two pointers to find the middle. */ - second = second->next; - ptr = ptr->next; - /* advances ptr two steps at a time */ - if (ptr) - { - ptr = ptr->next; - } + /* scalar value, array or NULL, just duplicate */ + cJSON_Delete(target); + return cJSON_Duplicate(patch, 1); } - if (second && second->prev) + + if (!cJSON_IsObject(target)) { - /* Split the lists */ - second->prev->next = NULL; + cJSON_Delete(target); + target = cJSON_CreateObject(); } - /* Recursively sort the sub-lists. */ - first = cJSONUtils_SortList(first); - second = cJSONUtils_SortList(second); - list = ptr = NULL; - - while (first && second) /* Merge the sub-lists */ + patch_child = patch->child; + while (patch_child != NULL) { - if (cJSONUtils_strcasecmp((unsigned char*)first->string, (unsigned char*)second->string) < 0) + if (cJSON_IsNull(patch_child)) { - if (!list) + /* NULL is the indicator to remove a value, see RFC7396 */ + if (case_sensitive) { - /* start merged list with the first element of the first list */ - list = ptr = first; + cJSON_DeleteItemFromObjectCaseSensitive(target, patch_child->string); } else { - /* add first element of first list to merged list */ - ptr->next = first; - first->prev = ptr; - ptr = first; + cJSON_DeleteItemFromObject(target, patch_child->string); } - first = first->next; } else { - if (!list) + cJSON *replace_me = NULL; + cJSON *replacement = NULL; + + if (case_sensitive) { - /* start merged list with the first element of the second list */ - list = ptr = second; + replace_me = cJSON_DetachItemFromObjectCaseSensitive(target, patch_child->string); } else { - /* add first element of second list to merged list */ - ptr->next = second; - second->prev = ptr; - ptr = second; + replace_me = cJSON_DetachItemFromObject(target, patch_child->string); } - second = second->next; - } - } - if (first) - { - /* Append rest of first list. 
*/ - if (!list) - { - return first; - } - ptr->next = first; - first->prev = ptr; - } - if (second) - { - /* Append rest of second list */ - if (!list) - { - return second; + + replacement = merge_patch(replace_me, patch_child, case_sensitive); + if (replacement == NULL) + { + return NULL; + } + + cJSON_AddItemToObject(target, patch_child->string, replacement); } - ptr->next = second; - second->prev = ptr; + patch_child = patch_child->next; } - - return list; + return target; } -CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON *object) +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, const cJSON * const patch) { - object->child = cJSONUtils_SortList(object->child); + return merge_patch(target, patch, false); } -CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, cJSON *patch) +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatchCaseSensitive(cJSON *target, const cJSON * const patch) { - if (!patch || ((patch->type & 0xFF) != cJSON_Object)) - { - /* scalar value, array or NULL, just duplicate */ - cJSON_Delete(target); - return cJSON_Duplicate(patch, 1); - } - - if (!target || ((target->type & 0xFF) != cJSON_Object)) - { - cJSON_Delete(target); - target = cJSON_CreateObject(); - } - - patch = patch->child; - while (patch) - { - if ((patch->type & 0xFF) == cJSON_NULL) - { - /* NULL is the indicator to remove a value, see RFC7396 */ - cJSON_DeleteItemFromObject(target, patch->string); - } - else - { - cJSON *replaceme = cJSON_DetachItemFromObject(target, patch->string); - cJSON_AddItemToObject(target, patch->string, cJSONUtils_MergePatch(replaceme, patch)); - } - patch = patch->next; - } - return target; + return merge_patch(target, patch, true); } -CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON *from, cJSON *to) +static cJSON *generate_merge_patch(cJSON * const from, cJSON * const to, const cJSON_bool case_sensitive) { + cJSON *from_child = NULL; + cJSON *to_child = NULL; cJSON *patch = NULL; - if (!to) + if (to == NULL) { /* patch to delete everything */ return cJSON_CreateNull(); } - if (((to->type & 0xFF) != cJSON_Object) || !from || ((from->type & 0xFF) != cJSON_Object)) + if (!cJSON_IsObject(to) || !cJSON_IsObject(from)) { return cJSON_Duplicate(to, 1); } - cJSONUtils_SortObject(from); - cJSONUtils_SortObject(to); + sort_object(from, case_sensitive); + sort_object(to, case_sensitive); - from = from->child; - to = to->child; + from_child = from->child; + to_child = to->child; patch = cJSON_CreateObject(); - while (from || to) + while (from_child || to_child) { - int compare = from ? (to ? 
strcmp(from->string, to->string) : -1) : 1; - if (compare < 0) + int diff; + if (from_child != NULL) + { + if (to_child != NULL) + { + diff = strcmp(from_child->string, to_child->string); + } + else + { + diff = -1; + } + } + else + { + diff = 1; + } + + if (diff < 0) { /* from has a value that to doesn't have -> remove */ - cJSON_AddItemToObject(patch, from->string, cJSON_CreateNull()); - from = from->next; + cJSON_AddItemToObject(patch, from_child->string, cJSON_CreateNull()); + + from_child = from_child->next; } - else if (compare > 0) + else if (diff > 0) { /* to has a value that from doesn't have -> add to patch */ - cJSON_AddItemToObject(patch, to->string, cJSON_Duplicate(to, 1)); - to = to->next; + cJSON_AddItemToObject(patch, to_child->string, cJSON_Duplicate(to_child, 1)); + + to_child = to_child->next; } else { /* object key exists in both objects */ - if (cJSONUtils_Compare(from, to)) + if (!compare_json(from_child, to_child, case_sensitive)) { /* not identical --> generate a patch */ - cJSON_AddItemToObject(patch, to->string, cJSONUtils_GenerateMergePatch(from, to)); + cJSON_AddItemToObject(patch, to_child->string, cJSONUtils_GenerateMergePatch(from_child, to_child)); } + /* next key in the object */ - from = from->next; - to = to->next; + from_child = from_child->next; + to_child = to_child->next; } } - if (!patch->child) + if (patch->child == NULL) { + /* no patch generated */ cJSON_Delete(patch); return NULL; } return patch; } + +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON * const from, cJSON * const to) +{ + return generate_merge_patch(from, to, false); +} + +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatchCaseSensitive(cJSON * const from, cJSON * const to) +{ + return generate_merge_patch(from, to, true); +} diff --git a/src/include/private/switch_core_pvt.h b/src/include/private/switch_core_pvt.h index 0f46b8471dc..c3478f109e7 100644 --- a/src/include/private/switch_core_pvt.h +++ b/src/include/private/switch_core_pvt.h @@ -314,6 +314,7 @@ struct switch_runtime { int events_use_dispatch; uint32_t port_alloc_flags; char *event_channel_key_separator; + uint32_t max_audio_channels; }; extern struct switch_runtime runtime; diff --git a/src/include/switch_cJSON.h b/src/include/switch_cJSON.h index 37366f9e287..514251c9b24 100644 --- a/src/include/switch_cJSON.h +++ b/src/include/switch_cJSON.h @@ -1,16 +1,13 @@ /* - Copyright (c) 2009 Dave Gamble - + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE @@ -28,10 +25,55 @@ extern "C" { #endif +#if !defined(__WINDOWS__) && (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32)) +#define __WINDOWS__ +#endif + +#ifdef __WINDOWS__ + +/* When compiling for windows, we specify a specific calling convention to avoid issues where we are being called from a project with a different default calling convention. For windows you have 3 define options: +CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever dllexport symbols +CJSON_EXPORT_SYMBOLS - Define this on library build when you want to dllexport symbols (default) +CJSON_IMPORT_SYMBOLS - Define this if you want to dllimport symbol +For *nix builds that support visibility attribute, you can define similar behavior by +setting default visibility to hidden by adding +-fvisibility=hidden (for gcc) +or +-xldscope=hidden (for sun cc) +to CFLAGS +then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way CJSON_EXPORT_SYMBOLS does +*/ + +#define CJSON_CDECL __cdecl +#define CJSON_STDCALL __stdcall + +/* export symbols by default, this is necessary for copy pasting the C and header file */ +#if !defined(CJSON_HIDE_SYMBOLS) && !defined(CJSON_IMPORT_SYMBOLS) && !defined(CJSON_EXPORT_SYMBOLS) +#define CJSON_EXPORT_SYMBOLS +#endif + +#if defined(CJSON_HIDE_SYMBOLS) +#define CJSON_PUBLIC(type) type CJSON_STDCALL +#elif defined(CJSON_EXPORT_SYMBOLS) +#define CJSON_PUBLIC(type) __declspec(dllexport) type CJSON_STDCALL +#elif defined(CJSON_IMPORT_SYMBOLS) +#define CJSON_PUBLIC(type) __declspec(dllimport) type CJSON_STDCALL +#endif +#else /* !__WINDOWS__ */ +#define CJSON_CDECL +#define CJSON_STDCALL + +#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined (__SUNPRO_C)) && defined(CJSON_API_VISIBILITY) +#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type +#else +#define CJSON_PUBLIC(type) type +#endif +#endif + /* project version */ #define CJSON_VERSION_MAJOR 1 -#define CJSON_VERSION_MINOR 3 -#define CJSON_VERSION_PATCH 0 +#define CJSON_VERSION_MINOR 7 +#define CJSON_VERSION_PATCH 12 #include @@ -63,7 +105,7 @@ typedef struct cJSON /* The item's string, if type==cJSON_String and type == cJSON_Raw */ char *valuestring; - /* The item's number, if type==cJSON_Number */ + /* writing to valueint is DEPRECATED, use cJSON_SetNumberValue instead */ int valueint; /* The item's number, if type==cJSON_Number */ double valuedouble; @@ -74,45 +116,17 @@ typedef struct cJSON typedef struct cJSON_Hooks { - void *(*malloc_fn)(size_t sz); - void (*free_fn)(void *ptr); + /* malloc/free are CDECL on Windows regardless of the default calling convention of the compiler, so ensure the hooks allow passing those functions directly. */ + void *(CJSON_CDECL *malloc_fn)(size_t sz); + void (CJSON_CDECL *free_fn)(void *ptr); } cJSON_Hooks; -#if !defined(__WINDOWS__) && (defined(WIN32) || defined(WIN64) || defined(_MSC_VER) || defined(_WIN32)) -#define __WINDOWS__ -#endif -#ifdef __WINDOWS__ - -/* When compiling for windows, we specify a specific calling convention to avoid issues where we are being called from a project with a different default calling convention. 
For windows you have 2 define options: - -CJSON_HIDE_SYMBOLS - Define this in the case where you don't want to ever dllexport symbols -CJSON_EXPORT_SYMBOLS - Define this on library build when you want to dllexport symbols - -For *nix builds that support visibility attribute, you can define similar behavior by - -setting default visibility to hidden by adding --fvisibility=hidden (for gcc) -or --xldscope=hidden (for sun cc) -to CFLAGS - -then using the CJSON_API_VISIBILITY flag to "export" the same symbols the way CJSON_EXPORT_SYMBOLS does - -*/ +typedef int cJSON_bool; -#if defined(CJSON_HIDE_SYMBOLS) -#define CJSON_PUBLIC(type) type __stdcall -#elif defined(CJSON_EXPORT_SYMBOLS) -#define CJSON_PUBLIC(type) __declspec(dllexport) type __stdcall -#else -#define CJSON_PUBLIC(type) __declspec(dllimport) type __stdcall -#endif -#else /* !WIN32 */ -#if (defined(__GNUC__) || defined(__SUNPRO_CC) || defined (__SUNPRO_C)) && defined(CJSON_API_VISIBILITY) -#define CJSON_PUBLIC(type) __attribute__((visibility("default"))) type -#else -#define CJSON_PUBLIC(type) type -#endif +/* Limits how deeply nested arrays/objects can be before cJSON rejects to parse them. + * This is to prevent stack overflows. */ +#ifndef CJSON_NESTING_LIMIT +#define CJSON_NESTING_LIMIT 1000 #endif /* returns the version of cJSON as a string */ @@ -121,36 +135,56 @@ CJSON_PUBLIC(const char*) cJSON_Version(void); /* Supply malloc, realloc and free functions to cJSON */ CJSON_PUBLIC(void) cJSON_InitHooks(cJSON_Hooks* hooks); - -/* Supply a block of JSON, and this returns a cJSON object you can interrogate. Call cJSON_Delete when finished. */ +/* Memory Management: the caller is always responsible to free the results from all variants of cJSON_Parse (with cJSON_Delete) and cJSON_Print (with stdlib free, cJSON_Hooks.free_fn, or cJSON_free as appropriate). The exception is cJSON_PrintPreallocated, where the caller has full responsibility of the buffer. */ +/* Supply a block of JSON, and this returns a cJSON object you can interrogate. */ CJSON_PUBLIC(cJSON *) cJSON_Parse(const char *value); -/* Render a cJSON entity to text for transfer/storage. Free the char* when finished. */ +/* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed. */ +/* If you supply a ptr in return_parse_end and parsing fails, then return_parse_end will contain a pointer to the error so will match cJSON_GetErrorPtr(). */ +CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, cJSON_bool require_null_terminated); + +/* Render a cJSON entity to text for transfer/storage. */ CJSON_PUBLIC(char *) cJSON_Print(const cJSON *item); -/* Render a cJSON entity to text for transfer/storage without any formatting. Free the char* when finished. */ +/* Render a cJSON entity to text for transfer/storage without any formatting. */ CJSON_PUBLIC(char *) cJSON_PrintUnformatted(const cJSON *item); /* Render a cJSON entity to text using a buffered strategy. prebuffer is a guess at the final size. guessing well reduces reallocation. fmt=0 gives unformatted, =1 gives formatted */ -CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, int fmt); -/* Render a cJSON entity to text using a buffer already allocated in memory with length buf_len. Returns 1 on success and 0 on failure. 
*/ -CJSON_PUBLIC(int) cJSON_PrintPreallocated(cJSON *item, char *buf, const int len, const int fmt); +CJSON_PUBLIC(char *) cJSON_PrintBuffered(const cJSON *item, int prebuffer, cJSON_bool fmt); +/* Render a cJSON entity to text using a buffer already allocated in memory with given length. Returns 1 on success and 0 on failure. */ +/* NOTE: cJSON is not always 100% accurate in estimating how much memory it will use, so to be safe allocate 5 bytes more than you actually need */ +CJSON_PUBLIC(cJSON_bool) cJSON_PrintPreallocated(cJSON *item, char *buffer, const int length, const cJSON_bool format); /* Delete a cJSON entity and all subentities. */ CJSON_PUBLIC(void) cJSON_Delete(cJSON *c); /* Returns the number of items in an array (or object). */ CJSON_PUBLIC(int) cJSON_GetArraySize(const cJSON *array); -/* Retrieve item number "item" from array "array". Returns NULL if unsuccessful. */ -CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int item); +/* Retrieve item number "index" from array "array". Returns NULL if unsuccessful. */ +CJSON_PUBLIC(cJSON *) cJSON_GetArrayItem(const cJSON *array, int index); /* Get item "string" from object. Case insensitive. */ -CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON *object, const char *string); +CJSON_PUBLIC(cJSON *) cJSON_GetObjectItem(const cJSON * const object, const char * const string); CJSON_PUBLIC(cJSON *) cJSON_GetObjectItemCaseSensitive(const cJSON * const object, const char * const string); -CJSON_PUBLIC(int) cJSON_HasObjectItem(const cJSON *object, const char *string); +CJSON_PUBLIC(cJSON_bool) cJSON_HasObjectItem(const cJSON *object, const char *string); /* For analysing failed parses. This returns a pointer to the parse error. You'll probably need to look a few chars back to make sense of it. Defined when cJSON_Parse() returns 0. 0 when cJSON_Parse() succeeds. */ CJSON_PUBLIC(const char *) cJSON_GetErrorPtr(void); +/* Check if the item is a string and return its valuestring */ +CJSON_PUBLIC(char *) cJSON_GetStringValue(cJSON *item); + +/* These functions check the type of an item */ +CJSON_PUBLIC(cJSON_bool) cJSON_IsInvalid(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsFalse(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsTrue(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsBool(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsNull(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsNumber(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsString(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsArray(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsObject(const cJSON * const item); +CJSON_PUBLIC(cJSON_bool) cJSON_IsRaw(const cJSON * const item); + /* These calls create a cJSON item of the appropriate type. 
*/ CJSON_PUBLIC(cJSON *) cJSON_CreateNull(void); CJSON_PUBLIC(cJSON *) cJSON_CreateTrue(void); CJSON_PUBLIC(cJSON *) cJSON_CreateFalse(void); -CJSON_PUBLIC(cJSON *) cJSON_CreateBool(int b); +CJSON_PUBLIC(cJSON *) cJSON_CreateBool(cJSON_bool boolean); CJSON_PUBLIC(cJSON *) cJSON_CreateNumber(double num); CJSON_PUBLIC(cJSON *) cJSON_CreateString(const char *string); /* raw json */ @@ -158,6 +192,14 @@ CJSON_PUBLIC(cJSON *) cJSON_CreateRaw(const char *raw); CJSON_PUBLIC(cJSON *) cJSON_CreateArray(void); CJSON_PUBLIC(cJSON *) cJSON_CreateObject(void); +/* Create a string where valuestring references a string so + * it will not be freed by cJSON_Delete */ +CJSON_PUBLIC(cJSON *) cJSON_CreateStringReference(const char *string); +/* Create an object/arrray that only references it's elements so + * they will not be freed by cJSON_Delete */ +CJSON_PUBLIC(cJSON *) cJSON_CreateObjectReference(const cJSON *child); +CJSON_PUBLIC(cJSON *) cJSON_CreateArrayReference(const cJSON *child); + /* These utilities create an Array of count items. */ CJSON_PUBLIC(cJSON *) cJSON_CreateIntArray(const int *numbers, int count); CJSON_PUBLIC(cJSON *) cJSON_CreateFloatArray(const float *numbers, int count); @@ -176,36 +218,44 @@ CJSON_PUBLIC(void) cJSON_AddItemReferenceToArray(cJSON *array, cJSON *item); CJSON_PUBLIC(void) cJSON_AddItemReferenceToObject(cJSON *object, const char *string, cJSON *item); /* Remove/Detatch items from Arrays/Objects. */ +CJSON_PUBLIC(cJSON *) cJSON_DetachItemViaPointer(cJSON *parent, cJSON * const item); CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromArray(cJSON *array, int which); CJSON_PUBLIC(void) cJSON_DeleteItemFromArray(cJSON *array, int which); CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObject(cJSON *object, const char *string); +CJSON_PUBLIC(cJSON *) cJSON_DetachItemFromObjectCaseSensitive(cJSON *object, const char *string); CJSON_PUBLIC(void) cJSON_DeleteItemFromObject(cJSON *object, const char *string); +CJSON_PUBLIC(void) cJSON_DeleteItemFromObjectCaseSensitive(cJSON *object, const char *string); /* Update array items. */ CJSON_PUBLIC(void) cJSON_InsertItemInArray(cJSON *array, int which, cJSON *newitem); /* Shifts pre-existing items to the right. */ +CJSON_PUBLIC(cJSON_bool) cJSON_ReplaceItemViaPointer(cJSON * const parent, cJSON * const item, cJSON * replacement); CJSON_PUBLIC(void) cJSON_ReplaceItemInArray(cJSON *array, int which, cJSON *newitem); CJSON_PUBLIC(void) cJSON_ReplaceItemInObject(cJSON *object,const char *string,cJSON *newitem); +CJSON_PUBLIC(void) cJSON_ReplaceItemInObjectCaseSensitive(cJSON *object,const char *string,cJSON *newitem); /* Duplicate a cJSON item */ -CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, int recurse); +CJSON_PUBLIC(cJSON *) cJSON_Duplicate(const cJSON *item, cJSON_bool recurse); /* Duplicate will create a new, identical cJSON item to the one you pass, in new memory that will need to be released. With recurse!=0, it will duplicate any children connected to the item. The item->next and ->prev pointers are always zero on return from Duplicate. */ +/* Recursively compare two cJSON items for equality. If either a or b is NULL or invalid, they will be considered unequal. + * case_sensitive determines if object keys are treated case sensitive (1) or case insensitive (0) */ +CJSON_PUBLIC(cJSON_bool) cJSON_Compare(const cJSON * const a, const cJSON * const b, const cJSON_bool case_sensitive); -/* ParseWithOpts allows you to require (and check) that the JSON is null terminated, and to retrieve the pointer to the final byte parsed. 
*/ -/* If you supply a ptr in return_parse_end and parsing fails, then return_parse_end will contain a pointer to the error. If not, then cJSON_GetErrorPtr() does the job. */ -CJSON_PUBLIC(cJSON *) cJSON_ParseWithOpts(const char *value, const char **return_parse_end, int require_null_terminated); CJSON_PUBLIC(void) cJSON_Minify(char *json); -/* Macros for creating things quickly. */ -#define cJSON_AddNullToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateNull()) -#define cJSON_AddTrueToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateTrue()) -#define cJSON_AddFalseToObject(object,name) cJSON_AddItemToObject(object, name, cJSON_CreateFalse()) -#define cJSON_AddBoolToObject(object,name,b) cJSON_AddItemToObject(object, name, cJSON_CreateBool(b)) -#define cJSON_AddNumberToObject(object,name,n) cJSON_AddItemToObject(object, name, cJSON_CreateNumber(n)) -#define cJSON_AddStringToObject(object,name,s) cJSON_AddItemToObject(object, name, cJSON_CreateString(s)) -#define cJSON_AddRawToObject(object,name,s) cJSON_AddItemToObject(object, name, cJSON_CreateRaw(s)) +/* Helper functions for creating and adding items to an object at the same time. + * They return the added item or NULL on failure. */ +CJSON_PUBLIC(cJSON*) cJSON_AddNullToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddTrueToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddFalseToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddBoolToObject(cJSON * const object, const char * const name, const cJSON_bool boolean); +CJSON_PUBLIC(cJSON*) cJSON_AddNumberToObject(cJSON * const object, const char * const name, const double number); +CJSON_PUBLIC(cJSON*) cJSON_AddStringToObject(cJSON * const object, const char * const name, const char * const string); +CJSON_PUBLIC(cJSON*) cJSON_AddRawToObject(cJSON * const object, const char * const name, const char * const raw); +CJSON_PUBLIC(cJSON*) cJSON_AddObjectToObject(cJSON * const object, const char * const name); +CJSON_PUBLIC(cJSON*) cJSON_AddArrayToObject(cJSON * const object, const char * const name); /* When assigning an integer value, it needs to be propagated to valuedouble too. */ #define cJSON_SetIntValue(object, number) ((object) ? (object)->valueint = (object)->valuedouble = (number) : (number)) @@ -213,9 +263,13 @@ CJSON_PUBLIC(void) cJSON_Minify(char *json); CJSON_PUBLIC(double) cJSON_SetNumberHelper(cJSON *object, double number); #define cJSON_SetNumberValue(object, number) ((object != NULL) ? cJSON_SetNumberHelper(object, (double)number) : (number)) -/* Macro for iterating over an array */ +/* Macro for iterating over an array or object */ #define cJSON_ArrayForEach(element, array) for(element = (array != NULL) ? 
(array)->child : NULL; element != NULL; element = element->next) +/* malloc/free objects using the malloc/free functions that have been set with cJSON_InitHooks */ +CJSON_PUBLIC(void *) cJSON_malloc(size_t size); +CJSON_PUBLIC(void) cJSON_free(void *object); + #ifdef __cplusplus } #endif diff --git a/src/include/switch_cJSON_Utils.h b/src/include/switch_cJSON_Utils.h index 358145e4c26..4336c566964 100644 --- a/src/include/switch_cJSON_Utils.h +++ b/src/include/switch_cJSON_Utils.h @@ -1,14 +1,45 @@ +/* + Copyright (c) 2009-2017 Dave Gamble and cJSON contributors + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. +*/ + +#ifndef cJSON_Utils__h +#define cJSON_Utils__h + +#ifdef __cplusplus +extern "C" +{ +#endif + #include "switch_cJSON.h" /* Implement RFC6901 (https://tools.ietf.org/html/rfc6901) JSON Pointer spec. */ -CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON *object, const char *pointer); +CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointer(cJSON * const object, const char *pointer); +CJSON_PUBLIC(cJSON *) cJSONUtils_GetPointerCaseSensitive(cJSON * const object, const char *pointer); /* Implement RFC6902 (https://tools.ietf.org/html/rfc6902) JSON Patch spec. */ -CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON *from, cJSON *to); +/* NOTE: This modifies objects in 'from' and 'to' by sorting the elements by their key */ +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatches(cJSON * const from, cJSON * const to); +CJSON_PUBLIC(cJSON *) cJSONUtils_GeneratePatchesCaseSensitive(cJSON * const from, cJSON * const to); /* Utility for generating patch array entries. */ -CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON *array, const char *op, const char *path, cJSON *val); +CJSON_PUBLIC(void) cJSONUtils_AddPatchToArray(cJSON * const array, const char * const operation, const char * const path, const cJSON * const value); /* Returns 0 for success. */ -CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON *object, cJSON *patches); +CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON * const object, const cJSON * const patches); +CJSON_PUBLIC(int) cJSONUtils_ApplyPatchesCaseSensitive(cJSON * const object, const cJSON * const patches); /* // Note that ApplyPatches is NOT atomic on failure. To implement an atomic ApplyPatches, use: @@ -33,12 +64,22 @@ CJSON_PUBLIC(int) cJSONUtils_ApplyPatches(cJSON *object, cJSON *patches); /* Implement RFC7386 (https://tools.ietf.org/html/rfc7396) JSON Merge Patch spec. */ /* target will be modified by patch. return value is new ptr for target. 
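Since the old target pointer may have been modified or freed by the call, the safe pattern is to always continue with the returned pointer; a minimal sketch (the "/codec/name" pointer path is illustrative):

    #include "switch_cJSON_Utils.h"

    static cJSON *apply_merge_patch_sketch(cJSON *target, const cJSON *patch)
    {
        /* RFC 7396 merge patch: do not use the old target pointer afterwards */
        target = cJSONUtils_MergePatch(target, patch);
        return target;
    }

    static cJSON *pointer_lookup_sketch(cJSON *root)
    {
        /* RFC 6901 lookup; the CaseSensitive variant is the new addition here */
        return cJSONUtils_GetPointerCaseSensitive(root, "/codec/name");
    }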
*/ -CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, cJSON *patch); +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatch(cJSON *target, const cJSON * const patch); +CJSON_PUBLIC(cJSON *) cJSONUtils_MergePatchCaseSensitive(cJSON *target, const cJSON * const patch); /* generates a patch to move from -> to */ -CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON *from, cJSON *to); +/* NOTE: This modifies objects in 'from' and 'to' by sorting the elements by their key */ +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatch(cJSON * const from, cJSON * const to); +CJSON_PUBLIC(cJSON *) cJSONUtils_GenerateMergePatchCaseSensitive(cJSON * const from, cJSON * const to); /* Given a root object and a target object, construct a pointer from one to the other. */ -CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(cJSON *object, cJSON *target); +CJSON_PUBLIC(char *) cJSONUtils_FindPointerFromObjectTo(const cJSON * const object, const cJSON * const target); /* Sorts the members of the object into alphabetical order. */ -CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON *object); +CJSON_PUBLIC(void) cJSONUtils_SortObject(cJSON * const object); +CJSON_PUBLIC(void) cJSONUtils_SortObjectCaseSensitive(cJSON * const object); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/include/switch_core.h b/src/include/switch_core.h index 8255e13c858..cad1e0cb17f 100644 --- a/src/include/switch_core.h +++ b/src/include/switch_core.h @@ -2702,6 +2702,7 @@ SWITCH_DECLARE(const char *) switch_core_banner(void); SWITCH_DECLARE(switch_bool_t) switch_core_session_in_thread(switch_core_session_t *session); SWITCH_DECLARE(uint32_t) switch_default_ptime(const char *name, uint32_t number); SWITCH_DECLARE(uint32_t) switch_default_rate(const char *name, uint32_t number); +SWITCH_DECLARE(uint32_t) switch_core_max_audio_channels(uint32_t limit); /*! \brief Add user registration diff --git a/src/include/switch_image.h b/src/include/switch_image.h index d6d3166d2ff..83cb19126aa 100644 --- a/src/include/switch_image.h +++ b/src/include/switch_image.h @@ -12,8 +12,8 @@ * \brief Describes the vpx image descriptor and associated operations * */ -#ifndef VPX_VPX_IMAGE_H_ -#define VPX_VPX_IMAGE_H_ +#ifndef VPX_VPX_VPX_IMAGE_H_ +#define VPX_VPX_VPX_IMAGE_H_ #ifdef __cplusplus extern "C" { @@ -167,21 +167,21 @@ vpx_image_t *vpx_img_alloc(vpx_image_t *img, vpx_img_fmt_t fmt, * storage for descriptor has been allocated elsewhere, and a descriptor is * desired to "wrap" that storage. * - * \param[in] img Pointer to storage for descriptor. If this parameter - * is NULL, the storage for the descriptor will be - * allocated on the heap. - * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image - * \param[in] align Alignment, in bytes, of each row in the image. - * \param[in] img_data Storage to use for the image + * \param[in] img Pointer to storage for descriptor. If this + * parameter is NULL, the storage for the descriptor + * will be allocated on the heap. + * \param[in] fmt Format for the image + * \param[in] d_w Width of the image + * \param[in] d_h Height of the image + * \param[in] stride_align Alignment, in bytes, of each row in the image. + * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be * returned. 
*/ vpx_image_t *vpx_img_wrap(vpx_image_t *img, vpx_img_fmt_t fmt, unsigned int d_w, - unsigned int d_h, unsigned int align, + unsigned int d_h, unsigned int stride_align, unsigned char *img_data); /*!\brief Set the rectangle identifying the displayed portion of the image @@ -221,4 +221,4 @@ void vpx_img_free(vpx_image_t *img); } // extern "C" #endif -#endif // VPX_VPX_IMAGE_H_ +#endif // VPX_VPX_VPX_IMAGE_H_ diff --git a/src/include/switch_ivr.h b/src/include/switch_ivr.h index 39e915edc53..34d6625541b 100644 --- a/src/include/switch_ivr.h +++ b/src/include/switch_ivr.h @@ -559,6 +559,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_bridge_bleg(switch_core_session_t *se */ SWITCH_DECLARE(switch_status_t) switch_ivr_signal_bridge(switch_core_session_t *session, switch_core_session_t *peer_session); +SWITCH_DECLARE(void) switch_ivr_check_hold(switch_core_session_t *session); + + /*! \brief Transfer an existing session to another location \param session the session to transfer diff --git a/src/include/switch_rtp.h b/src/include/switch_rtp.h index f9d116a6720..96ea2135f37 100644 --- a/src/include/switch_rtp.h +++ b/src/include/switch_rtp.h @@ -593,6 +593,7 @@ SWITCH_DECLARE(switch_status_t) switch_rtp_ack_bitrate(switch_rtp_t *rtp_session SWITCH_DECLARE(void) switch_rtp_video_refresh(switch_rtp_t *rtp_session); SWITCH_DECLARE(void) switch_rtp_video_loss(switch_rtp_t *rtp_session); +SWITCH_DECLARE(switch_core_session_t*) switch_rtp_get_core_session(switch_rtp_t *rtp_session); /*! \} */ diff --git a/src/include/switch_utils.h b/src/include/switch_utils.h index 21d4ae38ff8..1c409759083 100644 --- a/src/include/switch_utils.h +++ b/src/include/switch_utils.h @@ -1338,6 +1338,12 @@ SWITCH_DECLARE(void) switch_split_time(const char *exp, int *hour, int *min, int */ SWITCH_DECLARE(int) switch_split_user_domain(char *in, char **user, char **domain); +SWITCH_DECLARE(void *) switch_calloc(size_t nmemb, size_t size); + +#ifdef __clang_analyzer__ +#define calloc switch_calloc +#endif + /* malloc or DIE macros */ #ifdef NDEBUG #define switch_malloc(ptr, len) (void)( (!!(ptr = malloc(len))) || (fprintf(stderr,"ABORT! 
Malloc failure at: %s:%d", __FILE__, __LINE__),abort(), 0), ptr ) @@ -1452,6 +1458,8 @@ SWITCH_DECLARE(void) switch_getcputime(switch_cputime *t); SWITCH_DECLARE(char *)switch_html_strip(const char *str); +SWITCH_DECLARE(unsigned long) switch_getpid(void); + SWITCH_END_EXTERN_C #endif /* For Emacs: diff --git a/src/include/switch_xml.h b/src/include/switch_xml.h index 25aa2002c52..fab9d981efa 100644 --- a/src/include/switch_xml.h +++ b/src/include/switch_xml.h @@ -233,7 +233,7 @@ SWITCH_DECLARE(char *) switch_xml_tohtml_ex(_In_ switch_xml_t xml, _In_ switch_b ///\param prn_header add header too ///\param use_utf8_encoding encoding into ampersand entities for UTF-8 chars ///\return the xml text string -#define switch_xml_toxml_buf(xml, buf, buflen, offset, prn_header) switch_xml_toxml_buf(xml, buf, buflen, offset, prn_header, USE_UTF_8_ENCODING); +#define switch_xml_toxml_buf(xml, buf, buflen, offset, prn_header) switch_xml_toxml_buf_ex(xml, buf, buflen, offset, prn_header, USE_UTF_8_ENCODING); SWITCH_DECLARE(char *) switch_xml_toxml_buf_ex(_In_ switch_xml_t xml, _In_z_ char *buf, _In_ switch_size_t buflen, _In_ switch_size_t offset, _In_ switch_bool_t prn_header, switch_bool_t use_utf8_encoding); diff --git a/src/include/test/switch_test.h b/src/include/test/switch_test.h index 4e7fcfcd8ac..d343ead1b26 100644 --- a/src/include/test/switch_test.h +++ b/src/include/test/switch_test.h @@ -66,16 +66,23 @@ static char *fst_getenv_default(const char *env, char *default_value, switch_boo /** * initialize FS core from optional configuration dir */ -static switch_status_t fst_init_core_and_modload(const char *confdir, const char *basedir, int minimal) +static switch_status_t fst_init_core_and_modload(const char *confdir, const char *basedir, int minimal, switch_core_flag_t flags) { switch_status_t status; const char *err; + unsigned long pid = switch_getpid(); // Let FreeSWITCH core pick these //SWITCH_GLOBAL_dirs.base_dir = strdup("/usr/local/freeswitch"); //SWITCH_GLOBAL_dirs.mod_dir = strdup("/usr/local/freeswitch/mod"); //SWITCH_GLOBAL_dirs.lib_dir = strdup("/usr/local/freeswitch/lib"); //SWITCH_GLOBAL_dirs.temp_dir = strdup("/tmp"); +#ifdef SWITCH_TEST_BASE_DIR_OVERRIDE + basedir = SWITCH_TEST_BASE_DIR_OVERRIDE; +#else +#define SWITCH_TEST_BASE_DIR_OVERRIDE "." 
+#endif + if (zstr(basedir)) { basedir = "."; } @@ -95,12 +102,12 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char SWITCH_GLOBAL_dirs.conf_dir = switch_mprintf("%s%sconf", basedir, SWITCH_PATH_SEPARATOR); } - SWITCH_GLOBAL_dirs.log_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); + SWITCH_GLOBAL_dirs.log_dir = switch_mprintf("%s%s%lu%s", basedir, SWITCH_PATH_SEPARATOR, pid, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.run_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.recordings_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.sounds_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.cache_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); - SWITCH_GLOBAL_dirs.db_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); + SWITCH_GLOBAL_dirs.db_dir = switch_mprintf("%s%s%lu%s", basedir, SWITCH_PATH_SEPARATOR, pid, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.script_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.htdocs_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); SWITCH_GLOBAL_dirs.grammar_dir = switch_mprintf("%s%s", basedir, SWITCH_PATH_SEPARATOR); @@ -113,7 +120,7 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char switch_core_set_globals(); if (!minimal) { - status = switch_core_init_and_modload(SCF_USE_SQL, SWITCH_TRUE, &err); + status = switch_core_init_and_modload(flags, SWITCH_TRUE, &err); switch_sleep(1 * 1000000); switch_core_set_variable("sound_prefix", "." SWITCH_PATH_SEPARATOR); if (status != SWITCH_STATUS_SUCCESS && err) { @@ -256,7 +263,7 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char int fst_timer_started = 0; \ fst_getenv_default("FST_SUPPRESS_UNUSED_STATIC_WARNING", NULL, SWITCH_FALSE); \ if (fst_core) { \ - fst_init_core_and_modload(NULL, NULL, 0); /* shuts up compiler */ \ + fst_init_core_and_modload(NULL, NULL, 0, 0); /* shuts up compiler */ \ } \ { \ @@ -274,7 +281,7 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char * Define the beginning of a freeswitch core test driver. Only one per test application allowed. 
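For orientation, a complete driver built on these macros looks roughly like the sketch below, using the FST_CORE_DB_BEGIN variant added here so the core loads with SCF_USE_SQL (the suite and test names are illustrative; with the per-PID log_dir/db_dir change above, each test process keeps its logs and databases in a directory named after its pid):

    #include <test/switch_test.h>

    /* minimal test application sketch; only one FST_CORE_*_BEGIN is allowed per binary */
    FST_CORE_DB_BEGIN("./conf")
    {
        FST_SUITE_BEGIN(example_suite)
        {
            FST_SETUP_BEGIN()
            {
            }
            FST_SETUP_END()

            FST_TEARDOWN_BEGIN()
            {
            }
            FST_TEARDOWN_END()

            FST_TEST_BEGIN(sanity)
            {
                fst_check(1 == 1); /* trivially true assertion */
            }
            FST_TEST_END()
        }
        FST_SUITE_END()
    }
    FST_CORE_END()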
* @param confdir directory containing freeswitch.xml configuration */ -#define FST_CORE_BEGIN(confdir) \ +#define FST_CORE_EX_BEGIN(confdir, flags) \ FCT_BGN() \ { \ int fst_core = 0; \ @@ -283,7 +290,7 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char switch_memory_pool_t *fst_pool = NULL; \ int fst_timer_started = 0; \ fst_getenv_default("FST_SUPPRESS_UNUSED_STATIC_WARNING", NULL, SWITCH_FALSE); \ - if (fst_init_core_and_modload(confdir, confdir, 0) == SWITCH_STATUS_SUCCESS) { \ + if (fst_init_core_and_modload(confdir, confdir, 0, flags) == SWITCH_STATUS_SUCCESS) { \ fst_core = 2; \ } else { \ fprintf(stderr, "Failed to load FS core\n"); \ @@ -305,6 +312,9 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char } \ FCT_END() +#define FST_CORE_BEGIN(confdir) FST_CORE_EX_BEGIN(confdir, 0) +#define FST_CORE_DB_BEGIN(confdir) FST_CORE_EX_BEGIN(confdir, SCF_USE_SQL) + /** * Minimal FS core load */ @@ -317,7 +327,7 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char switch_memory_pool_t *fst_pool = NULL; \ int fst_timer_started = 0; \ fst_getenv_default("FST_SUPPRESS_UNUSED_STATIC_WARNING", NULL, SWITCH_FALSE); \ - if (fst_init_core_and_modload(confdir, NULL, 1) == SWITCH_STATUS_SUCCESS) { /* minimal load */ \ + if (fst_init_core_and_modload(confdir, NULL, 1, 0) == SWITCH_STATUS_SUCCESS) { /* minimal load */ \ fst_core = 1; \ } else { \ fprintf(stderr, "Failed to load FS core\n"); \ @@ -347,7 +357,9 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char const char *fst_test_module = #modname; \ if (fst_core && !zstr(fst_test_module)) { \ const char *err; \ - switch_loadable_module_load_module((char *)"../.libs", (char *)fst_test_module, SWITCH_TRUE, &err); \ + char path[1024]; \ + sprintf(path, "%s%s%s", SWITCH_TEST_BASE_DIR_OVERRIDE, SWITCH_PATH_SEPARATOR, "../.libs/"); \ + switch_loadable_module_load_module((char *)path, (char *)fst_test_module, SWITCH_TRUE, &err); \ } \ FCT_FIXTURE_SUITE_BGN(suite); #endif @@ -368,7 +380,9 @@ static switch_status_t fst_init_core_and_modload(const char *confdir, const char FCT_FIXTURE_SUITE_END(); \ if (!zstr(fst_test_module) && switch_loadable_module_exists(fst_test_module) == SWITCH_STATUS_SUCCESS) { \ const char *err; \ - switch_loadable_module_unload_module((char *)"../.libs", (char *)fst_test_module, SWITCH_FALSE, &err); \ + char path[1024]; \ + sprintf(path, "%s%s%s", SWITCH_TEST_BASE_DIR_OVERRIDE, SWITCH_PATH_SEPARATOR, "../.libs/"); \ + switch_loadable_module_unload_module((char*)path, (char *)fst_test_module, SWITCH_FALSE, &err); \ } \ } #endif @@ -849,7 +863,7 @@ cJSON *varname = NULL; \ int fd = open(file, O_RDONLY); \ fst_requires(fd >= 0); \ fstat(fd, &s); \ - buf = malloc(s.st_size + 1); \ + switch_zmalloc(buf, s.st_size + 1); \ fst_requires(buf); \ size = read(fd, buf, s.st_size); \ fst_requires(size == s.st_size); \ diff --git a/src/mod/Makefile.am b/src/mod/Makefile.am index b4f3eb5b0a3..9bbf5b7c402 100644 --- a/src/mod/Makefile.am +++ b/src/mod/Makefile.am @@ -4,6 +4,8 @@ all: $(OUR_MODULES) clean: $(OUR_CLEAN_MODULES) $(OUR_DISABLED_CLEAN_MODULES) install: $(OUR_INSTALL_MODULES) uninstall: $(OUR_UNINSTALL_MODULES) $(OUR_DISABLED_UNINSTALL_MODULES) +print_tests: $(OUR_TEST_MODULES) +check: $(OUR_CHECK_MODULES) mod_skypopen-all: mod_gsmopen-all mod_gsmopen-all: mod_spandsp-all @@ -11,7 +13,7 @@ mod_unimrcp-all: mod_sofia-all mod_rayo-all: mod_dingaling-all mod_ssml-all: mod_rayo-all -$(OUR_MODULES) $(OUR_CLEAN_MODULES) 
$(OUR_INSTALL_MODULES) $(OUR_UNINSTALL_MODULES) $(OUR_DISABLED_MODULES) $(OUR_DISABLED_CLEAN_MODULES) $(OUR_DISABLED_INSTALL_MODULES) $(OUR_DISABLED_UNINSTALL_MODULES): +$(OUR_MODULES) $(OUR_CLEAN_MODULES) $(OUR_INSTALL_MODULES) $(OUR_UNINSTALL_MODULES) $(OUR_DISABLED_MODULES) $(OUR_DISABLED_CLEAN_MODULES) $(OUR_DISABLED_INSTALL_MODULES) $(OUR_DISABLED_UNINSTALL_MODULES) $(OUR_TEST_MODULES) $(OUR_CHECK_MODULES): @set fnord $$MAKEFLAGS; amf=$$2; \ target=`echo $@ | sed -e 's|^.*-||'`; \ modname=`echo $@ | sed -e 's|-.*||' | sed -e 's|^.*/||'`; \ @@ -33,8 +35,7 @@ $(OUR_MODULES) $(OUR_CLEAN_MODULES) $(OUR_INSTALL_MODULES) $(OUR_UNINSTALL_MODUL if test -z "$$target" ; then target="all" ; fi ; \ if ! test -f $$moddir/$$modname.c && ! test -f $$moddir/$$modname.cpp && test $$modname != "mod_com_g729" ; \ then echo ; echo "WARNING $$modname is not a valid FreeSWITCH module dir, skipping it..." ; else \ - echo ;\ - echo making $$target $$modname ;\ + if test "$$target" != "print_tests" ; then echo; echo making $$target $$modname ; fi;\ test -d "$$buildmoddir" || mkdir -p $$buildmoddir ; \ (if test -f "$$moddir/Makefile" ; then \ test -f "$$buildmoddir/Makefile" || cp $$moddir/Makefile $$buildmoddir/Makefile ; \ diff --git a/src/mod/applications/mod_abstraction/mod_abstraction.c b/src/mod/applications/mod_abstraction/mod_abstraction.c index bb24ca4c75b..11144462ecf 100644 --- a/src/mod/applications/mod_abstraction/mod_abstraction.c +++ b/src/mod/applications/mod_abstraction/mod_abstraction.c @@ -63,7 +63,7 @@ SWITCH_STANDARD_API(api_abstraction_function) const char *destination = switch_xml_attr_soft(x_api, "destination"); const char *arguments = switch_xml_attr_soft(x_api, "argument"); - int proceed = 0; + int proceed; switch_regex_t *re = NULL; int ovector[30]; @@ -75,7 +75,6 @@ SWITCH_STANDARD_API(api_abstraction_function) uint32_t len = (uint32_t) (strlen(cmd) + strlen(arguments) + 10) * proceed; if (!(substituted = malloc(len))) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_CRIT, "Memory Error!\n"); - proceed = 0; goto end; } memset(substituted, 0, len); diff --git a/src/mod/applications/mod_av/Makefile.am b/src/mod/applications/mod_av/Makefile.am index 70efce9122f..e697d9ec930 100644 --- a/src/mod/applications/mod_av/Makefile.am +++ b/src/mod/applications/mod_av/Makefile.am @@ -23,13 +23,15 @@ mod_av_la_LDFLAGS = -avoid-version -module -no-undefined -shared -lm -lz noinst_PROGRAMS = test/test_mod_av test/test_avformat -AM_CFLAGS = $(SWITCH_AM_CFLAGS) -I../ $(AVFORMAT_CFLAGS) $(AVCODEC_CFLAGS) $(SWSCALE_CFLAGS) $(AVUTIL_CFLAGS) $(RESAMPLE_CFLAGS) -AM_LDFLAGS = $(AVFORMAT_LIBS) $(AVCODEC_LIBS) $(SWSCALE_LIBS) $(AVUTIL_LIBS) $(RESAMPLE_LIBS) -avoid-version -no-undefined $(SWITCH_AM_LDFLAGS) -TESTS = $(noinst_PROGRAMS) - +test_test_mod_av_CFLAGS = $(SWITCH_AM_CFLAGS) -I../ -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" $(AVFORMAT_CFLAGS) $(AVCODEC_CFLAGS) $(SWSCALE_CFLAGS) $(AVUTIL_CFLAGS) $(RESAMPLE_CFLAGS) +test_test_mod_av_LDFLAGS = $(AVFORMAT_LIBS) $(AVCODEC_LIBS) $(SWSCALE_LIBS) $(AVUTIL_LIBS) $(RESAMPLE_LIBS) -avoid-version -no-undefined $(SWITCH_AM_LDFLAGS) test_test_mod_av_LDADD = libavmod.la $(switch_builddir)/libfreeswitch.la +test_test_avformat_CFLAGS = $(SWITCH_AM_CFLAGS) -I../ -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" $(AVFORMAT_CFLAGS) $(AVCODEC_CFLAGS) $(SWSCALE_CFLAGS) $(AVUTIL_CFLAGS) $(RESAMPLE_CFLAGS) 
+test_test_avformat_LDFLAGS = $(AVFORMAT_LIBS) $(AVCODEC_LIBS) $(SWSCALE_LIBS) $(AVUTIL_LIBS) $(RESAMPLE_LIBS) -avoid-version -no-undefined $(SWITCH_AM_LDFLAGS)
 test_test_avformat_LDADD = libavmod.la $(switch_builddir)/libfreeswitch.la
+TESTS = $(noinst_PROGRAMS)
+
 else
 install: error
 all: error
diff --git a/src/mod/applications/mod_av/avformat.c b/src/mod/applications/mod_av/avformat.c
index 2e0c7de7ab6..b76fd599bce 100644
--- a/src/mod/applications/mod_av/avformat.c
+++ b/src/mod/applications/mod_av/avformat.c
@@ -49,7 +49,7 @@ GCC_DIAG_OFF(deprecated-declarations)
 #define swr_get_out_samples avresample_get_out_samples
 #define swr_get_out_samples avresample_get_out_samples
 #define swr_convert(ctx, odata, osamples, idata, isamples) \
-	avresample_convert(ctx, odata, osamples, 0, (uint8_t **)idata, isamples, 0)
+	avresample_convert(ctx, odata, 0, osamples, (uint8_t **)idata, 0, isamples)
 #else
 #include <libswresample/swresample.h>
 #endif
@@ -1113,7 +1113,7 @@ static switch_status_t open_input_file(av_file_context_t *context, switch_file_h
 	/** Open the input file to read from it. */
 	if ((error = avformat_open_input(&context->fc, filename, NULL, NULL)) < 0) {
 		char ebuf[255] = "";
-		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not open input file '%s' (error '%s')\n", filename, get_error_text(error, ebuf, sizeof(ebuf)));
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Could not open input file '%s' (error '%s')\n", filename, get_error_text(error, ebuf, sizeof(ebuf)));
 		switch_goto_status(SWITCH_STATUS_FALSE, err);
 	}
@@ -1123,7 +1123,7 @@ static switch_status_t open_input_file(av_file_context_t *context, switch_file_h
 	/** Get information on the input file (number of streams etc.). */
 	if ((error = avformat_find_stream_info(context->fc, opts ? &opts : NULL)) < 0) {
 		char ebuf[255] = "";
-		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not open find stream info (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf)));
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Could not open find stream info (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf)));
 		if (opts) av_dict_free(&opts);
 		switch_goto_status(SWITCH_STATUS_FALSE, err);
 	}
@@ -1194,7 +1194,7 @@ GCC_DIAG_ON(deprecated-declarations)
 	// printf("has audio:%d has_video:%d\n", context->has_audio, context->has_video);
 	if ((!context->has_audio) && (!context->has_video)) {
-		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Neither audio nor video stream found in file %s\n", filename);
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Neither audio nor video stream found in file %s\n", filename);
 		switch_goto_status(SWITCH_STATUS_FALSE, err);
 	}
@@ -1378,7 +1378,7 @@ GCC_DIAG_ON(deprecated-declarations)
 			pkt.stream_index = context->video_st.st->index;
 		} else {
 			char ebuf[255] = "";
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not read frame (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf)));
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Could not read frame (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf)));
 			break;
 		}
 	}
@@ -1400,7 +1400,7 @@ GCC_DIAG_OFF(deprecated-declarations)
 		if ((error = avcodec_decode_video2(context->video_st.st->codec, vframe, &got_data, &pkt)) < 0) {
 GCC_DIAG_ON(deprecated-declarations)
 			char ebuf[255] = "";
-			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not decode frame (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf)));
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Could not decode
frame (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf))); av_packet_unref(&pkt); av_frame_free(&vframe); break; @@ -1523,7 +1523,7 @@ GCC_DIAG_OFF(deprecated-declarations) if ((error = avcodec_decode_audio4(context->audio_st[0].st->codec, &in_frame, &got_data, &pkt)) < 0) { GCC_DIAG_ON(deprecated-declarations) char ebuf[255] = ""; - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Could not decode frame (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf))); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Could not decode frame (error '%s')\n", get_error_text(error, ebuf, sizeof(ebuf))); av_packet_unref(&pkt); break; } diff --git a/src/mod/applications/mod_av/test/test_avformat.c b/src/mod/applications/mod_av/test/test_avformat.c index c697729e542..af364afd663 100644 --- a/src/mod/applications/mod_av/test/test_avformat.c +++ b/src/mod/applications/mod_av/test/test_avformat.c @@ -44,6 +44,7 @@ FST_CORE_BEGIN("conf") FST_TEST_BEGIN(avformat_test_colorspace_RGB) { + char path[1024]; switch_status_t status; switch_image_t *img = switch_img_alloc(NULL, SWITCH_IMG_FMT_I420, 1280, 720, 1); switch_file_handle_t fh = { 0 }; @@ -57,7 +58,8 @@ FST_CORE_BEGIN("conf") fst_requires(img); - status = switch_core_file_open(&fh, "{colorspace=0}./test_RGB.mp4", 1, 8000, flags, fst_pool); + sprintf(path, "%s%s%s%s", "{colorspace=0}", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../test_RGB.mp4"); + status = switch_core_file_open(&fh, path, 1, 8000, flags, fst_pool); fst_requires(status == SWITCH_STATUS_SUCCESS); fst_requires(switch_test_flag(&fh, SWITCH_FILE_OPEN)); @@ -70,7 +72,8 @@ FST_CORE_BEGIN("conf") status = switch_core_file_write_video(&fh, &frame); fst_check(status == SWITCH_STATUS_SUCCESS); - ccimg = switch_img_read_png("./cluecon.png", SWITCH_IMG_FMT_ARGB); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../cluecon.png"); + ccimg = switch_img_read_png(path, SWITCH_IMG_FMT_ARGB); fst_requires(ccimg); color.a = 255; @@ -101,6 +104,7 @@ FST_CORE_BEGIN("conf") FST_TEST_BEGIN(avformat_test_colorspace_BT7) { + char path[1024]; switch_status_t status; switch_image_t *img = switch_img_alloc(NULL, SWITCH_IMG_FMT_I420, 1280, 720, 1); switch_file_handle_t fh = { 0 }; @@ -114,7 +118,8 @@ FST_CORE_BEGIN("conf") fst_requires(img); - status = switch_core_file_open(&fh, "{colorspace=1}./test_BT7.mp4", 1, 8000, flags, fst_pool); + sprintf(path, "%s%s%s%s", "{colorspace=1}", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../test_BT7.mp4"); + status = switch_core_file_open(&fh, path, 1, 8000, flags, fst_pool); fst_requires(status == SWITCH_STATUS_SUCCESS); fst_requires(switch_test_flag(&fh, SWITCH_FILE_OPEN)); @@ -127,7 +132,8 @@ FST_CORE_BEGIN("conf") status = switch_core_file_write_video(&fh, &frame); fst_check(status == SWITCH_STATUS_SUCCESS); - ccimg = switch_img_read_png("./cluecon.png", SWITCH_IMG_FMT_ARGB); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../cluecon.png"); + ccimg = switch_img_read_png(path, SWITCH_IMG_FMT_ARGB); fst_requires(ccimg); color.a = 255; diff --git a/src/mod/applications/mod_av/test/test_mod_av.c b/src/mod/applications/mod_av/test/test_mod_av.c index 0910c56973c..a058aa63fd6 100644 --- a/src/mod/applications/mod_av/test/test_mod_av.c +++ b/src/mod/applications/mod_av/test/test_mod_av.c @@ -130,6 +130,7 @@ FST_CORE_BEGIN("conf") fst_check(packets > 0); switch_core_codec_destroy(&codec); + switch_img_free(&img); } FST_TEST_END() diff --git 
a/src/mod/applications/mod_avmd/scripts/avmd_get_events.pl b/src/mod/applications/mod_avmd/scripts/avmd_get_events.pl
index 024ee32aa24..80d0caeb6d7 100644
--- a/src/mod/applications/mod_avmd/scripts/avmd_get_events.pl
+++ b/src/mod/applications/mod_avmd/scripts/avmd_get_events.pl
@@ -2,7 +2,7 @@
 #brief Subscribe to avmd events and print them to the console.
-#author Piotr Gregor
+#author Piotr Gregor
 #date 13 Sept 2016 09:44 PM
diff --git a/src/mod/applications/mod_avmd/scripts/avmd_originate.pl b/src/mod/applications/mod_avmd/scripts/avmd_originate.pl
index 28d0cc97df1..9cbc98ea5e6 100644
--- a/src/mod/applications/mod_avmd/scripts/avmd_originate.pl
+++ b/src/mod/applications/mod_avmd/scripts/avmd_originate.pl
@@ -2,7 +2,7 @@
 #brief Call single voicemail and print detection result to the console.
-#author Piotr Gregor
+#author Piotr Gregor
 #date 15 Sept 2016 02:44 PM
@@ -17,7 +17,7 @@ my $pass = "ClueCon";
 my $extension_base = "sofia/internal/1000\@192.168.1.60";
-my $playback = 'local_stream://moh';
+my $playback = "{loops=-1}tone_stream://%(251,0,1004)";
 my $context = 'default';
 my $endpoint;
 my $dest;
@@ -73,7 +73,7 @@ sub test_once {
 	my $uuid = $con->api('create_uuid')->getBody();
 	my ($time_epoch, $time_hires) = Time::HiRes::gettimeofday();
-	printf("Calling with uuid [%s] [%s]...\n", $uuid, POSIX::strftime('%Y-%m-%d %H:%M:%S', localtime($time_epoch)));
+	printf("Calling with uuid [%s] [%s]... [%s]\n", $uuid, POSIX::strftime('%Y-%m-%d %H:%M:%S', localtime($time_epoch)), $originate_string);
 	$con->bgapi(sprintf($originate_string, $uuid));
diff --git a/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple.pl b/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple.pl
index b95da5fe296..223cedc5571 100644
--- a/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple.pl
+++ b/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple.pl
@@ -1,10 +1,10 @@
 #!/usr/bin/perl -w
-#brief Call (possibly) multiple voicemails
-#      and print detection result to the console.
-#author Piotr Gregor
+#brief Make multiple calls. This script can be used to check CPU usage on calls running avmd (see test.txt).
+#author Piotr Gregor
 #date 15 Sept 2016 02:44 PM
+#changed 27th Dec 2019

 use strict;
@@ -22,7 +22,8 @@ my $context = 'default';
 #Example:
 #my $endpoint = "originate {originator_codec=PCMA,origination_uuid=%s}sofia/gateway/box_b/840534002 \&park()";
-my $endpoint = "originate {originator_codec=PCMA,origination_uuid=%s}sofia/gateway/%s/%s \&park()";
+#my $endpoint = "originate {originator_codec=PCMA,origination_uuid=%s}sofia/gateway/%s/%s \&park()";
+my $endpoint;
 my $gateway;
 my $dest;
 my $callerid;
@@ -30,14 +31,13 @@
 my $idx = 0;

-if ($#ARGV + 1 eq 4) {
-	$gateway = $ARGV[0];
-	$dest = $ARGV[1];
-	$callerid = $ARGV[2];
-	$thread_n = $ARGV[3];
-	print "Dialing [" .$thread_n ."] calls simultaneously to [" .$gateway ."][" .$dest ."] as [" .$callerid ."]\n";
+if ($#ARGV + 1 eq 3) {
+	$dest = $ARGV[0];
+	$callerid = $ARGV[1];
+	$thread_n = $ARGV[2];
+	print "Dialing [" .$thread_n ."] calls simultaneously to [" .$dest ."] as [" .$callerid ."]\n";
 } else {
-	die "Please specify gateway, destination number, caller id and number of calls to make\n";
+	die "Please specify destination number, caller id and number of calls to make\n\nExample:\n./avmd_originate_multiple.pl EXTENSION CALLER NUMBER_OF_CALLS";
 }

 my $con = new ESL::ESLconnection($host, $port, $pass);
@@ -60,7 +60,6 @@
 sub call_once {
 	my ($dest, $callerid, $idx) = @_;
-	my $uuid =
 	my $originate_string =
 		'originate ' .
		'{ignore_early_media=true,' .
@@ -82,6 +81,6 @@ sub call_once {
 	my ($time_epoch, $time_hires) = Time::HiRes::gettimeofday();
 	printf("[%s]\tCalling with uuid [%s] [%s]... [%s]\n", $idx + 1, $uuid, POSIX::strftime('%Y-%m-%d %H:%M:%S', localtime($time_epoch)), $originate_string);
-	$con->bgapi(sprintf($originate_string, $uuid, $gateway, $dest));
+	$con->bgapi(sprintf($originate_string, $uuid));
 	$con->api('uuid_setvar ' . $uuid .' execute_on_answer avmd_start');
 }
diff --git a/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple_loopback.pl b/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple_loopback.pl
index 1baf5b2d706..8eb791d13cc 100644
--- a/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple_loopback.pl
+++ b/src/mod/applications/mod_avmd/scripts/avmd_originate_multiple_loopback.pl
@@ -1,9 +1,8 @@
 #!/usr/bin/perl -w
-#brief Call (possibly) multiple voicemails
-#      and print detection result to the console.
-#author Piotr Gregor
+#brief Call (possibly) multiple voicemails and print detection result to the console.
+#author Piotr Gregor
 #date 15 Sept 2016 02:44 PM
diff --git a/src/mod/applications/mod_avmd/scripts/avmd_test.pl b/src/mod/applications/mod_avmd/scripts/avmd_test.pl
index 6f6ac0f5fdd..c6f63e1c094 100755
--- a/src/mod/applications/mod_avmd/scripts/avmd_test.pl
+++ b/src/mod/applications/mod_avmd/scripts/avmd_test.pl
@@ -3,7 +3,7 @@
 #brief Test module avmd by calling voicemails from avmd test suite
 #      and print detection results to the console.
-#author Piotr Gregor
+#author Piotr Gregor
 #details If you are testing serving voicemails from dialplan then avmd
 #        must be set to inbound mode, either globally (by avmd set inbound
 #        in fs_cli) or in dialplan settings (
+#date 27th Dec 2019
+
+Test dialplan
+=============
+
+(XML for the avmd1 and avmd2 test extensions elided)
+
+Test machine
+============
+
+CPU is an 8-core Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz.
+
+
+Test results
+============
+
+Results were obtained from the output of top (running in multiple-core mode, so the maximum CPU value for this 8-core CPU is 800%).
+
+When running the command
+./avmd_originate_multiple.pl avmd1 piotr X
+FreeSWITCH starts two legs for each call (and avmd is configured to start on both of them),
+so the "show calls" command reports 2*X calls: if I run ./avmd_originate_multiple.pl avmd1 piotr 10
+and then do "show calls" in fs_cli, it displays "20 total", as it counts both the "-a" and "-b" loopback legs.
+
+Likewise, for test extension "avmd2", where the avmd_start app is executed, FreeSWITCH starts two legs for each call and avmd runs on the inbound leg.
+
+The table below uses X as the number of calls (so "show calls" in fs_cli would actually report twice the number shown in the top row).
+
+ext/#calls     10     50    100    150    200
+avmd1         10%    26%    50%    63%    78%
+avmd2        103%   257%   382%   510%   760% (GUI lagging)
+
+Results are the %CPU values reported by the "top" command (in the sample below that would be 63.3%):
+
+top -p `pidof freeswitch`
+
+top - 02:26:45 up 13 days, 13:48, 1 user, load average: 0.62, 0.39, 0.34
+Tasks: 1 total, 0 running, 1 sleeping, 0 stopped, 0 zombie
+%Cpu(s): 7.9 us, 0.6 sy, 0.0 ni, 89.8 id, 0.0 wa, 0.0 hi, 1.7 si, 0.0 st
+MiB Mem : 24034.2 total, 4879.0 free, 11671.8 used, 7483.4 buff/cache
+MiB Swap: 24502.0 total, 24502.0 free, 0.0 used.
11040.5 avail Mem
+
+  PID USER     PR  NI    VIRT    RES    SHR S  %CPU  %MEM     TIME+ COMMAND
+ 8765 root     -2 -10 4370088 115872  20780 S  63.3   0.5   1:42.73 freeswitch
+
+
+Test script (for the up-to-date version see avmd_originate_multiple.pl)
+===========
+
+#!/usr/bin/perl -w
+
+
+#brief Make multiple calls. This script can be used to check CPU usage on calls running avmd (see test.txt).
+#author Piotr Gregor
+#date 15 Sept 2016 02:44 PM
+#changed 27th Dec 2019
+
+
+use strict;
+use warnings;
+require ESL;
+use POSIX;
+use Time::HiRes;
+
+my $host = "127.0.0.1";
+my $port = "8021";
+my $pass = "ClueCon";
+my $extension_base = "sofia/internal/1000\@192.168.1.1";
+
+my $playback = 'local_stream://moh';
+my $context = 'default';
+#Example:
+#my $endpoint = "originate {originator_codec=PCMA,origination_uuid=%s}sofia/gateway/box_b/840534002 \&park()";
+#my $endpoint = "originate {originator_codec=PCMA,origination_uuid=%s}sofia/gateway/%s/%s \&park()";
+my $endpoint;
+my $gateway;
+my $dest;
+my $callerid;
+my $thread_n;
+my $idx = 0;
+
+
+if ($#ARGV + 1 eq 3) {
+	$dest = $ARGV[0];
+	$callerid = $ARGV[1];
+	$thread_n = $ARGV[2];
+	print "Dialing [" .$thread_n ."] calls simultaneously to [" .$dest ."] as [" .$callerid ."]\n";
+} else {
+	die "Please specify destination number, caller id and number of calls to make\n";
+}
+
+my $con = new ESL::ESLconnection($host, $port, $pass);
+if (!$con) {
+	die "Unable to establish connection to $host:$port\n";
+}
+if ($con->connected()) {
+	print "OK, Connected.\n";
+} else {
+	die "Connection failure.\n";
+}
+
+while($con->connected() && ($idx < $thread_n)) {
+	call_once($dest, $callerid, $idx);
+	$idx++;
+	Time::HiRes::sleep(0.11); # avoid switch_core_session.c:2265 Throttle Error! 33, switch_time.c:1227 Over Session Rate of 30!
+}
+
+print "Disconnected.\n\n";
+
+sub call_once {
+	my ($dest, $callerid, $idx) = @_;
+	my $originate_string =
+		'originate ' .
+		'{ignore_early_media=true,' .
+		'originator_codec=PCMA,' .
+		'origination_uuid=%s,' .
+		'originate_timeout=60,' .
+		'origination_caller_id_number=' . $callerid . ',' .
+		'origination_caller_id_name=' . $callerid . '}';
+
+	if(defined($endpoint)) {
+		$originate_string = '';
+		$originate_string .= $endpoint;
+	} else {
+		$originate_string .= 'loopback/' . $dest . '/' . $context;
+		$originate_string .= ' ' . '&playback(' . $playback . ')';
+	}
+
+	my $uuid = $con->api('create_uuid')->getBody();
+	my ($time_epoch, $time_hires) = Time::HiRes::gettimeofday();
+	printf("[%s]\tCalling with uuid [%s] [%s]... [%s]\n", $idx + 1, $uuid, POSIX::strftime('%Y-%m-%d %H:%M:%S', localtime($time_epoch)), $originate_string);
+
+	$con->bgapi(sprintf($originate_string, $uuid));
+	$con->api('uuid_setvar ' . $uuid .'
execute_on_answer avmd_start'); +} + +CPU info +========= +peter@photon:/usr/local/freeswitch/conf/dialplan$ cat /proc/cpuinfo +processor : 0 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 800.008 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 0 +cpu cores : 4 +apicid : 0 +initial apicid : 0 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 1 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 799.969 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 1 +cpu cores : 4 +apicid : 2 +initial apicid : 2 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 2 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 799.992 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 2 +cpu cores : 4 +apicid : 4 +initial apicid : 4 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 3 +vendor_id : GenuineIntel 
+cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 800.021 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 3 +cpu cores : 4 +apicid : 6 +initial apicid : 6 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 4 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 800.038 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 0 +cpu cores : 4 +apicid : 1 +initial apicid : 1 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 5 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 800.241 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 1 +cpu cores : 4 +apicid : 3 +initial apicid : 3 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 6 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 800.199 +cache size : 8192 KB +physical 
id : 0 +siblings : 8 +core id : 2 +cpu cores : 4 +apicid : 5 +initial apicid : 5 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + +processor : 7 +vendor_id : GenuineIntel +cpu family : 6 +model : 60 +model name : Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz +stepping : 3 +microcode : 0x19 +cpu MHz : 800.031 +cache size : 8192 KB +physical id : 0 +siblings : 8 +core id : 3 +cpu cores : 4 +apicid : 7 +initial apicid : 7 +fpu : yes +fpu_exception : yes +cpuid level : 13 +wp : yes +flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm abm cpuid_fault invpcid_single pti tpr_shadow vnmi flexpriority ept vpid ept_ad fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm xsaveopt dtherm ida arat pln pts +bugs : cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs +bogomips : 7999.38 +clflush size : 64 +cache_alignment : 64 +address sizes : 39 bits physical, 48 bits virtual +power management: + diff --git a/src/mod/applications/mod_callcenter/mod_callcenter.c b/src/mod/applications/mod_callcenter/mod_callcenter.c index cdbb9f4361f..f59b7440984 100644 --- a/src/mod/applications/mod_callcenter/mod_callcenter.c +++ b/src/mod/applications/mod_callcenter/mod_callcenter.c @@ -1656,7 +1656,7 @@ static switch_status_t load_config(void) static switch_status_t playback_array(switch_core_session_t *session, const char *str) { switch_status_t status = SWITCH_STATUS_FALSE; if (str && !strncmp(str, "ARRAY::", 7)) { - char *i = (char*) str + 7, *j = i; + char *i = (char*) str + 7, *j; while (1) { if ((j = strstr(i, "::"))) { *j = 0; diff --git a/src/mod/applications/mod_cidlookup/mod_cidlookup.c b/src/mod/applications/mod_cidlookup/mod_cidlookup.c index 5b90e8a095a..7df22fb3ef2 100644 --- a/src/mod/applications/mod_cidlookup/mod_cidlookup.c +++ b/src/mod/applications/mod_cidlookup/mod_cidlookup.c @@ -545,9 +545,6 @@ static cid_data_t *do_lookup(switch_memory_pool_t *pool, switch_event_t *event, cid_data_t *cidtmp = NULL; switch_bool_t save_cache = SWITCH_FALSE; - cid = switch_core_alloc(pool, sizeof(cid_data_t)); - switch_assert(cid); - number = string_digitsonly(pool, num); switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "caller_id_number", number); @@ -555,6 +552,8 @@ static cid_data_t *do_lookup(switch_memory_pool_t *pool, switch_event_t *event, if (globals.odbc_dsn && globals.sql) { name = do_db_lookup(pool, event, number, globals.sql); if (name) { + cid = 
switch_core_alloc(pool, sizeof(cid_data_t)); + switch_assert(cid); cid->name = name; cid->src = "phone_database"; goto done; @@ -578,6 +577,11 @@ static cid_data_t *do_lookup(switch_memory_pool_t *pool, switch_event_t *event, } } + if (!cid) { + cid = switch_core_alloc(pool, sizeof(cid_data_t)); + switch_assert(cid); + } + if (!skipurl && globals.url) { url_query = switch_event_expand_headers(event, globals.url); do_lookup_url(pool, event, &name, url_query, NULL, NULL, 0); @@ -593,10 +597,6 @@ static cid_data_t *do_lookup(switch_memory_pool_t *pool, switch_event_t *event, } done: - if (!cid) { - cid = switch_core_alloc(pool, sizeof(cid_data_t)); - switch_assert(cid); - } /* append area if we can */ if (!cid->area && !skipcitystate && strlen(number) == 11 && number[0] == '1' && globals.odbc_dsn && globals.citystate_sql) { diff --git a/src/mod/applications/mod_conference/Makefile.am b/src/mod/applications/mod_conference/Makefile.am index 581228e743e..2b681351e3a 100644 --- a/src/mod/applications/mod_conference/Makefile.am +++ b/src/mod/applications/mod_conference/Makefile.am @@ -24,12 +24,12 @@ libmodconference_la_CFLAGS = $(AM_CFLAGS) -I. noinst_PROGRAMS = test/test_image test/test_member test_test_image_SOURCES = test/test_image.c -test_test_image_CFLAGS = $(AM_CFLAGS) -I. +test_test_image_CFLAGS = $(AM_CFLAGS) -I. -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" test_test_image_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) test_test_image_LDADD = libmodconference.la test_test_member_SOURCES = test/test_member.c -test_test_member_CFLAGS = $(AM_CFLAGS) -I. +test_test_member_CFLAGS = $(AM_CFLAGS) -I. -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" test_test_member_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) test_test_member_LDADD = libmodconference.la diff --git a/src/mod/applications/mod_conference/conference_event.c b/src/mod/applications/mod_conference/conference_event.c index 6977b8a494d..806dfc95ed2 100644 --- a/src/mod/applications/mod_conference/conference_event.c +++ b/src/mod/applications/mod_conference/conference_event.c @@ -108,7 +108,7 @@ void conference_event_mod_channel_handler(const char *event_channel, cJSON *json } } - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "conf %s CMD %s [%s] %s\n", conference_name, key, action, cid); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "conf %s CMD %s [%s] %s\n", conference_name, key, action ? action : "N/A", cid); if (zstr(action)) { goto end; @@ -1029,17 +1029,18 @@ switch_status_t chat_send(switch_event_t *message_event) conference_list_pretty(conference, &stream); /* provide help */ } else { - return SWITCH_STATUS_SUCCESS; + goto done; } } - switch_safe_free(lbuf); - if (!conference->broadcast_chat_messages) { switch_core_chat_send_args(proto, CONF_CHAT_PROTO, to, hint && strchr(hint, '/') ? 
hint : from, "", stream.data, NULL, NULL, SWITCH_FALSE); } +done: + switch_safe_free(lbuf); switch_safe_free(stream.data); + switch_thread_rwlock_unlock(conference->rwlock); return SWITCH_STATUS_SUCCESS; diff --git a/src/mod/applications/mod_conference/conference_video.c b/src/mod/applications/mod_conference/conference_video.c index 625e2406ecc..357f30a5602 100644 --- a/src/mod/applications/mod_conference/conference_video.c +++ b/src/mod/applications/mod_conference/conference_video.c @@ -4834,7 +4834,7 @@ void conference_video_write_frame(conference_obj_t *conference, conference_membe { conference_member_t *imember; int want_refresh = 0; - unsigned char buf[SWITCH_RTP_MAX_BUF_LEN] = ""; + unsigned char buf[sizeof(switch_rtp_packet_t)] = ""; switch_frame_t tmp_frame = { 0 }; if (switch_test_flag(vid_frame, SFF_CNG) || !vid_frame->packet) { diff --git a/src/mod/applications/mod_conference/test/test_image.c b/src/mod/applications/mod_conference/test/test_image.c index 7e7810a1cf9..3b4556a2f32 100644 --- a/src/mod/applications/mod_conference/test/test_image.c +++ b/src/mod/applications/mod_conference/test/test_image.c @@ -50,32 +50,39 @@ FST_CORE_BEGIN("./conf") FST_TEST_BEGIN(scale_test) { - switch_image_t *img = switch_img_read_png("images/signalwire.png", SWITCH_IMG_FMT_I420); + char path[4096]; + switch_image_t *img; int i; + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../images/signalwire.png"); + img = switch_img_read_png(path, SWITCH_IMG_FMT_I420); + for(i = 2; i <= 10; i += 2) { switch_image_t *scaled_img = NULL; char name[1024]; - switch_snprintf(name, sizeof(name), "images/signalwire-scaled-I420-%d.png", i); + switch_snprintf(name, sizeof(name), "../images/signalwire-scaled-I420-%d.png", i); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, name); switch_img_scale(img, &scaled_img, img->d_w / i, img->d_h / i); fst_requires(scaled_img); - switch_img_write_png(scaled_img, name); + switch_img_write_png(scaled_img, path); switch_img_free(&scaled_img); } switch_img_free(&img); - img = switch_img_read_png("images/signalwire.png", SWITCH_IMG_FMT_ARGB); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../images/signalwire.png"); + img = switch_img_read_png(path, SWITCH_IMG_FMT_ARGB); for(i = 2; i <= 10; i += 2) { switch_image_t *scaled_img = NULL; char name[1024]; - switch_snprintf(name, sizeof(name), "images/signalwire-scaled-ARGB-%d.png", i); + switch_snprintf(name, sizeof(name), "../images/signalwire-scaled-ARGB-%d.png", i); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, name); switch_img_scale(img, &scaled_img, img->d_w / i, img->d_h / i); fst_requires(scaled_img); - switch_img_write_png(scaled_img, name); + switch_img_write_png(scaled_img, path); switch_img_free(&scaled_img); } @@ -86,7 +93,8 @@ FST_CORE_BEGIN("./conf") FST_TEST_BEGIN(scale_test) { - switch_image_t *img = switch_img_read_png("images/signalwire-scaled-ARGB-8.png", SWITCH_IMG_FMT_ARGB); + char path[1024]; + switch_image_t *img; char *font_face = "font/AEH.ttf"; char *fg = "#000000"; char *altfg = "#FFFFFF"; @@ -96,12 +104,18 @@ FST_CORE_BEGIN("./conf") const char *txt = "FEESWITCH ROCKS"; const char *alttxt = "freeswitch"; - switch_img_txt_handle_create(&txthandle, font_face, fg, bg, font_size, 0, NULL); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../images/signalwire-scaled-ARGB-8.png"); + img = switch_img_read_png(path, SWITCH_IMG_FMT_ARGB); + + sprintf(path, 
"%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, font_face); + switch_img_txt_handle_create(&txthandle, path, fg, bg, font_size, 0, NULL); switch_img_txt_handle_render(txthandle, img, 50, 3, txt, NULL, fg, bg, font_size, 0); switch_img_txt_handle_render(txthandle, img, 60, 15, alttxt, NULL, altfg, "#000000", font_size, 0); - switch_img_write_png(img, "images/signalwire-scaled-ARGB-8-txt.png"); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../images/signalwire-scaled-ARGB-8-txt.png"); + switch_img_write_png(img, path); switch_img_free(&img); + switch_img_txt_handle_destroy(&txthandle); } FST_TEST_END() diff --git a/src/mod/applications/mod_conference/test/test_member.c b/src/mod/applications/mod_conference/test/test_member.c index 45e37ea85d6..1df2b850a0b 100644 --- a/src/mod/applications/mod_conference/test/test_member.c +++ b/src/mod/applications/mod_conference/test/test_member.c @@ -52,16 +52,19 @@ FST_CORE_BEGIN("./conf") FST_TEST_BEGIN(member_test) { - const char *logo = "{position=left-bot,text_x=center," - "center_offset=190,text=#000000:transparent:font/AEH.ttf:50:" - "'FREESWITCH ROCKS',alt_text_x=center,alt_center_offset=190," - "alt_text_y=88,alt_text=#ffffff:transparent:font/AEH.ttf:40:" - "'freeswitch'}images/signalwire.png"; + char path[4096]; + char logo[1024]; conference_member_t smember = { 0 }; conference_member_t *member = &smember; switch_image_t *img; int i; + sprintf(logo, "%s%s%s%s%s%s%s", "{position=left-bot,text_x=center," + "center_offset=190,text=#000000:transparent:", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "font/AEH.ttf:50:" + "'FREESWITCH ROCKS',alt_text_x=center,alt_center_offset=190," + "alt_text_y=88,alt_text=#ffffff:transparent:", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "font/AEH.ttf:40:" + "'freeswitch'}"); + sprintf(path, "%s%s%s%s", logo, SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, "../images/signalwire.png"); switch_mutex_init(&member->write_mutex, SWITCH_MUTEX_NESTED, fst_pool); switch_mutex_init(&member->flag_mutex, SWITCH_MUTEX_NESTED, fst_pool); switch_mutex_init(&member->fnode_mutex, SWITCH_MUTEX_NESTED, fst_pool); @@ -70,17 +73,18 @@ FST_CORE_BEGIN("./conf") switch_mutex_init(&member->read_mutex, SWITCH_MUTEX_NESTED, fst_pool); switch_thread_rwlock_create(&member->rwlock, fst_pool); - conference_member_set_logo(member, logo); + conference_member_set_logo(member, path); img = member->video_logo; for(i = 2; i <= 10; i += 2) { switch_image_t *scaled_img = NULL; char name[1024]; - switch_snprintf(name, sizeof(name), "images/logo-signalwire-scaled-%d.png", i); + switch_snprintf(name, sizeof(name), "../images/logo-signalwire-scaled-%d.png", i); + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, name); switch_img_scale(img, &scaled_img, img->d_w / i, img->d_h / i); fst_requires(scaled_img); - switch_img_write_png(scaled_img, name); + switch_img_write_png(scaled_img, path); switch_img_free(&scaled_img); } diff --git a/src/mod/applications/mod_dptools/mod_dptools.c b/src/mod/applications/mod_dptools/mod_dptools.c index 50601be2fca..176b38074c8 100644 --- a/src/mod/applications/mod_dptools/mod_dptools.c +++ b/src/mod/applications/mod_dptools/mod_dptools.c @@ -3868,7 +3868,7 @@ static void pickup_add_session(switch_core_session_t *session, const char *key) key = dup_key; } - node = malloc(sizeof(*node)); + switch_zmalloc(node, sizeof(*node)); switch_assert(node); node->key = strdup(key); node->uuid = strdup(switch_core_session_get_uuid(session)); diff 
--git a/src/mod/applications/mod_fifo/mod_fifo.c b/src/mod/applications/mod_fifo/mod_fifo.c index 1087427b36f..0f9d1b299d1 100644 --- a/src/mod/applications/mod_fifo/mod_fifo.c +++ b/src/mod/applications/mod_fifo/mod_fifo.c @@ -1331,8 +1331,6 @@ static switch_status_t messagehook (switch_core_session_t *session, switch_core_ fifo_execute_sql_queued(&sql, SWITCH_TRUE, SWITCH_FALSE); - epoch_start = (long)switch_epoch_time_now(NULL); - ts = switch_micro_time_now(); switch_time_exp_lt(&tm, ts); epoch_start = (long)switch_epoch_time_now(NULL); @@ -1538,7 +1536,6 @@ static void *SWITCH_THREAD_FUNC outbound_ringall_thread_run(switch_thread_t *thr if (!pop) { for (x = 0; x < MAX_PRI; x++) { - q = node->fifo_list[x]; if (fifo_queue_pop(node->fifo_list[x], &pop_dup, SWITCH_FALSE) == SWITCH_STATUS_SUCCESS && pop_dup) { pop = pop_dup; break; @@ -2113,7 +2110,7 @@ static void *SWITCH_THREAD_FUNC node_thread_run(switch_thread_t *thread, void *o while(node) { int x = 0; - switch_event_t *pop; + switch_event_t *pop = NULL; this_node = node; node = node->next; @@ -2182,7 +2179,6 @@ static void *SWITCH_THREAD_FUNC node_thread_run(switch_thread_t *thread, void *o if (cur_priority == 1 || need_sleep) { switch_yield(1000000); - need_sleep = 0; } } @@ -2567,7 +2563,7 @@ SWITCH_STANDARD_APP(fifo_function) switch_event_t *event = NULL; char date[80] = ""; switch_time_exp_t tm; - switch_time_t ts = switch_micro_time_now(); + switch_time_t ts; switch_size_t retsize; char *list_string; int nlist_count; @@ -3701,8 +3697,6 @@ static int xml_callback(void *pArg, int argc, char **argv, char **columnNames) } } - arg = 0; - if (argv[7]) { if ((etime = atol(argv[7]))) { switch_size_t retsize; @@ -4100,7 +4094,7 @@ static void list_node(fifo_node_t *node, switch_xml_t x_report, int *off, int ve cc_off = xml_outbound(x_fifo, node, "outbound", "member", cc_off, verbose); cc_off = xml_caller(x_fifo, node, "callers", "caller", cc_off, verbose); cc_off = xml_hash(x_fifo, node->consumer_hash, "consumers", "consumer", cc_off, verbose); - cc_off = xml_bridges(x_fifo, node, "bridges", "bridge", cc_off, verbose); + xml_bridges(x_fifo, node, "bridges", "bridge", cc_off, verbose); } void dump_hash(switch_hash_t *hash, switch_stream_handle_t *stream) diff --git a/src/mod/applications/mod_hash/mod_hash.c b/src/mod/applications/mod_hash/mod_hash.c index 1e8ebc56f56..6899922ab64 100644 --- a/src/mod/applications/mod_hash/mod_hash.c +++ b/src/mod/applications/mod_hash/mod_hash.c @@ -246,7 +246,6 @@ SWITCH_HASH_DELETE_FUNC(limit_hash_remote_cleanup_callback) switch_time_t now = (switch_time_t)(intptr_t)pData; if (item->last_update != now) { - free(item); return SWITCH_TRUE; } @@ -763,8 +762,6 @@ limit_remote_t *limit_remote_create(const char *name, const char *host, uint16_t void limit_remote_destroy(limit_remote_t **r) { if (r && *r) { - switch_hash_index_t *hi; - (*r)->state = REMOTE_OFF; if ((*r)->thread) { @@ -775,20 +772,12 @@ void limit_remote_destroy(limit_remote_t **r) switch_thread_rwlock_wrlock((*r)->rwlock); /* Free hashtable data */ - for (hi = switch_core_hash_first((*r)->index); hi; hi = switch_core_hash_next(&hi)) { - void *val; - const void *key; - switch_ssize_t keylen; - switch_core_hash_this(hi, &key, &keylen, &val); - - free(val); - } + switch_core_hash_destroy(&(*r)->index); switch_thread_rwlock_unlock((*r)->rwlock); switch_thread_rwlock_destroy((*r)->rwlock); switch_core_destroy_memory_pool(&((*r)->pool)); - switch_core_hash_destroy(&(*r)->index); *r = NULL; } } @@ -880,8 +869,8 @@ static void *SWITCH_THREAD_FUNC 
limit_remote_thread(switch_thread_t *thread, voi limit_hash_item_t *item; switch_thread_rwlock_wrlock(remote->rwlock); if (!(item = switch_core_hash_find(remote->index, argv[0]))) { - item = malloc(sizeof(*item)); - switch_core_hash_insert(remote->index, argv[0], item); + switch_zmalloc(item, sizeof(*item)); + switch_core_hash_insert_auto_free(remote->index, argv[0], item); } item->total_usage = atoi(argv[1]); item->rate_usage = atoi(argv[2]);
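The mod_hash change just above swaps a bare malloc()/switch_core_hash_insert() pair for switch_zmalloc() plus switch_core_hash_insert_auto_free(), so the hashtable owns its values and limit_remote_destroy() can tear the table down with a single switch_core_hash_destroy() call instead of iterating and freeing by hand. A minimal sketch of that ownership pattern, using only the switch_core_hash calls that appear in the patch; the usage_item_t payload type is a hypothetical stand-in for mod_hash's limit_hash_item_t:

#include <switch.h>

typedef struct {
	int total_usage;
	int rate_usage;
} usage_item_t; /* hypothetical stand-in for mod_hash's limit_hash_item_t */

static void auto_free_hash_sketch(void)
{
	switch_hash_t *index = NULL;
	usage_item_t *item = NULL;

	switch_core_hash_init(&index);

	/* switch_zmalloc() zeroes the allocation and aborts on failure,
	 * so the item starts in a known state */
	switch_zmalloc(item, sizeof(*item));
	item->total_usage = 1;

	/* the hash takes ownership: the value is freed when its key is
	 * deleted or when the whole table is destroyed */
	switch_core_hash_insert_auto_free(index, "some-realm", item);

	/* no iterate-and-free loop needed on shutdown */
	switch_core_hash_destroy(&index);
}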
diff --git a/src/mod/applications/mod_httapi/conf/autoload_configs/httapi.conf.xml b/src/mod/applications/mod_httapi/conf/autoload_configs/httapi.conf.xml index e6435667034..1aa4377c580 100644 --- a/src/mod/applications/mod_httapi/conf/autoload_configs/httapi.conf.xml +++ b/src/mod/applications/mod_httapi/conf/autoload_configs/httapi.conf.xml @@ -105,6 +105,8 @@

[Extraction damage: the XML payload of this httapi.conf.xml hunk, and an entire follow-on XML configuration diff (hunks @@ -21,633 +20,662 @@ through @@ -1126,10 +1196,10 @@, whose own diff header was lost as well), were stripped together with their angle-bracket markup, leaving only bare +/- markers. The only fragments that survive, value="variable_endpoint_disposition|variable_originate_disposition|#FAIL" and value="Application-Response|variable_originate_disposition|variable_endpoint_disposition|#NONE", suggest the second file is a mod_kazoo event-field definition config (possibly kazoo.conf.xml); the XML changes themselves are not recoverable from this copy.]

diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_api.c b/src/mod/event_handlers/mod_kazoo/kazoo_api.c index 955ff5c64ef..96f9ff7bb43 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_api.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_api.c @@ -42,11 +42,13 @@ #define API_COMMAND_OPTION 4 #define API_NODE_OPTION_FRAMING 0 -#define API_NODE_OPTION_LEGACY 1 +#define API_NODE_OPTION_KEEPALIVE 1 +#define API_NODE_OPTION_LEGACY 2 #define API_NODE_OPTION_MAX 99 static const char *node_runtime_options[] = { "event-stream-framing", + "event-stream-keepalive", "enable-legacy", NULL }; @@ -69,6 +71,10 @@ static switch_status_t api_get_node_option(ei_node_t *ei_node, switch_stream_han stream->write_function(stream, "+OK %i", ei_node->event_stream_framing); break; + case API_NODE_OPTION_KEEPALIVE: + stream->write_function(stream, "+OK %i", ei_node->event_stream_keepalive); + break; + case API_NODE_OPTION_LEGACY: stream->write_function(stream, "+OK %s", ei_node->legacy ? "true" : "false"); break; @@ -98,6 +104,12 @@ static switch_status_t api_set_node_option(ei_node_t *ei_node, switch_stream_han } break; + case API_NODE_OPTION_KEEPALIVE: + val = switch_true(value); + stream->write_function(stream, "+OK %i", val); + ei_node->event_stream_keepalive = val; + break; + case API_NODE_OPTION_LEGACY: ei_node->legacy = switch_true(value); stream->write_function(stream, "+OK %s", ei_node->legacy ? "true" : "false"); @@ -175,7 +187,6 @@ static switch_status_t api_erlang_event_filter(switch_stream_handle_t *stream) { if (++column > 2) { stream->write_function(stream, "\n"); - column = 0; } while(kazoo_globals.kazoo_var_prefixes[idx] != NULL) { @@ -264,9 +275,19 @@ static switch_status_t handle_node_api_event_stream(ei_event_stream_t *event_str stream->write_function(stream, "%s:%d -> disconnected\n" ,ip_addr, port); } else { - stream->write_function(stream, "%s:%d -> %s:%d\n" + unsigned int year, day, hour, min, sec, delta; + + delta = (switch_micro_time_now() - event_stream->connected_time) / 1000000; + sec = delta % 60; + min = delta / 60 % 60; + hour = delta / 3600 % 24; + day = delta / 86400 % 7; + year = delta / 31556926 % 12; + + stream->write_function(stream, "%s:%d -> %s:%d for %d years, %d days, %d hours, %d minutes, %d seconds\n" ,event_stream->local_ip, event_stream->local_port - ,event_stream->remote_ip, event_stream->remote_port); + ,event_stream->remote_ip, event_stream->remote_port + ,year, day, hour, min, sec); } binding = event_stream->bindings;

diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_cdr.c b/src/mod/event_handlers/mod_kazoo/kazoo_cdr.c new file mode 100644 index 00000000000..f7b0d53df61 --- /dev/null +++ b/src/mod/event_handlers/mod_kazoo/kazoo_cdr.c @@ -0,0 +1,630 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2012, Anthony Minessale II <anthm@freeswitch.org> + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * + * The Initial Developer of the Original Code is + * Anthony Minessale II <anthm@freeswitch.org> + * Portions created by the Initial Developer are Copyright (C) + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * + * Luis Azedo + * + * kazoo_cdr.c -- cdr generation via state handlers + * + */ +#include "mod_kazoo.h" + +#define MY_EVENT_JSON_CDR "KZ_CDR" + +#define maybe_add_json_string(_json, _name, _string) \ + if (!zstr(_string)) cJSON_AddItemToObject(_json, _name, cJSON_CreateString((char *)_string)) + +static void kz_switch_ivr_set_json_profile_data(cJSON *json, switch_caller_profile_t *caller_profile) +{ + cJSON *soft = NULL; + profile_node_t *pn = NULL; + + maybe_add_json_string(json, "Username", caller_profile->username); + maybe_add_json_string(json, "Dialplan", caller_profile->dialplan); + maybe_add_json_string(json, "ANI", caller_profile->ani); + maybe_add_json_string(json, "ANIII", caller_profile->aniii); + maybe_add_json_string(json, "Caller-ID-Name", caller_profile->caller_id_name); + maybe_add_json_string(json, "Caller-ID-Number", caller_profile->caller_id_number); + maybe_add_json_string(json, "Caller-ID-Original-Name", caller_profile->orig_caller_id_name); + maybe_add_json_string(json, "Caller-ID-Original-Number", caller_profile->orig_caller_id_number); + maybe_add_json_string(json, "Network-Address", caller_profile->network_addr); + maybe_add_json_string(json, "RDNIS", caller_profile->rdnis); + maybe_add_json_string(json, "Destination-Number", caller_profile->destination_number); + maybe_add_json_string(json, "Callee-ID-Name", caller_profile->callee_id_name); + maybe_add_json_string(json, "Callee-ID-Number", caller_profile->callee_id_number); + maybe_add_json_string(json, "UUID", caller_profile->uuid); + maybe_add_json_string(json, "Source", caller_profile->source); + maybe_add_json_string(json, "Context", caller_profile->context); + maybe_add_json_string(json, "Channel-Name", caller_profile->chan_name); + maybe_add_json_string(json, "Profile-UUID", caller_profile->uuid_str); + maybe_add_json_string(json, "Profile-Clone-Of", caller_profile->clone_of); + maybe_add_json_string(json, "Transfer-Source", caller_profile->transfer_source); + cJSON_AddItemToObject(json, "Direction", cJSON_CreateString(caller_profile->direction == SWITCH_CALL_DIRECTION_OUTBOUND ? "outbound" : "inbound")); + cJSON_AddItemToObject(json, "Logical-Direction", cJSON_CreateString(caller_profile->logical_direction == SWITCH_CALL_DIRECTION_OUTBOUND ? "outbound" : "inbound")); + + soft = cJSON_CreateObject(); + for (pn = caller_profile->soft; pn; pn = pn->next) { + maybe_add_json_string(soft, pn->var, pn->val); + } + + cJSON_AddItemToObject(json, "Directory", soft); +} + +SWITCH_DECLARE(void) kz_switch_ivr_set_json_call_flaws(cJSON *json, switch_core_session_t *session, switch_media_type_t type) +{ + const char *name = (type == SWITCH_MEDIA_TYPE_VIDEO) ?
"Video" : "Audio"; + cJSON *j_stat; + switch_rtp_stats_t *stats = switch_core_media_get_stats(session, type, NULL); + + if (!stats) return; + + if (!stats->inbound.error_log && !stats->outbound.error_log) return; + + j_stat = cJSON_CreateObject(); + cJSON_AddItemToObject(json, name, j_stat); + + if (stats->inbound.error_log) { + cJSON *j_err_log, *j_err, *j_in; + switch_error_period_t *ep; + + j_in = cJSON_CreateObject(); + cJSON_AddItemToObject(j_stat, "Inbound", j_in); + + j_err_log = cJSON_CreateArray(); + cJSON_AddItemToObject(j_in, "Error-Log", j_err_log); + + for(ep = stats->inbound.error_log; ep; ep = ep->next) { + + if (!(ep->start && ep->stop)) continue; + + j_err = cJSON_CreateObject(); + + cJSON_AddItemToObject(j_err, "Start", cJSON_CreateNumber(ep->start)); + cJSON_AddItemToObject(j_err, "Stop", cJSON_CreateNumber(ep->stop)); + cJSON_AddItemToObject(j_err, "Flaws", cJSON_CreateNumber(ep->flaws)); + cJSON_AddItemToObject(j_err, "Consecutive-Flaws", cJSON_CreateNumber(ep->consecutive_flaws)); + cJSON_AddItemToObject(j_err, "Duration-MS", cJSON_CreateNumber((ep->stop - ep->start) / 1000)); + cJSON_AddItemToArray(j_err_log, j_err); + } + } + + if (stats->outbound.error_log) { + cJSON *j_err_log, *j_err, *j_out; + switch_error_period_t *ep; + + j_out = cJSON_CreateObject(); + cJSON_AddItemToObject(j_stat, "Outbound", j_out); + + j_err_log = cJSON_CreateArray(); + cJSON_AddItemToObject(j_out, "Error-Log", j_err_log); + + for(ep = stats->outbound.error_log; ep; ep = ep->next) { + + if (!(ep->start && ep->stop)) continue; + + j_err = cJSON_CreateObject(); + + cJSON_AddItemToObject(j_err, "Start", cJSON_CreateNumber(ep->start)); + cJSON_AddItemToObject(j_err, "Stop", cJSON_CreateNumber(ep->stop)); + cJSON_AddItemToObject(j_err, "Flaws", cJSON_CreateNumber(ep->flaws)); + cJSON_AddItemToObject(j_err, "Consecutive-Flaws", cJSON_CreateNumber(ep->consecutive_flaws)); + cJSON_AddItemToObject(j_err, "Duration-MS", cJSON_CreateNumber((ep->stop - ep->start) / 1000)); + cJSON_AddItemToArray(j_err_log, j_err); + } + } +} + +#define add_jstat(_j, _i, _s) \ + switch_snprintf(var_val, sizeof(var_val), "%" SWITCH_SIZE_T_FMT, _i); \ + cJSON_AddItemToObject(_j, _s, cJSON_CreateNumber(_i)) + +SWITCH_DECLARE(void) kz_switch_ivr_set_json_call_stats(cJSON *json, switch_core_session_t *session, switch_media_type_t type) +{ + const char *name = (type == SWITCH_MEDIA_TYPE_VIDEO) ? 
"Video" : "Audio"; + cJSON *j_stat, *j_in, *j_out; + switch_rtp_stats_t *stats = switch_core_media_get_stats(session, type, NULL); + char var_val[35] = ""; + + if (!stats) return; + + j_stat = cJSON_CreateObject(); + j_in = cJSON_CreateObject(); + j_out = cJSON_CreateObject(); + + cJSON_AddItemToObject(json, name, j_stat); + cJSON_AddItemToObject(j_stat, "Inbound", j_in); + cJSON_AddItemToObject(j_stat, "Outbound", j_out); + + stats->inbound.std_deviation = sqrt(stats->inbound.variance); + + add_jstat(j_in, stats->inbound.raw_bytes, "Raw-Bytes"); + add_jstat(j_in, stats->inbound.media_bytes, "Media-Bytes"); + add_jstat(j_in, stats->inbound.packet_count, "Packet-Count"); + add_jstat(j_in, stats->inbound.media_packet_count, "Media-Packet-Count"); + add_jstat(j_in, stats->inbound.skip_packet_count, "Skip-Packet-Count"); + add_jstat(j_in, stats->inbound.jb_packet_count, "Jitter-Packet-Count"); + add_jstat(j_in, stats->inbound.dtmf_packet_count, "DTMF-Packet-Count"); + add_jstat(j_in, stats->inbound.cng_packet_count, "CNG-Packet-Count"); + add_jstat(j_in, stats->inbound.flush_packet_count, "Flush-Packet-Count"); + add_jstat(j_in, stats->inbound.largest_jb_size, "Largest-JB-Size"); + add_jstat(j_in, stats->inbound.min_variance, "Jitter-Min-Variance"); + add_jstat(j_in, stats->inbound.max_variance, "Jitter-Max-Variance"); + add_jstat(j_in, stats->inbound.lossrate, "Jitter-Loss-Rate"); + add_jstat(j_in, stats->inbound.burstrate, "Jitter-Burst-Rate"); + add_jstat(j_in, stats->inbound.mean_interval, "Mean-Interval"); + add_jstat(j_in, stats->inbound.flaws, "Flaw-Total"); + add_jstat(j_in, stats->inbound.R, "Quality-Percentage"); + add_jstat(j_in, stats->inbound.mos, "MOS"); + + + add_jstat(j_out, stats->outbound.raw_bytes, "Raw-Bytes"); + add_jstat(j_out, stats->outbound.media_bytes, "Media-Bytes"); + add_jstat(j_out, stats->outbound.packet_count, "Packet-Count"); + add_jstat(j_out, stats->outbound.media_packet_count, "Media-Packet-Count"); + add_jstat(j_out, stats->outbound.skip_packet_count, "Skip-Packet-Count"); + add_jstat(j_out, stats->outbound.dtmf_packet_count, "DTMF-Packet-Count"); + add_jstat(j_out, stats->outbound.cng_packet_count, "CNG-Packet-Count"); + add_jstat(j_out, stats->rtcp.packet_count, "RTCP-Packet-Count"); + add_jstat(j_out, stats->rtcp.octet_count, "RTCP-Octet-Count"); +} + +static switch_status_t kz_report_channel_flaws(switch_core_session_t *session, switch_event_t *cdr_event) +{ + cJSON *callStats = cJSON_CreateObject(); + + kz_switch_ivr_set_json_call_flaws(callStats, session, SWITCH_MEDIA_TYPE_AUDIO); + kz_switch_ivr_set_json_call_flaws(callStats, session, SWITCH_MEDIA_TYPE_VIDEO); + + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, "_json_channel_media_errors", cJSON_PrintUnformatted(callStats)); + + cJSON_Delete(callStats); + + return SWITCH_STATUS_SUCCESS; +} + +static switch_status_t kz_report_channel_stats(switch_core_session_t *session, switch_event_t *cdr_event) +{ + cJSON *callStats = cJSON_CreateObject(); + + kz_switch_ivr_set_json_call_stats(callStats, session, SWITCH_MEDIA_TYPE_AUDIO); + kz_switch_ivr_set_json_call_stats(callStats, session, SWITCH_MEDIA_TYPE_VIDEO); + + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, "_json_channel_stats", cJSON_PrintUnformatted(callStats)); + + cJSON_Delete(callStats); + + return SWITCH_STATUS_SUCCESS; + +} + +static switch_status_t kz_report_app_log(switch_core_session_t *session, switch_event_t *cdr_event) +{ + switch_app_log_t *ap, *app_log = 
switch_core_session_get_app_log(session); + cJSON *j_apps = NULL; + + if (!app_log) { + return SWITCH_STATUS_FALSE; + } + + j_apps = cJSON_CreateArray(); + + for (ap = app_log; ap; ap = ap->next) { + cJSON *j_application = cJSON_CreateObject(); + cJSON_AddItemToObject(j_application, "app_name", cJSON_CreateString(ap->app)); + cJSON_AddItemToObject(j_application, "app_data", cJSON_CreateString(ap->arg)); + cJSON_AddItemToObject(j_application, "app_stamp", cJSON_CreateNumber(ap->stamp)); + cJSON_AddItemToArray(j_apps, j_application); + } + + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, "_json_application_log", cJSON_PrintUnformatted(j_apps)); + + cJSON_Delete(j_apps); + + return SWITCH_STATUS_SUCCESS; + +} + +static switch_status_t kz_report_callflow_extension(switch_caller_profile_t *caller_profile, cJSON *j_profile) +{ + cJSON *j_caller_extension, *j_caller_extension_apps, *j_application, *j_inner_extension; + if (caller_profile->caller_extension) { + switch_caller_application_t *ap; + + j_caller_extension = cJSON_CreateObject(); + j_caller_extension_apps = cJSON_CreateArray(); + + cJSON_AddItemToObject(j_profile, "extension", j_caller_extension); + + cJSON_AddItemToObject(j_caller_extension, "name", cJSON_CreateString(caller_profile->caller_extension->extension_name)); + cJSON_AddItemToObject(j_caller_extension, "number", cJSON_CreateString(caller_profile->caller_extension->extension_number)); + cJSON_AddItemToObject(j_caller_extension, "applications", j_caller_extension_apps); + + if (caller_profile->caller_extension->current_application) { + cJSON_AddItemToObject(j_caller_extension, "current_app", cJSON_CreateString(caller_profile->caller_extension->current_application->application_name)); + } + + for (ap = caller_profile->caller_extension->applications; ap; ap = ap->next) { + j_application = cJSON_CreateObject(); + + cJSON_AddItemToArray(j_caller_extension_apps, j_application); + + if (ap == caller_profile->caller_extension->current_application) { + cJSON_AddItemToObject(j_application, "last_executed", cJSON_CreateString("true")); + } + cJSON_AddItemToObject(j_application, "app_name", cJSON_CreateString(ap->application_name)); + cJSON_AddItemToObject(j_application, "app_data", cJSON_CreateString(switch_str_nil(ap->application_data))); + } + + if (caller_profile->caller_extension->children) { + switch_caller_profile_t *cp = NULL; + j_inner_extension = cJSON_CreateArray(); + cJSON_AddItemToObject(j_caller_extension, "sub_extensions", j_inner_extension); + for (cp = caller_profile->caller_extension->children; cp; cp = cp->next) { + + if (!cp->caller_extension) { + continue; + } + + j_caller_extension = cJSON_CreateObject(); + cJSON_AddItemToArray(j_inner_extension, j_caller_extension); + + cJSON_AddItemToObject(j_caller_extension, "name", cJSON_CreateString(cp->caller_extension->extension_name)); + cJSON_AddItemToObject(j_caller_extension, "number", cJSON_CreateString(cp->caller_extension->extension_number)); + + cJSON_AddItemToObject(j_caller_extension, "dialplan", cJSON_CreateString((char *)cp->dialplan)); + + if (cp->caller_extension->current_application) { + cJSON_AddItemToObject(j_caller_extension, "current_app", cJSON_CreateString(cp->caller_extension->current_application->application_name)); + } + + j_caller_extension_apps = cJSON_CreateArray(); + cJSON_AddItemToObject(j_caller_extension, "applications", j_caller_extension_apps); + for (ap = cp->caller_extension->applications; ap; ap = ap->next) { + j_application = cJSON_CreateObject(); + 
cJSON_AddItemToArray(j_caller_extension_apps, j_application); + + if (ap == cp->caller_extension->current_application) { + cJSON_AddItemToObject(j_application, "last_executed", cJSON_CreateString("true")); + } + cJSON_AddItemToObject(j_application, "app_name", cJSON_CreateString(ap->application_name)); + cJSON_AddItemToObject(j_application, "app_data", cJSON_CreateString(switch_str_nil(ap->application_data))); + } + } + } + } + + return SWITCH_STATUS_SUCCESS; + +} + +static switch_status_t kz_report_callflow(switch_core_session_t *session, switch_event_t *cdr_event) +{ + switch_channel_t *channel = switch_core_session_get_channel(session); + switch_caller_profile_t *caller_profile; + cJSON *j_main_cp, *j_times, *j_callflow, *j_profile, *j_o; + + + caller_profile = switch_channel_get_caller_profile(channel); + + j_callflow = cJSON_CreateArray(); + + while (caller_profile) { + + j_profile = cJSON_CreateObject(); + + if (!zstr(caller_profile->dialplan)) { + cJSON_AddItemToObject(j_profile, "dialplan", cJSON_CreateString((char *)caller_profile->dialplan)); + } + + if (!zstr(caller_profile->profile_index)) { + cJSON_AddItemToObject(j_profile, "profile_index", cJSON_CreateString((char *)caller_profile->profile_index)); + } + + kz_report_callflow_extension(caller_profile, j_profile); + + j_main_cp = cJSON_CreateObject(); + cJSON_AddItemToObject(j_profile, "Caller-Profile", j_main_cp); + + kz_switch_ivr_set_json_profile_data(j_main_cp, caller_profile); + + if (caller_profile->originator_caller_profile) { + j_o = cJSON_CreateObject(); + cJSON_AddItemToObject(j_main_cp, "originator", j_o); + kz_switch_ivr_set_json_profile_data(j_o, caller_profile->originator_caller_profile); + kz_report_callflow_extension(caller_profile->originator_caller_profile, j_o); + } + + if (caller_profile->originatee_caller_profile) { + j_o = cJSON_CreateObject(); + cJSON_AddItemToObject(j_main_cp, "originatee", j_o); + kz_switch_ivr_set_json_profile_data(j_o, caller_profile->originatee_caller_profile); + kz_report_callflow_extension(caller_profile->originatee_caller_profile, j_o); + } + + if (caller_profile->times) { + j_times = cJSON_CreateObject(); + cJSON_AddItemToObject(j_profile, "Time", j_times); + cJSON_AddItemToObject(j_times, "Created", cJSON_CreateNumber(caller_profile->times->created)); + cJSON_AddItemToObject(j_times, "Profile-Created", cJSON_CreateNumber(caller_profile->times->profile_created)); + cJSON_AddItemToObject(j_times, "Progress", cJSON_CreateNumber(caller_profile->times->progress)); + cJSON_AddItemToObject(j_times, "Progress-Media", cJSON_CreateNumber(caller_profile->times->progress_media)); + cJSON_AddItemToObject(j_times, "Answered", cJSON_CreateNumber(caller_profile->times->answered)); + cJSON_AddItemToObject(j_times, "Bridged", cJSON_CreateNumber(caller_profile->times->bridged)); + cJSON_AddItemToObject(j_times, "Last-Hold", cJSON_CreateNumber(caller_profile->times->last_hold)); + cJSON_AddItemToObject(j_times, "Hold-Accumulated", cJSON_CreateNumber(caller_profile->times->hold_accum)); + cJSON_AddItemToObject(j_times, "Hangup", cJSON_CreateNumber(caller_profile->times->hungup)); + cJSON_AddItemToObject(j_times, "Resurrect", cJSON_CreateNumber(caller_profile->times->resurrected)); + cJSON_AddItemToObject(j_times, "Transfer", cJSON_CreateNumber(caller_profile->times->transferred)); + } + cJSON_AddItemToArray(j_callflow, j_profile); + caller_profile = caller_profile->next; + } + + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, "_json_callflow", 
cJSON_PrintUnformatted(j_callflow)); + + cJSON_Delete(j_callflow); + + + return SWITCH_STATUS_SUCCESS; + +} + + +#define ORIGINATED_LEGS_VARIABLE "originated_legs" +#define ORIGINATED_LEGS_ITEM_DELIM ';' + +#define ORIGINATE_CAUSES_VARIABLE "originate_causes" +#define ORIGINATE_CAUSES_ITEM_DELIM ';' + +static switch_status_t kz_report_originated_legs(switch_core_session_t *session, switch_event_t *cdr_event) +{ + switch_channel_t *channel = switch_core_session_get_channel(session); + cJSON *j_originated = cJSON_CreateArray(); + const char *originated_legs_var = NULL, *originate_causes_var = NULL; + int idx = 0; + + while(1) { + char *argv_leg[10] = { 0 }, *argv_cause[10] = { 0 }; + char *originated_legs, *originate_causes; + cJSON *j_originated_leg; + originated_legs_var = switch_channel_get_variable_dup(channel, ORIGINATED_LEGS_VARIABLE, SWITCH_FALSE, idx); + originate_causes_var = switch_channel_get_variable_dup(channel, ORIGINATE_CAUSES_VARIABLE, SWITCH_FALSE, idx); + + if (zstr(originated_legs_var) || zstr(originate_causes_var)) { + break; + } + + originated_legs = strdup(originated_legs_var); + originate_causes = strdup(originate_causes_var); + + switch_separate_string(originated_legs, ORIGINATED_LEGS_ITEM_DELIM, argv_leg, (sizeof(argv_leg) / sizeof(argv_leg[0]))); + switch_separate_string(originate_causes, ORIGINATE_CAUSES_ITEM_DELIM, argv_cause, (sizeof(argv_cause) / sizeof(argv_cause[0]))); + + j_originated_leg = cJSON_CreateObject(); + cJSON_AddItemToObject(j_originated_leg, "Call-ID", cJSON_CreateString(argv_leg[0])); + cJSON_AddItemToObject(j_originated_leg, "Caller-ID-Name", cJSON_CreateString(argv_leg[1])); + cJSON_AddItemToObject(j_originated_leg, "Caller-ID-Number", cJSON_CreateString(argv_leg[2])); + cJSON_AddItemToObject(j_originated_leg, "Result", cJSON_CreateString(argv_cause[1])); + + cJSON_AddItemToArray(j_originated, j_originated_leg); + + switch_safe_free(originated_legs); + switch_safe_free(originate_causes); + + idx++; + } + + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, "_json_originated_legs", cJSON_PrintUnformatted(j_originated)); + + cJSON_Delete(j_originated); + + return SWITCH_STATUS_SUCCESS; +} + +#define MAX_HISTORY 50 +#define HST_ARRAY_DELIM "|:" +#define HST_ITEM_DELIM ':' + +static void kz_report_transfer_history_item(char* value, cJSON *json) +{ + char *argv[4] = { 0 }; + char *item = strdup(value); + int argc = switch_separate_string(item, HST_ITEM_DELIM, argv, (sizeof(argv) / sizeof(argv[0]))); + cJSON *jitem = cJSON_CreateObject(); + char *epoch = NULL, *callid = NULL, *type = NULL; + int add = 0; + if(argc == 4) { + add = 1; + epoch = argv[0]; + callid = argv[1]; + type = argv[2]; + + if(!strncmp(type, "bl_xfer", 7)) { + //char *split = strchr(argv[3], '/'); + //if(split) *(split++) = '\0'; + cJSON_AddItemToObject(jitem, "Caller-Profile-ID", cJSON_CreateString(callid)); + cJSON_AddItemToObject(jitem, "Type", cJSON_CreateString("blind")); + cJSON_AddItemToObject(jitem, "Extension", cJSON_CreateString(argv[3])); + cJSON_AddItemToObject(jitem, "Timestamp", cJSON_CreateNumber(strtod(epoch, NULL))); + } else if(!strncmp(type, "att_xfer", 8)) { + char *split = strchr(argv[3], '/'); + if(split) { + *(split++) = '\0'; + cJSON_AddItemToObject(jitem, "Caller-Profile-ID", cJSON_CreateString(callid)); + cJSON_AddItemToObject(jitem, "Type", cJSON_CreateString("attended")); + cJSON_AddItemToObject(jitem, "Transferee", cJSON_CreateString(argv[3])); + cJSON_AddItemToObject(jitem, "Transferer", cJSON_CreateString(split)); + 
cJSON_AddItemToObject(jitem, "Timestamp", cJSON_CreateNumber(strtod(epoch, NULL))); + } else { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER TYPE '%s' NOT HANDLED => %s\n", type, item); + add = 0; + } + } else if(!strncmp(type, "uuid_br", 7)) { + cJSON_AddItemToObject(jitem, "Caller-Profile-ID", cJSON_CreateString(callid)); + cJSON_AddItemToObject(jitem, "Type", cJSON_CreateString("bridge")); + cJSON_AddItemToObject(jitem, "Other-Leg", cJSON_CreateString(argv[3])); + cJSON_AddItemToObject(jitem, "Timestamp", cJSON_CreateNumber(strtod(epoch, NULL))); + } else { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER TYPE '%s' NOT HANDLED => %s\n", type, item); + add = 0; + } + } else { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER TYPE SPLIT ERROR %i => %s\n", argc, item); + } + if(add) { + cJSON_AddItemToArray(json, jitem); + } else { + cJSON_Delete(jitem); + } + switch_safe_free(item); +} + +static switch_status_t kz_report_transfer_history(switch_core_session_t *session, switch_event_t *cdr_event, const char* var_name) +{ + switch_channel_t *channel = switch_core_session_get_channel(session); + cJSON *j_transfer = NULL; + char *tmp_history = NULL, *history = NULL, *argv[MAX_HISTORY] = { 0 }; + char event_header[50]; + int n, argc = 0; + const char *transfer_var = switch_channel_get_variable_dup(channel, var_name, SWITCH_FALSE, -1); + if (zstr(transfer_var)) { + return SWITCH_STATUS_SUCCESS; + } + + if (!(tmp_history = strdup(transfer_var))) { + return SWITCH_STATUS_SUCCESS; + } + + sprintf(event_header, "_json_%s", var_name); + history = tmp_history; + j_transfer = cJSON_CreateArray(); + + if (!strncmp(history, "ARRAY::", 7)) { + history += 7; + argc = switch_separate_string_string(history, HST_ARRAY_DELIM, argv, (sizeof(argv) / sizeof(argv[0]))); + for(n=0; n < argc; n++) { + kz_report_transfer_history_item(argv[n], j_transfer); + } + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, event_header, cJSON_PrintUnformatted(j_transfer)); + } else if (strchr(history, HST_ITEM_DELIM)) { + kz_report_transfer_history_item(history, j_transfer); + switch_event_add_header_string(cdr_event, SWITCH_STACK_BOTTOM | SWITCH_STACK_NODUP, event_header, cJSON_PrintUnformatted(j_transfer)); + } + cJSON_Delete(j_transfer); + switch_safe_free(tmp_history); + + return SWITCH_STATUS_SUCCESS; +} + +static switch_status_t kz_report(switch_core_session_t *session, switch_event_t *cdr_event) +{ + kz_report_app_log(session, cdr_event); + kz_report_callflow(session, cdr_event); + kz_report_channel_stats(session, cdr_event); + kz_report_channel_flaws(session, cdr_event); + kz_report_originated_legs(session, cdr_event); + kz_report_transfer_history(session, cdr_event, SWITCH_TRANSFER_HISTORY_VARIABLE); + kz_report_transfer_history(session, cdr_event, SWITCH_TRANSFER_SOURCE_VARIABLE); + return SWITCH_STATUS_SUCCESS; +} + + +static switch_status_t kz_cdr_on_reporting(switch_core_session_t *session) +{ + switch_event_t *cdr_event = NULL; + switch_channel_t *channel = switch_core_session_get_channel(session); + + if (switch_event_create_subclass(&cdr_event, SWITCH_EVENT_CUSTOM, MY_EVENT_JSON_CDR) != SWITCH_STATUS_SUCCESS) { + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "error creating event for report data!\n"); + return SWITCH_STATUS_FALSE; + } + + kz_report(session, cdr_event); + switch_channel_event_set_data(channel, cdr_event); + switch_event_fire(&cdr_event); + + return SWITCH_STATUS_SUCCESS; +} 
+ + +static switch_state_handler_table_t kz_cdr_state_handlers = { + /*.on_init */ NULL, + /*.on_routing */ NULL, + /*.on_execute */ NULL, + /*.on_hangup */ NULL, + /*.on_exchange_media */ NULL, + /*.on_soft_execute */ NULL, + /*.on_consume_media */ NULL, + /*.on_hibernate */ NULL, + /*.on_reset */ NULL, + /*.on_park */ NULL, + /*.on_reporting */ kz_cdr_on_reporting +}; + + +static void kz_cdr_register_state_handlers() +{ + switch_core_add_state_handler(&kz_cdr_state_handlers); +} + +static void kz_cdr_unregister_state_handlers() +{ + switch_core_remove_state_handler(&kz_cdr_state_handlers); +} + +static void kz_cdr_register_events() +{ + if (switch_event_reserve_subclass(MY_EVENT_JSON_CDR) != SWITCH_STATUS_SUCCESS) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Couldn't register subclass %s!\n", MY_EVENT_JSON_CDR); + } +} + +static void kz_cdr_unregister_events() +{ + switch_event_free_subclass(MY_EVENT_JSON_CDR); +} + + +void kz_cdr_start() +{ + kz_cdr_register_events(); + kz_cdr_register_state_handlers(); +} + +void kz_cdr_stop() +{ + kz_cdr_unregister_state_handlers(); + kz_cdr_unregister_events(); +} + +/* For Emacs: + * Local Variables: + * mode:c + * indent-tabs-mode:t + * tab-width:4 + * c-basic-offset:4 + * End: + * For VIM: + * vim:set softtabstop=4 shiftwidth=4 tabstop=4: + */ diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_commands.c b/src/mod/event_handlers/mod_kazoo/kazoo_commands.c index 2e9a8f2459a..c1ac69f6200 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_commands.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_commands.c @@ -48,138 +48,69 @@ #define MAX_FIRST_OF 25 -#define MAX_HISTORY 50 -#define HST_ARRAY_DELIM "|:" -#define HST_ITEM_DELIM ':' - -static void process_history_item(char* value, cJSON *json) -{ - char *argv[4] = { 0 }; - char *item = strdup(value); - int argc = switch_separate_string(item, HST_ITEM_DELIM, argv, (sizeof(argv) / sizeof(argv[0]))); - cJSON *jitem = cJSON_CreateObject(); - char *epoch = NULL, *callid = NULL, *type = NULL; - int add = 0; - if(argc == 4) { - add = 1; - epoch = argv[0]; - callid = argv[1]; - type = argv[2]; - - if(!strncmp(type, "bl_xfer", 7)) { - char *split = strchr(argv[3], '/'); - if(split) *(split++) = '\0'; - cJSON_AddItemToObject(jitem, "Call-ID", cJSON_CreateString(callid)); - cJSON_AddItemToObject(jitem, "Type", cJSON_CreateString("blind")); - cJSON_AddItemToObject(jitem, "Extension", cJSON_CreateString(argv[3])); - } else if(!strncmp(type, "att_xfer", 8)) { - char *split = strchr(argv[3], '/'); - if(split) { - *(split++) = '\0'; - cJSON_AddItemToObject(jitem, "Call-ID", cJSON_CreateString(callid)); - cJSON_AddItemToObject(jitem, "Type", cJSON_CreateString("attended")); - cJSON_AddItemToObject(jitem, "Transferee", cJSON_CreateString(argv[3])); - cJSON_AddItemToObject(jitem, "Transferer", cJSON_CreateString(split)); - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER TYPE '%s' NOT HANDLED => %s\n", type, item); - add = 0; - } - } else if(!strncmp(type, "uuid_br", 7)) { - cJSON_AddItemToObject(jitem, "Call-ID", cJSON_CreateString(callid)); - cJSON_AddItemToObject(jitem, "Type", cJSON_CreateString("bridge")); - cJSON_AddItemToObject(jitem, "Other-Leg", cJSON_CreateString(argv[3])); - - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER TYPE '%s' NOT HANDLED => %s\n", type, item); - add = 0; - } - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER TYPE SPLIT ERROR %i => %s\n", argc, item); - } - if(add) { - 
cJSON_AddItemToObject(json, epoch, jitem); - } else { - cJSON_Delete(jitem); - } - switch_safe_free(item); -} - -SWITCH_STANDARD_API(kz_json_history) +SWITCH_STANDARD_API(kz_first_of) { - char *mycmd = NULL, *argv[MAX_HISTORY] = { 0 }; + char delim = '|'; + char *mycmd = NULL, *mycmd_dup = NULL, *argv[MAX_FIRST_OF] = { 0 }; int n, argc = 0; - cJSON *json = cJSON_CreateObject(); - char* output = NULL; switch_event_header_t *header = NULL; - if (!zstr(cmd) && (mycmd = strdup(cmd))) { - if (!strncmp(mycmd, "ARRAY::", 7)) { - mycmd += 7; - argc = switch_separate_string_string(mycmd, HST_ARRAY_DELIM, argv, (sizeof(argv) / sizeof(argv[0]))); - for(n=0; n < argc; n++) { - process_history_item(argv[n], json); - } - } else if (strchr(mycmd, HST_ITEM_DELIM)) { - process_history_item(mycmd, json); - } else if (stream->param_event) { - header = switch_event_get_header_ptr(stream->param_event, mycmd); - if (header != NULL) { - if(header->idx) { - for(n = 0; n < header->idx; n++) { - process_history_item(header->array[n], json); - } - } else { - process_history_item(header->value, json); - } + switch_channel_t *channel = NULL; - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER HISTORY HEADER NOT FOUND => %s\n", mycmd); - } - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "TRANSFER HISTORY NOT PARSED => %s\n", mycmd); - } + if (zstr(cmd)) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "invalid arg\n"); + return SWITCH_STATUS_GENERR; } - output = cJSON_PrintUnformatted(json); - stream->write_function(stream, "%s", output); - switch_safe_free(output); - cJSON_Delete(json); - return SWITCH_STATUS_SUCCESS; -} + if ( session ) { + channel = switch_core_session_get_channel(session); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "GOT CHANNEL\n"); + } -SWITCH_STANDARD_API(kz_first_of) -{ - char delim = '|'; - char *mycmd = NULL, *argv[MAX_FIRST_OF] = { 0 }; - int n, argc = 0; - switch_event_header_t *header = NULL; - if (!zstr(cmd) && (mycmd = strdup(cmd))) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "FIRST-OF %s\n", mycmd); - if (!zstr(mycmd) && *mycmd == '^' && *(mycmd+1) == '^') { - mycmd += 2; - delim = *mycmd++; - } - argc = switch_separate_string(mycmd, delim, argv, (sizeof(argv) / sizeof(argv[0]))); - for(n=0; n < argc; n++) { - char* item = argv[n]; - if(*item == '#') { - if(*(++item) != '\0') { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "RETURNING default %s\n", item); - stream->write_function(stream, item); + mycmd_dup = mycmd = strdup(cmd); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "FIRST-OF %s\n", mycmd); + if (!zstr(mycmd) && *mycmd == '^' && *(mycmd+1) == '^') { + mycmd += 2; + delim = *mycmd++; + } + argc = switch_separate_string(mycmd, delim, argv, (sizeof(argv) / sizeof(argv[0]))); + for(n=0; n < argc; n++) { + char* item = argv[n]; + if(*item == '#' || *item == '!' 
|| *item == '?') { + if(*(++item) != '\0') { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "RETURNING default %s\n", item); + stream->write_function(stream, item); + break; + } + } else { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "CHECKING %s\n", item); + if (channel) { + const char *var = switch_channel_get_variable_dup(channel, item, SWITCH_FALSE, -1); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "CHECKING CHANNEL %s\n", item); + if (var) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "GOT FROM CHANNEL %s => %s\n", item, var); + stream->write_function(stream, var); break; } - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "CHECKING %s\n", item); - header = switch_event_get_header_ptr(stream->param_event, item); - if(header) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "RETURNING %s : %s\n", item, header->value); - stream->write_function(stream, header->value); - break; + if (!strncmp(item, "variable_", 9)) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "CHECKING CHANNEL %s\n", item+9); + var = switch_channel_get_variable_dup(channel, item+9, SWITCH_FALSE, -1); + if (var) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "GOT FROM CHANNEL %s => %s\n", item+9, var); + stream->write_function(stream, var); + break; + } + } + } + header = switch_event_get_header_ptr(stream->param_event, item); + if(header) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "RETURNING %s : %s\n", item, header->value); + stream->write_function(stream, header->value); + break; + } } } - switch_safe_free(mycmd); + switch_safe_free(mycmd_dup); return SWITCH_STATUS_SUCCESS; } @@ -364,7 +295,6 @@ SWITCH_STANDARD_API(kz_http_put) long httpRes = 0; struct stat file_info = {0}; FILE *file_to_put = NULL; - int fd; if (session) { pool = switch_core_session_get_pool(session); @@ -390,7 +320,11 @@ SWITCH_STANDARD_API(kz_http_put) /* parse params and get profile */ url = switch_core_strdup(pool, argv[0]); if (*url == '{') { - switch_event_create_brackets(url, '{', '}', ',', &params, &url, SWITCH_FALSE); + if (switch_event_create_brackets(url, '{', '}', ',', &params, &url, SWITCH_FALSE) != SWITCH_STATUS_SUCCESS) { + status = SWITCH_STATUS_FALSE; + stream->write_function(stream, "-ERR error parsing parameters\n"); + goto done; + } } filename = switch_core_strdup(pool, argv[1]); @@ -404,33 +338,26 @@ SWITCH_STANDARD_API(kz_http_put) } buf = switch_mprintf("Content-Type: %s", mime_type); - headers = switch_curl_slist_append(headers, buf); /* open file and get the file size */ switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "opening %s for upload to %s\n", filename, url); - fd = open(filename, O_RDONLY); - if (fd == -1) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "open() error: %s\n", strerror(errno)); - status = SWITCH_STATUS_FALSE; - stream->write_function(stream, "-ERR error opening file\n"); - goto done; - } - if (fstat(fd, &file_info) == -1) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "fstat() error: %s\n", strerror(errno)); - stream->write_function(stream, "-ERR fstat error\n"); - goto done; - } - close(fd); /* libcurl requires FILE* */ file_to_put = fopen(filename, "rb"); if (!file_to_put) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "fopen() error: %s\n", strerror(errno)); + stream->write_function(stream, "-ERR error opening file\n"); status = SWITCH_STATUS_FALSE; goto done; } + if (fstat(fileno(file_to_put), &file_info) == -1) { +
switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "fstat() error: %s\n", strerror(errno)); + stream->write_function(stream, "-ERR fstat error\n"); + goto done; + } + curl_handle = switch_curl_easy_init(); if (!curl_handle) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "switch_curl_easy_init() failure\n"); @@ -502,12 +429,92 @@ SWITCH_STANDARD_API(kz_http_put) SWITCH_STANDARD_API(kz_expand_api) { - if (!zstr(cmd)) { - char * val = kz_expand(cmd); - stream->write_function(stream, "+OK %s", val); - switch_safe_free(val); + char *p = NULL, *input = NULL; + char *uuid = NULL, *mycmd; + + if (zstr(cmd)) { + stream->write_function(stream, "-ERR invalid input"); + return SWITCH_STATUS_GENERR; + } + + if (!(mycmd = strdup(cmd))) { + stream->write_function(stream, "-ERR no memory"); + return SWITCH_STATUS_GENERR; + } + + if (!strncasecmp(mycmd, "uuid:", 5)) { + uuid = mycmd + 5; + if ((input = strchr(uuid, ' ')) != NULL) { + *input++ = '\0'; + } else { + stream->write_function(stream, "-ERR invalid argument"); + switch_safe_free(mycmd); + return SWITCH_STATUS_GENERR; + } } else { - stream->write_function(stream, "ERR invalid input"); + input = mycmd; + } + + p = kz_expand(input, uuid); + stream->write_function(stream, "+OK %s", p); + if (p != input) { + switch_safe_free(p); + } + switch_safe_free(mycmd); + return SWITCH_STATUS_SUCCESS; +} + +SWITCH_STANDARD_API(kz_eval_api) +{ + char *p = NULL, *input = NULL; + char *uuid = NULL, *mycmd; + switch_core_session_t *nsession = NULL; + switch_channel_t *channel = NULL; + + + if (zstr(cmd)) { + stream->write_function(stream, "-ERR invalid input"); + return SWITCH_STATUS_GENERR; + } + + if (!(mycmd = strdup(cmd))) { + stream->write_function(stream, "-ERR no memory"); + return SWITCH_STATUS_GENERR; + } + + if (!strncasecmp(mycmd, "uuid:", 5)) { + uuid = mycmd + 5; + if ((input = strchr(uuid, ' ')) != NULL) { + *input++ = '\0'; + if ((nsession = switch_core_session_locate(uuid)) != NULL) { + channel = switch_core_session_get_channel(nsession); + } else { + stream->write_function(stream, "-ERR invalid session"); + switch_safe_free(mycmd); + return SWITCH_STATUS_GENERR; + } + } else { + stream->write_function(stream, "-ERR invalid argument"); + switch_safe_free(mycmd); + return SWITCH_STATUS_GENERR; + } + } else if (session == NULL) { + stream->write_function(stream, "-ERR invalid argument"); + switch_safe_free(mycmd); + return SWITCH_STATUS_GENERR; + } else { + channel = switch_core_session_get_channel(session); + input = mycmd; + } + + p = switch_channel_expand_variables_check(channel, input, NULL, NULL, 0); + stream->write_function(stream, "+OK %s", p); + if (p != input) { + switch_safe_free(p); + } + switch_safe_free(mycmd); + if (nsession) { + switch_core_session_rwunlock(nsession); } return SWITCH_STATUS_SUCCESS; } @@ -524,7 +531,7 @@ void add_kz_commands(switch_loadable_module_interface_t **module_interface) { switch_console_set_complete("add kz_uuid_setvar_encoded ::console::list_uuid"); SWITCH_ADD_API(api_interface, "kz_http_put", KZ_HTTP_PUT_DESC, kz_http_put, KZ_HTTP_PUT_SYNTAX); SWITCH_ADD_API(api_interface, "first-of", KZ_FIRST_OF_DESC, kz_first_of, KZ_FIRST_OF_SYNTAX); - SWITCH_ADD_API(api_interface, "kz_json_history", KZ_FIRST_OF_DESC, kz_json_history, KZ_FIRST_OF_SYNTAX); SWITCH_ADD_API(api_interface, "kz_expand", KZ_FIRST_OF_DESC, kz_expand_api, KZ_FIRST_OF_SYNTAX); + SWITCH_ADD_API(api_interface, "kz_eval", KZ_FIRST_OF_DESC, kz_eval_api, KZ_FIRST_OF_SYNTAX); } diff --git 
a/src/mod/event_handlers/mod_kazoo/kazoo_config.c b/src/mod/event_handlers/mod_kazoo/kazoo_config.c index fcda9012ff2..a64677074b2 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_config.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_config.c @@ -129,6 +129,10 @@ switch_status_t kazoo_config_loglevels(switch_memory_pool_t *pool, switch_xml_t loglevels->warn_log_level = SWITCH_LOG_WARNING; loglevels->success_log_level = SWITCH_LOG_DEBUG; loglevels->time_log_level = SWITCH_LOG_DEBUG1; + loglevels->trace_log_level = SWITCH_LOG_DEBUG1; + loglevels->debug_log_level = SWITCH_LOG_DEBUG; + loglevels->error_log_level = SWITCH_LOG_ERROR; + loglevels->hashing_log_level = SWITCH_LOG_DEBUG1; if ((xml_logging = switch_xml_child(cfg, "logging")) != NULL) { for (xml_level = switch_xml_child(xml_logging, "log"); xml_level; xml_level = xml_level->next) { @@ -159,6 +163,14 @@ switch_status_t kazoo_config_loglevels(switch_memory_pool_t *pool, switch_xml_t loglevels->filtered_event_log_level = log_str2level(val); } else if (!strncmp(var, "filtered-field", 14)) { loglevels->filtered_field_log_level = log_str2level(val); + } else if (!strncmp(var, "trace", 5)) { + loglevels->trace_log_level = log_str2level(val); + } else if (!strncmp(var, "debug", 5)) { + loglevels->debug_log_level = log_str2level(val); + } else if (!strncmp(var, "error", 5)) { + loglevels->error_log_level = log_str2level(val); + } else if (!strncmp(var, "hashing", 7)) { + loglevels->hashing_log_level = log_str2level(val); } } /* xml_level for loop */ } @@ -247,6 +259,7 @@ switch_status_t kazoo_config_field(kazoo_config_ptr definitions, switch_memory_p const char *type = switch_xml_attr(cfg, "type"); const char *exclude_prefix = switch_xml_attr(cfg, "exclude-prefix"); const char *serialize_as = switch_xml_attr(cfg, "serialize-as"); + const char *as_array = switch_xml_attr(cfg, "as-array"); kazoo_field_ptr cur = (kazoo_field_ptr) switch_core_alloc(pool, sizeof(kazoo_field)); cur->in_type = FIELD_NONE; cur->out_type = JSON_NONE; @@ -292,6 +305,10 @@ switch_status_t kazoo_config_field(kazoo_config_ptr definitions, switch_memory_p } } + if(as_array) { + cur->out_type_as_array = switch_true(as_array); + } + if(exclude_prefix) cur->exclude_prefix = switch_true(exclude_prefix); diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_dptools.c b/src/mod/event_handlers/mod_kazoo/kazoo_dptools.c index da5354f3ec9..5a016509694 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_dptools.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_dptools.c @@ -267,7 +267,7 @@ void kz_uuid_multiset(switch_core_session_t *session, const char* data, int urld if(delim != '\0') { switch_core_session_t *uuid_session = NULL; - if ((uuid_session = switch_core_session_force_locate(arg0)) != NULL) { + if ((uuid_session = switch_core_session_locate(arg0)) != NULL) { switch_channel_t *uuid_channel = switch_core_session_get_channel(uuid_session); if (arg) { char *array[256] = {0}; @@ -443,6 +443,75 @@ SWITCH_STANDARD_APP(kz_endless_playback_function) } +SWITCH_STANDARD_APP(kz_moh_function) +{ + switch_channel_t *channel = switch_core_session_get_channel(session); + switch_status_t status = SWITCH_STATUS_SUCCESS; + switch_file_handle_t fh = { 0 }; + const char *var_samples = switch_channel_get_variable_dup(channel, "moh_playback_samples", SWITCH_FALSE, -1); + unsigned int samples = 0; + char * my_data = NULL; + + if (var_samples) { + fh.samples = samples = atoi(var_samples); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "SETTING SAMPLES %d\n", samples); + 
} + + switch_channel_set_variable(channel, SWITCH_PLAYBACK_TERMINATOR_USED, ""); + + /* + * hack for proper position + */ + if (!strncmp(data, "http_cache://", 13) && session) { + switch_channel_t *channel = switch_core_session_get_channel(session); + char * resolve = switch_mprintf("${http_get({prefetch=true}%s)}", data+13); + my_data = switch_channel_expand_variables_check(channel, resolve, NULL, NULL, 0); + if (my_data != resolve) { + switch_safe_free(resolve); + } + } else { + my_data = strdup(data); + } + + status = switch_ivr_play_file(session, &fh, my_data, NULL); + + switch_assert(!(fh.flags & SWITCH_FILE_OPEN)); + + switch (status) { + case SWITCH_STATUS_SUCCESS: + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH PLAYED SUCCESS\n"); + switch_channel_set_variable(channel, SWITCH_CURRENT_APPLICATION_RESPONSE_VARIABLE, "MOH FILE PLAYED"); + switch_channel_set_variable(channel, "moh_playback_samples", "0"); + break; + case SWITCH_STATUS_BREAK: + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH PLAYED BREAK\n"); + switch_channel_set_variable(channel, SWITCH_CURRENT_APPLICATION_RESPONSE_VARIABLE, "MOH FILE PLAYED"); + if ((var_samples = switch_channel_get_variable_dup(channel, "playback_samples", SWITCH_FALSE, -1)) != NULL) { + samples += atoi(var_samples); + if (samples >= fh.samples) { + samples = 0; + } + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "SETTING MOH SAMPLES %d\n", samples); + switch_channel_set_variable_printf(channel, "moh_playback_samples", "%d", samples); + } + break; + case SWITCH_STATUS_NOTFOUND: + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH PLAYED NOT FOUND\n"); + switch_channel_set_variable(channel, SWITCH_CURRENT_APPLICATION_RESPONSE_VARIABLE, "MOH FILE NOT FOUND"); + break; + default: + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "MOH PLAYED DEFAULT\n"); + switch_channel_set_variable(channel, SWITCH_CURRENT_APPLICATION_RESPONSE_VARIABLE, "MOH PLAYBACK ERROR"); + break; + } + + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH duration %" SWITCH_INT64_T_FMT "\n", fh.duration); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH offset_pos %d\n", fh.offset_pos); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH pos %" SWITCH_INT64_T_FMT "\n", fh.pos); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH sample_count %" SWITCH_SIZE_T_FMT "\n", fh.sample_count); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG1, "MOH samples %d\n", fh.samples); + + switch_safe_free(my_data); +} + SWITCH_STANDARD_APP(noop_function) { switch_channel_t *channel = switch_core_session_get_channel(session); @@ -696,11 +765,19 @@ static switch_status_t kz_att_xfer_hanguphook(switch_core_session_t *session) switch_channel_t *channel = switch_core_session_get_channel(session); switch_channel_state_t state = switch_channel_get_state(channel); const char *id = NULL; + const char *peer_uuid = NULL; if (state == CS_HANGUP || state == CS_ROUTING) { if ((id = switch_channel_get_variable(channel, "xfer_uuids"))) { switch_stream_handle_t stream = { 0 }; SWITCH_STANDARD_STREAM(stream); + if ((peer_uuid = switch_channel_get_variable(channel, "xfer_peer_uuid"))) { + switch_core_session_t *peer_session = NULL; + if ((peer_session = switch_core_session_locate(peer_uuid)) != NULL ) { +
switch_ivr_transfer_recordings(session, peer_session); + switch_core_session_rwunlock(peer_session); + } + } switch_api_execute("uuid_bridge", id, NULL, &stream); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "\nHangup Command uuid_bridge(%s):\n%s\n", id, switch_str_nil((char *) stream.data)); @@ -734,9 +811,12 @@ void *SWITCH_THREAD_FUNC kz_att_thread_run(switch_thread_t *thread, void *obj) switch_call_cause_t cause = SWITCH_CAUSE_NORMAL_CLEARING; switch_channel_t *channel = switch_core_session_get_channel(session), *peer_channel = NULL; const char *bond = NULL; - switch_core_session_t *b_session = NULL; switch_bool_t follow_recording = switch_true(switch_channel_get_variable(channel, "recording_follow_attxfer")); const char *attxfer_cancel_key = NULL, *attxfer_hangup_key = NULL, *attxfer_conf_key = NULL; + int br = 0; + switch_event_t *event = NULL; + switch_core_session_t *b_session = NULL; + switch_channel_t *b_channel = NULL; att->running = 1; @@ -746,18 +826,12 @@ void *SWITCH_THREAD_FUNC kz_att_thread_run(switch_thread_t *thread, void *obj) bond = switch_channel_get_partner_uuid(channel); if ((b_session = switch_core_session_locate(bond)) == NULL) { - switch_core_session_rwunlock(peer_session); + switch_core_session_rwunlock(session); return NULL; } - switch_channel_set_variable(channel, SWITCH_SOFT_HOLDING_UUID_VARIABLE, bond); switch_core_event_hook_add_state_change(session, kz_att_xfer_tmp_hanguphook); - if (follow_recording && (b_session = switch_core_session_locate(bond))) { - switch_ivr_transfer_recordings(b_session, session); - switch_core_session_rwunlock(b_session); - } - if (switch_ivr_originate(session, &peer_session, &cause, data, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL) != SWITCH_STATUS_SUCCESS || !peer_session) { switch_channel_set_variable(channel, SWITCH_SIGNAL_BOND_VARIABLE, bond); @@ -799,45 +873,39 @@ void *SWITCH_THREAD_FUNC kz_att_thread_run(switch_thread_t *thread, void *obj) switch_channel_clear_flag(peer_channel, CF_INNER_BRIDGE); switch_channel_clear_flag(channel, CF_INNER_BRIDGE); - if (zstr(bond) && switch_channel_down(peer_channel)) { + if (switch_channel_down(peer_channel)) { switch_core_session_rwunlock(peer_session); switch_channel_set_variable(channel, SWITCH_SIGNAL_BOND_VARIABLE, bond); goto end; } - if (bond) { - int br = 0; - - switch_channel_set_variable(channel, SWITCH_SIGNAL_BOND_VARIABLE, bond); + /* + * we're emiting the transferee event so that callctl can update + */ + b_channel = switch_core_session_get_channel(b_session); + if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, "sofia::transferee") == SWITCH_STATUS_SUCCESS) { + switch_channel_event_set_data(b_channel, event); + switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "att_xfer_replaced_call_id", switch_core_session_get_uuid(peer_session)); + switch_event_fire(&event); + } - if (!switch_channel_down(peer_channel)) { - /* - * we're emiting the transferee event so that callctl can update - */ - switch_event_t *event = NULL; - switch_channel_t *b_channel = switch_core_session_get_channel(b_session); - if (switch_event_create_subclass(&event, SWITCH_EVENT_CUSTOM, "sofia::transferee") == SWITCH_STATUS_SUCCESS) { - switch_channel_event_set_data(b_channel, event); - switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "att_xfer_replaced_call_id", switch_core_session_get_uuid(peer_session)); - switch_event_fire(&event); - } - if (!switch_channel_ready(channel)) { - switch_status_t status; + if (!switch_channel_ready(channel)) 
{ + switch_status_t status; - if (follow_recording) { - switch_ivr_transfer_recordings(session, peer_session); - } - status = switch_ivr_uuid_bridge(switch_core_session_get_uuid(peer_session), bond); - kz_att_xfer_set_result(peer_channel, status); - br++; - } else { - switch_channel_set_variable_printf(b_channel, "xfer_uuids", "%s %s", switch_core_session_get_uuid(peer_session), switch_core_session_get_uuid(session)); - switch_channel_set_variable_printf(channel, "xfer_uuids", "%s %s", switch_core_session_get_uuid(peer_session), bond); - - switch_core_event_hook_add_state_change(session, kz_att_xfer_hanguphook); - switch_core_event_hook_add_state_change(b_session, kz_att_xfer_hanguphook); - } + if (follow_recording) { + switch_ivr_transfer_recordings(session, peer_session); } + status = switch_ivr_uuid_bridge(switch_core_session_get_uuid(peer_session), bond); + kz_att_xfer_set_result(peer_channel, status); + br++; + } else { + // switch_channel_set_variable_printf(b_channel, "xfer_uuids", "%s %s", switch_core_session_get_uuid(peer_session), switch_core_session_get_uuid(session)); + switch_channel_set_variable_printf(channel, "xfer_uuids", "%s %s", switch_core_session_get_uuid(peer_session), bond); + switch_channel_set_variable(channel, "xfer_peer_uuid", switch_core_session_get_uuid(peer_session)); + + switch_core_event_hook_add_state_change(session, kz_att_xfer_hanguphook); + // switch_core_event_hook_add_state_change(b_session, kz_att_xfer_hanguphook); + } /* * this was commented so that the existing bridge @@ -849,8 +917,6 @@ void *SWITCH_THREAD_FUNC kz_att_thread_run(switch_thread_t *thread, void *obj) } */ - } - switch_core_session_rwunlock(peer_session); end: @@ -910,4 +976,5 @@ void add_kz_dptools(switch_loadable_module_interface_t **module_interface) { SWITCH_ADD_APP(app_interface, "kz_bridge", "Bridge Audio", "Bridge the audio between two sessions", kz_audio_bridge_function, "", SAF_SUPPORT_NOMEDIA|SAF_SUPPORT_TEXT_ONLY); SWITCH_ADD_APP(app_interface, "kz_bridge_uuid", "Bridge Audio", "Bridge the audio between two sessions", kz_audio_bridge_uuid_function, "", SAF_SUPPORT_NOMEDIA|SAF_SUPPORT_TEXT_ONLY); SWITCH_ADD_APP(app_interface, "kz_att_xfer", "Attended Transfer", "Attended Transfer", kz_att_xfer_function, "", SAF_NONE); + SWITCH_ADD_APP(app_interface, "kz_moh", "Kazoo MOH Playback", "Kazoo MOH Playback", kz_moh_function, "", SAF_NONE); } diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_ei.h b/src/mod/event_handlers/mod_kazoo/kazoo_ei.h index 1fe826b8461..5f617b5f1da 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_ei.h +++ b/src/mod/event_handlers/mod_kazoo/kazoo_ei.h @@ -73,6 +73,7 @@ struct ei_event_stream_s { switch_socket_t *socket; switch_mutex_t *socket_mutex; switch_bool_t connected; + switch_time_t connected_time; char remote_ip[48]; uint16_t remote_port; char local_ip[48]; @@ -81,6 +82,8 @@ struct ei_event_stream_s { uint32_t flags; ei_node_t *node; short event_stream_framing; + short event_stream_keepalive; + switch_interval_time_t queue_timeout; struct ei_event_stream_s *next; }; @@ -103,6 +106,10 @@ struct ei_node_s { uint32_t flags; int legacy; short event_stream_framing; + short event_stream_keepalive; + switch_interval_time_t event_stream_queue_timeout; + switch_interval_time_t receiver_queue_timeout; + switch_interval_time_t sender_queue_timeout; struct ei_node_s *next; }; @@ -158,7 +165,7 @@ struct kz_globals_s { switch_hash_t *event_filter; int epmdfd; - int num_worker_threads; + int node_worker_threads; switch_bool_t nat_map; switch_bool_t ei_shortname; 
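/*
 * The switch_interval_time_t timeouts introduced in this struct (and in
 * ei_event_stream_s above) are microsecond values. A timeout of 0 means
 * "never block": ei_queue_pop(), declared further down, dispatches on it
 * roughly like this (sketch of the behaviour; see its definition in
 * kazoo_ei_utils.c later in this patch):
 *
 *   if (timeout == 0)
 *       return switch_queue_trypop(queue, data);
 *   return switch_queue_pop_timeout(queue, data, timeout);
 */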
int ei_compat_rel; @@ -171,14 +178,19 @@ struct kz_globals_s { int send_all_headers; int send_all_private_headers; int connection_timeout; - int receive_timeout; + int ei_receive_timeout; + switch_interval_time_t node_sender_queue_timeout; + switch_interval_time_t node_receiver_queue_timeout; int receive_msg_preallocate; int event_stream_preallocate; int send_msg_batch; short event_stream_framing; + short event_stream_keepalive; + switch_interval_time_t event_stream_queue_timeout; switch_port_t port; int config_fetched; int io_fault_tolerance; + switch_interval_time_t io_fault_tolerance_sleep; kazoo_event_profile_ptr events; kazoo_config_ptr definitions; kazoo_config_ptr event_handlers; @@ -192,6 +204,8 @@ struct kz_globals_s { uint8_t tweaks[KZ_TWEAK_MAX]; switch_bool_t expand_headers_on_fetch; + switch_interval_time_t delay_before_initial_fetch; + }; typedef struct kz_globals_s kz_globals_t; @@ -233,6 +247,9 @@ int ei_decode_string_or_binary_limited(char *buf, int *index, int maxsize, char int ei_decode_string_or_binary(char *buf, int *index, char **dst); switch_status_t create_acceptor(); switch_hash_t *create_default_filter(); +void kz_erl_init(); +void kz_erl_shutdown(); +SWITCH_DECLARE(switch_status_t) ei_queue_pop(switch_queue_t *queue, void **data, switch_interval_time_t timeout); void fetch_config(); diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_ei_config.c b/src/mod/event_handlers/mod_kazoo/kazoo_ei_config.c index 83828a25a27..b2f3baf9632 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_ei_config.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_ei_config.c @@ -118,16 +118,23 @@ switch_status_t kazoo_ei_config(switch_xml_t cfg) { kazoo_globals.send_all_headers = 0; kazoo_globals.send_all_private_headers = 1; kazoo_globals.connection_timeout = 500; - kazoo_globals.receive_timeout = 200; + kazoo_globals.ei_receive_timeout = 200; kazoo_globals.receive_msg_preallocate = 2000; kazoo_globals.event_stream_preallocate = KZ_DEFAULT_STREAM_PRE_ALLOCATE; kazoo_globals.send_msg_batch = 10; kazoo_globals.event_stream_framing = 2; + kazoo_globals.event_stream_keepalive = 1; + kazoo_globals.event_stream_queue_timeout = 200000; + kazoo_globals.node_receiver_queue_timeout = 100000; + kazoo_globals.node_sender_queue_timeout = 0; kazoo_globals.port = 0; - kazoo_globals.io_fault_tolerance = 10; + kazoo_globals.io_fault_tolerance = 3; + kazoo_globals.io_fault_tolerance_sleep = 100000; // 100 ms kazoo_globals.json_encoding = ERLANG_TUPLE; + kazoo_globals.delay_before_initial_fetch = 10000000; kazoo_globals.legacy_events = SWITCH_FALSE; + kazoo_globals.expand_headers_on_fetch = SWITCH_TRUE; kz_set_tweak(KZ_TWEAK_INTERACTION_ID); kz_set_tweak(KZ_TWEAK_EXPORT_VARS); @@ -189,7 +196,7 @@ switch_status_t kazoo_ei_config(switch_xml_t cfg) { kazoo_globals.connection_timeout = atoi(val); } else if (!strcmp(var, "receive-timeout")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set receive-timeout: %s\n", val); - kazoo_globals.receive_timeout = atoi(val); + kazoo_globals.ei_receive_timeout = atoi(val); } else if (!strcmp(var, "receive-msg-preallocate")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set receive-msg-preallocate: %s\n", val); kazoo_globals.receive_msg_preallocate = atoi(val); @@ -202,12 +209,28 @@ switch_status_t kazoo_ei_config(switch_xml_t cfg) { } else if (!strcmp(var, "event-stream-framing")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set event-stream-framing: %s\n", val); kazoo_globals.event_stream_framing = atoi(val); + + } else if (!strcmp(var, 
"event-stream-keep-alive")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set event-stream-keep-alive: %s\n", val); + kazoo_globals.event_stream_keepalive = switch_true(val); + } else if (!strcmp(var, "io-fault-tolerance")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set io-fault-tolerance: %s\n", val); kazoo_globals.io_fault_tolerance = atoi(val); - } else if (!strcmp(var, "num-worker-threads")) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set num-worker-threads: %s\n", val); - kazoo_globals.num_worker_threads = atoi(val); + } else if (!strcmp(var, "io-fault-tolerance-sleep-micro")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.io_fault_tolerance_sleep = atoi(val); + } else if (!strcmp(var, "io-fault-tolerance-sleep-ms")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.io_fault_tolerance_sleep = atoi(val) * 1000; + } else if (!strcmp(var, "io-fault-tolerance-sleep-sec")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.io_fault_tolerance_sleep = atoi(val) * 1000000; + + + } else if (!strcmp(var, "node-worker-threads")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set node-worker-threads: %s\n", val); + kazoo_globals.node_worker_threads = atoi(val); } else if (!strcmp(var, "json-term-encoding")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set json-term-encoding: %s\n", val); if(!strcmp(val, "map")) { @@ -219,13 +242,33 @@ switch_status_t kazoo_ei_config(switch_xml_t cfg) { } else if (!strcmp(var, "expand-headers-on-fetch")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set expand-headers-on-fetch: %s\n", val); kazoo_globals.expand_headers_on_fetch = switch_true(val); + } else if (!strcmp(var, "node-receiver-queue-timeout")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.node_receiver_queue_timeout = atoi(val); + } else if (!strcmp(var, "node-sender-queue-timeout")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.node_sender_queue_timeout = atoi(val); + } else if (!strcmp(var, "event-stream-queue-timeout")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.event_stream_queue_timeout = atoi(val); + } else if (!strcmp(var, "delay-before-initial-fetch-micro")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.delay_before_initial_fetch = atoi(val); + } else if (!strcmp(var, "delay-before-initial-fetch-ms")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.delay_before_initial_fetch = atoi(val) * 1000; + } else if (!strcmp(var, "delay-before-initial-fetch-sec")) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set %s : %s\n", var, val); + kazoo_globals.delay_before_initial_fetch = atoi(val) * 1000000; + } else { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "unknown config option %s : %s\n", var, val); } } } if ((child = switch_xml_child(cfg, "tweaks"))) { - char *default_tweaks = (char *) switch_xml_attr_soft(param, "default"); - if (default_tweaks) { + char *default_tweaks = (char *) switch_xml_attr_soft(child, "default"); + if (default_tweaks && !zstr(default_tweaks)) { int i, v = switch_true(default_tweaks) ? 
1 : 0; switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Set tweak default : %s\n", default_tweaks); for (i = 0; i < KZ_TWEAK_MAX; i++) kazoo_globals.tweaks[i] = v; @@ -319,9 +362,9 @@ switch_status_t kazoo_ei_config(switch_xml_t cfg) { kazoo_globals.profile_vars_prefixes[i] = switch_core_strdup(kazoo_globals.pool, sep_array[i]); } - if (!kazoo_globals.num_worker_threads) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Number of worker threads not found in configuration, using default\n"); - kazoo_globals.num_worker_threads = 10; + if (!kazoo_globals.node_worker_threads) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Number of node worker threads not found in configuration, using default\n"); + kazoo_globals.node_worker_threads = 10; } if (zstr(kazoo_globals.ip)) { @@ -422,12 +465,9 @@ switch_status_t kazoo_config_handlers(switch_xml_t cfg) if(events == NULL) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Failed to get default handler for events\n"); - if(kazoo_globals.event_handlers != event_handlers) destroy_config(&event_handlers); - if(kazoo_globals.fetch_handlers != fetch_handlers) destroy_config(&fetch_handlers); - if(kazoo_globals.definitions != definitions) destroy_config(&definitions); - switch_xml_free(def); - switch_safe_free(xml); - return SWITCH_STATUS_GENERR; + destroy_config(&event_handlers); + event_handlers = kazoo_config_event_handlers(definitions, def); + events = (kazoo_event_profile_ptr) switch_core_hash_find(event_handlers->hash, "default"); } if(kazoo_globals.events != events) { @@ -505,9 +545,10 @@ switch_status_t kazoo_config_events(kazoo_config_ptr definitions, switch_memory_ cur->name = switch_core_strdup(pool, var); kazoo_config_filters(pool, event, &cur->filter); kazoo_config_fields(definitions, pool, event, &cur->fields); - + if (switch_xml_child(event, "logging") != NULL) { + kazoo_config_loglevels(pool, event, &cur->logging); + } } - } return SWITCH_STATUS_SUCCESS; diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_ei_utils.c b/src/mod/event_handlers/mod_kazoo/kazoo_ei_utils.c index 7f751db4f97..a460586ff3d 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_ei_utils.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_ei_utils.c @@ -54,91 +54,87 @@ #ifdef EI_DEBUG static void ei_x_print_reg_msg(ei_x_buff *buf, char *dest, int send) { - char *mbuf = NULL; - int i = 1; + char *mbuf = NULL; + int i = 1; - ei_s_print_term(&mbuf, buf->buff, &i); + ei_s_print_term(&mbuf, buf->buff, &i); - if (send) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Encoded term %s to '%s'\n", mbuf, dest); - } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Decoded term %s for '%s'\n", mbuf, dest); - } + if (send) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Encoded term %s to '%s'\n", mbuf, dest); + } else { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Decoded term %s for '%s'\n", mbuf, dest); + } - free(mbuf); + free(mbuf); } static void ei_x_print_msg(ei_x_buff *buf, erlang_pid *pid, int send) { - char *pbuf = NULL; - int i = 0; - ei_x_buff pidbuf; + char *pbuf = NULL; + int i = 0; + ei_x_buff pidbuf; - ei_x_new(&pidbuf); - ei_x_encode_pid(&pidbuf, pid); + ei_x_new(&pidbuf); + ei_x_encode_pid(&pidbuf, pid); - ei_s_print_term(&pbuf, pidbuf.buff, &i); + ei_s_print_term(&pbuf, pidbuf.buff, &i); - ei_x_print_reg_msg(buf, pbuf, send); - free(pbuf); + ei_x_print_reg_msg(buf, pbuf, send); + free(pbuf); } #endif -void ei_encode_switch_event_headers(ei_x_buff *ebuf, switch_event_t 
*event) { +void ei_encode_switch_event_headers(ei_x_buff *ebuf, switch_event_t *event) +{ ei_encode_switch_event_headers_2(ebuf, event, 1); } -void ei_encode_switch_event_headers_2(ei_x_buff *ebuf, switch_event_t *event, int encode) { - switch_event_header_t *hp; - char *uuid = switch_event_get_header(event, "unique-id"); - int i; +void ei_encode_switch_event_headers_2(ei_x_buff *ebuf, switch_event_t *event, int encode) +{ + switch_event_header_t *hp; + char *uuid = switch_event_get_header(event, "unique-id"); + int i; - for (i = 0, hp = event->headers; hp; hp = hp->next, i++); + for (i = 0, hp = event->headers; hp; hp = hp->next, i++) + ; - if (event->body) - i++; + if (event->body) + i++; - ei_x_encode_list_header(ebuf, i + 1); + ei_x_encode_list_header(ebuf, i + 1); - if (uuid) { + if (uuid) { char *unique_id = switch_event_get_header(event, "unique-id"); ei_x_encode_binary(ebuf, unique_id, strlen(unique_id)); - } else { - ei_x_encode_atom(ebuf, "undefined"); - } - - for (hp = event->headers; hp; hp = hp->next) { - ei_x_encode_tuple_header(ebuf, 2); - ei_x_encode_binary(ebuf, hp->name, strlen(hp->name)); - if(encode) { - switch_url_decode(hp->value); - } - ei_x_encode_binary(ebuf, hp->value, strlen(hp->value)); - } - - if (event->body) { - ei_x_encode_tuple_header(ebuf, 2); - ei_x_encode_binary(ebuf, "body", strlen("body")); - ei_x_encode_binary(ebuf, event->body, strlen(event->body)); - } - - ei_x_encode_empty_list(ebuf); + } else { + ei_x_encode_atom(ebuf, "undefined"); + } + + for (hp = event->headers; hp; hp = hp->next) { + ei_x_encode_tuple_header(ebuf, 2); + ei_x_encode_binary(ebuf, hp->name, strlen(hp->name)); + if (encode) { + switch_url_decode(hp->value); + } + ei_x_encode_binary(ebuf, hp->value, strlen(hp->value)); + } + + if (event->body) { + ei_x_encode_tuple_header(ebuf, 2); + ei_x_encode_binary(ebuf, "body", strlen("body")); + ei_x_encode_binary(ebuf, event->body, strlen(event->body)); + } + + ei_x_encode_empty_list(ebuf); } int ei_json_child_count(cJSON *JObj) { - int mask = cJSON_False - | cJSON_True - | cJSON_NULL - | cJSON_Number - | cJSON_String - | cJSON_Array - | cJSON_Object - | cJSON_Raw; + int mask = cJSON_False | cJSON_True | cJSON_NULL | cJSON_Number | cJSON_String | cJSON_Array | cJSON_Object | cJSON_Raw; cJSON *item = JObj->child; int i = 0; - while(item) { - if(item->type & mask) + while (item) { + if (item->type & mask) i++; item = item->next; } @@ -146,154 +142,165 @@ int ei_json_child_count(cJSON *JObj) } -void ei_encode_json_array(ei_x_buff *ebuf, cJSON *JObj) { +void ei_encode_json_array(ei_x_buff *ebuf, cJSON *JObj) +{ cJSON *item; int count = ei_json_child_count(JObj); - ei_x_encode_list_header(ebuf, count); - if(count == 0) - return; - - item = JObj->child; - while(item) { - switch(item->type) { - case cJSON_String: - ei_x_encode_binary(ebuf, item->valuestring, strlen(item->valuestring)); - break; - - case cJSON_Number: - ei_x_encode_double(ebuf, item->valuedouble); - break; - - case cJSON_True: - ei_x_encode_boolean(ebuf, 1); - break; - - case cJSON_False: - ei_x_encode_boolean(ebuf, 0); - break; - - case cJSON_Object: - ei_encode_json(ebuf, item); - break; - - case cJSON_Array: - ei_encode_json_array(ebuf, item); - break; - - case cJSON_Raw: - { - cJSON *Decoded = cJSON_Parse(item->valuestring); - if(!Decoded) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ERROR DECODING RAW JSON %s\n", item->valuestring); - ei_x_encode_tuple_header(ebuf, 0); - } else { - ei_encode_json(ebuf, Decoded); - cJSON_Delete(Decoded); - } - break; - } - - 
case cJSON_NULL: - ei_x_encode_atom(ebuf, "null"); - break; - - default: - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "NOT ENCODED %i\n", item->type); - break; - - } - item = item->next; - } - - ei_x_encode_empty_list(ebuf); + ei_x_encode_list_header(ebuf, count); + if (count == 0) + return; + + item = JObj->child; + while (item) { + switch (item->type){ + case cJSON_String: + ei_x_encode_binary(ebuf, item->valuestring, strlen(item->valuestring)); + break; + + case cJSON_Number: + if ((fabs(((double) item->valueint) - item->valuedouble) <= DBL_EPSILON) && (item->valuedouble <= INT_MAX) && (item->valuedouble >= INT_MIN)) { + ei_x_encode_longlong(ebuf, item->valueint); + } else { + if (fmod(item->valuedouble, 1) == 0) { + ei_x_encode_ulonglong(ebuf, item->valuedouble); + } else { + ei_x_encode_double(ebuf, item->valuedouble); + } + } + break; + + case cJSON_True: + ei_x_encode_boolean(ebuf, 1); + break; + + case cJSON_False: + ei_x_encode_boolean(ebuf, 0); + break; + + case cJSON_Object: + ei_encode_json(ebuf, item); + break; + + case cJSON_Array: + ei_encode_json_array(ebuf, item); + break; + + case cJSON_Raw: { + cJSON *Decoded = cJSON_Parse(item->valuestring); + if (!Decoded) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ERROR DECODING RAW JSON %s\n", item->valuestring); + ei_x_encode_tuple_header(ebuf, 0); + } else { + ei_encode_json(ebuf, Decoded); + cJSON_Delete(Decoded); + } + break; + } + + case cJSON_NULL: + ei_x_encode_atom(ebuf, "null"); + break; + + default: + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "NOT ENCODED %i\n", item->type); + break; + + } + item = item->next; + } + + ei_x_encode_empty_list(ebuf); } -void ei_encode_json(ei_x_buff *ebuf, cJSON *JObj) { +void ei_encode_json(ei_x_buff *ebuf, cJSON *JObj) +{ cJSON *item; int count = ei_json_child_count(JObj); - if(kazoo_globals.json_encoding == ERLANG_TUPLE) { - ei_x_encode_tuple_header(ebuf, 1); - ei_x_encode_list_header(ebuf, count); + if (kazoo_globals.json_encoding == ERLANG_TUPLE) { + ei_x_encode_tuple_header(ebuf, 1); + ei_x_encode_list_header(ebuf, count); } else { ei_x_encode_map_header(ebuf, count); } - if(count == 0) - return; - - item = JObj->child; - while(item) { - if(kazoo_globals.json_encoding == ERLANG_TUPLE) { - ei_x_encode_tuple_header(ebuf, 2); - } - ei_x_encode_binary(ebuf, item->string, strlen(item->string)); - - switch(item->type) { - case cJSON_String: - ei_x_encode_binary(ebuf, item->valuestring, strlen(item->valuestring)); - break; - - case cJSON_Number: - if ((fabs(((double)item->valueint) - item->valuedouble) <= DBL_EPSILON) - && (item->valuedouble <= INT_MAX) - && (item->valuedouble >= INT_MIN)) { - ei_x_encode_longlong(ebuf, item->valueint); - } else { - ei_x_encode_double(ebuf, item->valuedouble); - } - break; - - case cJSON_True: - ei_x_encode_boolean(ebuf, 1); - break; - - case cJSON_False: - ei_x_encode_boolean(ebuf, 0); - break; - - case cJSON_Object: - ei_encode_json(ebuf, item); - break; - - case cJSON_Array: - ei_encode_json_array(ebuf, item); - break; - - case cJSON_Raw: - { - cJSON *Decoded = cJSON_Parse(item->valuestring); - if(!Decoded) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ERROR DECODING RAW JSON %s\n", item->valuestring); - ei_x_encode_tuple_header(ebuf, 0); - } else { - ei_encode_json(ebuf, Decoded); - cJSON_Delete(Decoded); - } - break; - } - - case cJSON_NULL: - ei_x_encode_atom(ebuf, "null"); - break; - - default: - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "NOT ENCODED %i\n", item->type); - break; - 
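/*
 * Number rule used in the cJSON_Number case above (and repeated in
 * ei_encode_json() below): exact integers inside [INT_MIN, INT_MAX]
 * become Erlang integers, whole values outside that range go through
 * ei_x_encode_ulonglong(), and anything with a fractional part stays
 * a float. Illustratively:
 *
 *   42           -> ei_x_encode_longlong(ebuf, 42)
 *   5000000000.0 -> ei_x_encode_ulonglong(ebuf, 5000000000.0)
 *   3.14         -> ei_x_encode_double(ebuf, 3.14)
 *
 * One caveat worth noting: the ulonglong branch does not check the
 * sign, so a whole value below INT_MIN would be encoded as a large
 * unsigned integer.
 */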
- } - item = item->next; - } - - if(kazoo_globals.json_encoding == ERLANG_TUPLE) { + if (count == 0) + return; + + item = JObj->child; + while (item) { + if (kazoo_globals.json_encoding == ERLANG_TUPLE) { + ei_x_encode_tuple_header(ebuf, 2); + } + ei_x_encode_binary(ebuf, item->string, strlen(item->string)); + + switch (item->type){ + case cJSON_String: + ei_x_encode_binary(ebuf, item->valuestring, strlen(item->valuestring)); + break; + + case cJSON_Number: + if ((fabs(((double) item->valueint) - item->valuedouble) <= DBL_EPSILON) && (item->valuedouble <= INT_MAX) && (item->valuedouble >= INT_MIN)) { + ei_x_encode_longlong(ebuf, item->valueint); + } else { + if (fmod(item->valuedouble, 1) == 0) { + ei_x_encode_ulonglong(ebuf, item->valuedouble); + } else { + ei_x_encode_double(ebuf, item->valuedouble); + } + } + break; + + case cJSON_True: + ei_x_encode_boolean(ebuf, 1); + break; + + case cJSON_False: + ei_x_encode_boolean(ebuf, 0); + break; + + case cJSON_Object: + ei_encode_json(ebuf, item); + break; + + case cJSON_Array: + ei_encode_json_array(ebuf, item); + break; + + case cJSON_Raw: { + cJSON *Decoded = cJSON_Parse(item->valuestring); + if (!Decoded) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "ERROR DECODING RAW JSON %s\n", item->valuestring); + ei_x_encode_tuple_header(ebuf, 0); + } else { + ei_encode_json(ebuf, Decoded); + cJSON_Delete(Decoded); + } + break; + } + + case cJSON_NULL: + ei_x_encode_atom(ebuf, "null"); + break; + + default: + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "NOT ENCODED %i\n", item->type); + break; + + } + item = item->next; + } + + if (kazoo_globals.json_encoding == ERLANG_TUPLE) { ei_x_encode_empty_list(ebuf); } } -void close_socket(switch_socket_t ** sock) { +void close_socket(switch_socket_t ** sock) +{ if (*sock) { switch_socket_shutdown(*sock, SWITCH_SHUTDOWN_READWRITE); switch_socket_close(*sock); @@ -301,18 +308,20 @@ void close_socket(switch_socket_t ** sock) { } } -void close_socketfd(int *sockfd) { +void close_socketfd(int *sockfd) +{ if (*sockfd) { shutdown(*sockfd, SHUT_RDWR); close(*sockfd); } } -switch_socket_t *create_socket_with_port(switch_memory_pool_t *pool, switch_port_t port) { +switch_socket_t *create_socket_with_port(switch_memory_pool_t *pool, switch_port_t port) +{ switch_sockaddr_t *sa; switch_socket_t *socket; - if(switch_sockaddr_info_get(&sa, kazoo_globals.ip, SWITCH_UNSPEC, port, 0, pool)) { + if (switch_sockaddr_info_get(&sa, kazoo_globals.ip, SWITCH_UNSPEC, port, 0, pool)) { return NULL; } @@ -328,7 +337,7 @@ switch_socket_t *create_socket_with_port(switch_memory_pool_t *pool, switch_port return NULL; } - if (switch_socket_listen(socket, 5)){ + if (switch_socket_listen(socket, 5)) { return NULL; } @@ -339,30 +348,32 @@ switch_socket_t *create_socket_with_port(switch_memory_pool_t *pool, switch_port return socket; } -switch_socket_t *create_socket(switch_memory_pool_t *pool) { +switch_socket_t *create_socket(switch_memory_pool_t *pool) +{ return create_socket_with_port(pool, 0); } -switch_status_t create_ei_cnode(const char *ip_addr, const char *name, struct ei_cnode_s *ei_cnode) { - char hostname[EI_MAXHOSTNAMELEN + 1]; - char nodename[MAXNODELEN + 1]; - char cnodename[EI_MAXALIVELEN + 1]; - char *atsign; - - /* copy the erlang interface nodename into something we can modify */ - strncpy(cnodename, name, EI_MAXALIVELEN); - - if ((atsign = strchr(cnodename, '@'))) { - /* we got a qualified node name, don't guess the host/domain */ - snprintf(nodename, MAXNODELEN + 1, "%s", name); - /* truncate the 
alivename at the @ */ - *atsign++ = '\0'; - strncpy(hostname, atsign, EI_MAXHOSTNAMELEN); - } else { - strncpy(hostname, kazoo_globals.hostname, EI_MAXHOSTNAMELEN); - snprintf(nodename, MAXNODELEN + 1, "%s@%s", name, hostname); - } +switch_status_t create_ei_cnode(const char *ip_addr, const char *name, struct ei_cnode_s *ei_cnode) +{ + char hostname[EI_MAXHOSTNAMELEN + 1]; + char nodename[MAXNODELEN + 1]; + char cnodename[EI_MAXALIVELEN + 1]; + char *atsign; + + /* copy the erlang interface nodename into something we can modify */ + strncpy(cnodename, name, EI_MAXALIVELEN); + + if ((atsign = strchr(cnodename, '@'))) { + /* we got a qualified node name, don't guess the host/domain */ + snprintf(nodename, MAXNODELEN + 1, "%s", name); + /* truncate the alivename at the @ */ + *atsign++ = '\0'; + strncpy(hostname, atsign, EI_MAXHOSTNAMELEN); + } else { + strncpy(hostname, kazoo_globals.hostname, EI_MAXHOSTNAMELEN); + snprintf(nodename, MAXNODELEN + 1, "%s@%s", name, hostname); + } if (kazoo_globals.ei_shortname) { char *off; @@ -373,94 +384,97 @@ switch_status_t create_ei_cnode(const char *ip_addr, const char *name, struct ei switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "creating nodename: %s\n", nodename); - /* init the ec stuff */ - if (ei_connect_xinit(ei_cnode, hostname, cnodename, nodename, (Erl_IpAddr) ip_addr, kazoo_globals.ei_cookie, 0) < 0) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Failed to initialize the erlang interface connection structure\n"); - return SWITCH_STATUS_FALSE; - } + /* init the ec stuff */ + if (ei_connect_xinit(ei_cnode, hostname, cnodename, nodename, (Erl_IpAddr) ip_addr, kazoo_globals.ei_cookie, 0) < 0) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Failed to initialize the erlang interface connection structure\n"); + return SWITCH_STATUS_FALSE; + } - return SWITCH_STATUS_SUCCESS; + return SWITCH_STATUS_SUCCESS; } -switch_status_t ei_compare_pids(const erlang_pid *pid1, const erlang_pid *pid2) { - if ((!strcmp(pid1->node, pid2->node)) - && pid1->creation == pid2->creation - && pid1->num == pid2->num - && pid1->serial == pid2->serial) { - return SWITCH_STATUS_SUCCESS; - } else { - return SWITCH_STATUS_FALSE; - } +switch_status_t ei_compare_pids(const erlang_pid *pid1, const erlang_pid *pid2) +{ + if ((!strcmp(pid1->node, pid2->node)) && pid1->creation == pid2->creation && pid1->num == pid2->num && pid1->serial == pid2->serial) { + return SWITCH_STATUS_SUCCESS; + } else { + return SWITCH_STATUS_FALSE; + } } -void ei_link(ei_node_t *ei_node, erlang_pid * from, erlang_pid * to) { - char msgbuf[2048]; - char *s; - int index = 0; - - index = 5; /* max sizes: */ - ei_encode_version(msgbuf, &index); /* 1 */ - ei_encode_tuple_header(msgbuf, &index, 3); - ei_encode_long(msgbuf, &index, ERL_LINK); - ei_encode_pid(msgbuf, &index, from); /* 268 */ - ei_encode_pid(msgbuf, &index, to); /* 268 */ - - /* 5 byte header missing */ - s = msgbuf; - put32be(s, index - 4); /* 4 */ - put8(s, ERL_PASS_THROUGH); /* 1 */ - /* sum: 542 */ - - if (write(ei_node->nodefd, msgbuf, index) == -1) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Failed to link to process on %s\n", ei_node->peer_nodename); - } +void ei_link(ei_node_t *ei_node, erlang_pid * from, erlang_pid * to) +{ + char msgbuf[2048]; + char *s; + int index = 0; + + index = 5; /* max sizes: */ + ei_encode_version(msgbuf, &index); /* 1 */ + ei_encode_tuple_header(msgbuf, &index, 3); + ei_encode_long(msgbuf, &index, ERL_LINK); + ei_encode_pid(msgbuf, &index, from); /* 268 */ + 
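/*
 * Frame being assembled here: index starts at 5 to leave room for a
 * 4-byte big-endian length plus one ERL_PASS_THROUGH byte, which are
 * back-filled after the term {ERL_LINK, From, To} is encoded:
 *
 *   [len:4][ERL_PASS_THROUGH:1][version:1][{ERL_LINK, From, To}]
 *
 * The put32be()/put8() calls below write that header before the raw
 * write() to the node fd.
 */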
ei_encode_pid(msgbuf, &index, to); /* 268 */ + + /* 5 byte header missing */ + s = msgbuf; + put32be(s, index - 4); /* 4 */ + put8(s, ERL_PASS_THROUGH); /* 1 */ + /* sum: 542 */ + + if (write(ei_node->nodefd, msgbuf, index) == -1) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Failed to link to process on %s\n", ei_node->peer_nodename); + } } -void ei_encode_switch_event(ei_x_buff *ebuf, switch_event_t *event) { - ei_x_encode_tuple_header(ebuf, 2); - ei_x_encode_atom(ebuf, "event"); - ei_encode_switch_event_headers(ebuf, event); +void ei_encode_switch_event(ei_x_buff *ebuf, switch_event_t *event) +{ + ei_x_encode_tuple_header(ebuf, 2); + ei_x_encode_atom(ebuf, "event"); + ei_encode_switch_event_headers(ebuf, event); } -int ei_helper_send(ei_node_t *ei_node, erlang_pid *to, ei_x_buff *buf) { - int ret = 0; +int ei_helper_send(ei_node_t *ei_node, erlang_pid *to, ei_x_buff *buf) +{ + int ret = 0; - if (ei_node->nodefd) { + if (ei_node->nodefd) { #ifdef EI_DEBUG ei_x_print_msg(buf, to, 1); #endif - ret = ei_send(ei_node->nodefd, to, buf->buff, buf->index); - } + ret = ei_send(ei_node->nodefd, to, buf->buff, buf->index); + } - return ret; + return ret; } -int ei_decode_atom_safe(char *buf, int *index, char *dst) { - int type, size; +int ei_decode_atom_safe(char *buf, int *index, char *dst) +{ + int type, size; - ei_get_type(buf, index, &type, &size); + ei_get_type(buf, index, &type, &size); if (type != ERL_ATOM_EXT) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unexpected erlang term type %d (size %d), needed atom\n", type, size); - return -1; + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unexpected erlang term type %d (size %d), needed atom\n", type, size); + return -1; } else if (size > MAXATOMLEN) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Requested decoding of atom with size %d into a buffer of size %d\n", size, MAXATOMLEN); - return -1; + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Requested decoding of atom with size %d into a buffer of size %d\n", size, MAXATOMLEN); + return -1; } else { return ei_decode_atom(buf, index, dst); } } -int ei_decode_string_or_binary(char *buf, int *index, char **dst) { - int type, size, res; - long len; +int ei_decode_string_or_binary(char *buf, int *index, char **dst) +{ + int type, size, res; + long len; - ei_get_type(buf, index, &type, &size); + ei_get_type(buf, index, &type, &size); - if (type != ERL_STRING_EXT && type != ERL_BINARY_EXT && type != ERL_LIST_EXT && type != ERL_NIL_EXT) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unexpected erlang term type %d (size %d), needed binary or string\n", type, size); - return -1; - } + if (type != ERL_STRING_EXT && type != ERL_BINARY_EXT && type != ERL_LIST_EXT && type != ERL_NIL_EXT) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unexpected erlang term type %d (size %d), needed binary or string\n", type, size); + return -1; + } *dst = malloc(size + 1); @@ -468,29 +482,30 @@ int ei_decode_string_or_binary(char *buf, int *index, char **dst) { res = 0; **dst = '\0'; } else if (type == ERL_BINARY_EXT) { - res = ei_decode_binary(buf, index, *dst, &len); - (*dst)[len] = '\0'; - } else { - res = ei_decode_string(buf, index, *dst); - } + res = ei_decode_binary(buf, index, *dst, &len); + (*dst)[len] = '\0'; + } else { + res = ei_decode_string(buf, index, *dst); + } - return res; + return res; } -int ei_decode_string_or_binary_limited(char *buf, int *index, int maxsize, char *dst) { - int type, size, res; - long len; 
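/*
 * Typical call pattern for the bounded variant defined next
 * (illustrative; buf and index come from the caller's ei_x_buff):
 *
 *   char name[MAXATOMLEN + 1];
 *   if (ei_decode_string_or_binary_limited(buf, &index, MAXATOMLEN, name)) {
 *       return -1;   // wrong term type, or larger than the buffer
 *   }
 *
 * Unlike ei_decode_string_or_binary() above, this one never allocates:
 * the caller supplies dst (sized maxsize + 1 to leave room for the
 * terminating NUL) and oversized terms are rejected, not truncated.
 */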
+int ei_decode_string_or_binary_limited(char *buf, int *index, int maxsize, char *dst) +{ + int type, size, res; + long len; - ei_get_type(buf, index, &type, &size); + ei_get_type(buf, index, &type, &size); - if (type != ERL_STRING_EXT && type != ERL_BINARY_EXT && type != ERL_LIST_EXT && type != ERL_NIL_EXT) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unexpected erlang term type %d (size %d), needed binary or string\n", type, size); - return -1; - } + if (type != ERL_STRING_EXT && type != ERL_BINARY_EXT && type != ERL_LIST_EXT && type != ERL_NIL_EXT) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unexpected erlang term type %d (size %d), needed binary or string\n", type, size); + return -1; + } if (size > maxsize) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Requested decoding of %s with size %d into a buffer of size %d\n", - type == ERL_BINARY_EXT ? "binary" : "string", size, maxsize); + type == ERL_BINARY_EXT ? "binary" : "string", size, maxsize); return -1; } @@ -498,25 +513,21 @@ int ei_decode_string_or_binary_limited(char *buf, int *index, int maxsize, char res = 0; *dst = '\0'; } else if (type == ERL_BINARY_EXT) { - res = ei_decode_binary(buf, index, dst, &len); - dst[len] = '\0'; /* binaries aren't null terminated */ - } else { - res = ei_decode_string(buf, index, dst); - } + res = ei_decode_binary(buf, index, dst, &len); + dst[len] = '\0'; /* binaries aren't null terminated */ + } else { + res = ei_decode_string(buf, index, dst); + } - return res; + return res; } - -switch_status_t create_acceptor() { +switch_status_t create_acceptor() +{ switch_sockaddr_t *sa; uint16_t port; - char ipbuf[48]; - const char *ip_addr; - -#if (ERLANG_MAJOR == 10 && ERLANG_MINOR >= 3) || ERLANG_MAJOR >= 11 - ei_init(); -#endif + char ipbuf[48]; + const char *ip_addr; /* if the config has specified an erlang release compatibility then pass that along to the erlang interface */ if (kazoo_globals.ei_compat_rel) { @@ -531,7 +542,7 @@ switch_status_t create_acceptor() { switch_socket_addr_get(&sa, SWITCH_FALSE, kazoo_globals.acceptor); port = switch_sockaddr_get_port(sa); - ip_addr = switch_get_addr(ipbuf, sizeof (ipbuf), sa); + ip_addr = switch_get_addr(ipbuf, sizeof(ipbuf), sa); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Erlang connection acceptor listening on %s:%u\n", ip_addr, port); @@ -545,7 +556,7 @@ switch_status_t create_acceptor() { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Failed to publish port to epmd, trying to start epmd via system()\n"); if (system("fs_epmd -daemon")) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, - "Failed to start epmd manually! Is epmd in $PATH? If not, start it yourself or run an erl shell with -sname or -name\n"); + "Failed to start epmd manually! Is epmd in $PATH? 
If not, start it yourself or run an erl shell with -sname or -name\n"); return SWITCH_STATUS_SOCKERR; } switch_yield(100000); @@ -555,12 +566,14 @@ switch_status_t create_acceptor() { } } - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Connected to epmd and published erlang cnode name %s at port %d\n", kazoo_globals.ei_cnode.thisnodename, port); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Connected to epmd and published erlang cnode name %s at port %d\n", kazoo_globals.ei_cnode.thisnodename, + port); return SWITCH_STATUS_SUCCESS; } -switch_hash_t *create_default_filter() { +switch_hash_t *create_default_filter() +{ switch_hash_t *filter; switch_core_hash_init(&filter); @@ -882,6 +895,7 @@ static void fetch_config_filters(switch_memory_pool_t *pool) kazoo_globals.config_fetched = 1; switch_xml_free(xml); } + switch_event_destroy(¶ms); } @@ -901,28 +915,50 @@ static void fetch_config_handlers(switch_memory_pool_t *pool) kazoo_globals.config_fetched = 1; switch_xml_free(xml); } + switch_event_destroy(¶ms); } static void *SWITCH_THREAD_FUNC fetch_config_exec(switch_thread_t *thread, void *obj) { - switch_memory_pool_t *pool = (switch_memory_pool_t *)obj; - fetch_config_filters(pool); - fetch_config_handlers(pool); + switch_memory_pool_t *pool = (switch_memory_pool_t *) obj; + ei_node_t *node; + int fetch_filters = 0, fetch_handlers = 0; + + // give some time for node initialization + switch_sleep(kazoo_globals.delay_before_initial_fetch); + + for (node = kazoo_globals.ei_nodes; node != NULL; node = node->next) { + if (node->legacy ) { + fetch_filters++; + } else { + fetch_handlers++; + } + } + + if (fetch_filters) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "fetching filters for kazoo\n"); + fetch_config_filters(pool); + } + + if (fetch_handlers) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "fetching kazoo handlers\n"); + fetch_config_handlers(pool); + } kazoo_globals.config_fetched = 1; return NULL; } -void fetch_config() { +void fetch_config() +{ switch_memory_pool_t *pool; switch_thread_t *thread; switch_threadattr_t *thd_attr = NULL; switch_uuid_t uuid; - - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "fetching kazoo config\n"); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "scheduling fetch for kazoo config\n"); switch_core_new_memory_pool(&pool); @@ -935,11 +971,66 @@ void fetch_config() { } +#ifdef WITH_KAZOO_ERL_SHUTDOWN +#if (ERLANG_MAJOR == 10 && ERLANG_MINOR >= 3) || ERLANG_MAJOR >= 11 +typedef struct ei_mutex_s { +#ifdef __WIN32__ + HANDLE lock; +#elif VXWORKS + SEM_ID lock; +#else /* unix */ +#if defined(HAVE_MIT_PTHREAD_H) || defined(HAVE_PTHREAD_H) + pthread_mutex_t *lock; +#else /* ! (HAVE_MIT_PTHREAD_H || HAVE_PTHREAD_H) */ + void *dummy; /* Actually never used */ +#endif /* ! (HAVE_MIT_PTHREAD_H || HAVE_PTHREAD_H) */ +#endif /* unix */ +}ei_mutex_t; + +typedef struct ei_socket_info_s { + int socket; + ei_socket_callbacks *cbs; + void *ctx; + int dist_version; + ei_cnode cnode; /* A copy, not a pointer. 
We don't know when freed */ + char cookie[EI_MAX_COOKIE_SIZE+1]; +}ei_socket_info; + +extern ei_socket_info *ei_sockets; +extern ei_mutex_t* ei_sockets_lock; +extern int ei_n_sockets; +extern int ei_sz_sockets; + +int ei_mutex_free(ei_mutex_t *l, int nblock); + +#endif +#endif + +void kz_erl_init() +{ +#if (ERLANG_MAJOR == 10 && ERLANG_MINOR >= 3) || ERLANG_MAJOR >= 11 + ei_init(); +#endif +} + +void kz_erl_shutdown() +{ +#ifdef WITH_KAZOO_ERL_SHUTDOWN +#if (ERLANG_MAJOR == 10 && ERLANG_MINOR >= 3) || ERLANG_MAJOR >= 11 + ei_mutex_free(ei_sockets_lock, 1); + ei_sockets_lock = NULL; + free(ei_sockets); + ei_sockets = NULL; + ei_n_sockets = ei_sz_sockets = 0; +#endif +#endif +} -SWITCH_MODULE_RUNTIME_FUNCTION(mod_kazoo_runtime) { +SWITCH_MODULE_RUNTIME_FUNCTION(mod_kazoo_runtime) +{ switch_os_socket_t os_socket; - if(create_acceptor() != SWITCH_STATUS_SUCCESS) { + if (create_acceptor() != SWITCH_STATUS_SUCCESS) { // TODO: what would we need to clean up here switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to create erlang connection acceptor!\n"); close_socket(&kazoo_globals.acceptor); @@ -966,7 +1057,7 @@ SWITCH_MODULE_RUNTIME_FUNCTION(mod_kazoo_runtime) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Erlang connection acceptor socket error %d %d\n", erl_errno, errno); } else { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, - "Erlang node connection failed - ensure your cookie matches '%s' and you are using a good nodename\n", kazoo_globals.ei_cookie); + "Erlang node connection failed - ensure your cookie matches '%s' and you are using a good nodename\n", kazoo_globals.ei_cookie); } continue; } @@ -986,6 +1077,14 @@ SWITCH_MODULE_RUNTIME_FUNCTION(mod_kazoo_runtime) { return SWITCH_STATUS_TERM; } +SWITCH_DECLARE(switch_status_t) ei_queue_pop(switch_queue_t *queue, void **data, switch_interval_time_t timeout) +{ + if (timeout == 0) { + return switch_queue_trypop(queue, data); + } else { + return switch_queue_pop_timeout(queue, data, timeout); + } +} /* For Emacs: * Local Variables: * mode:c diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_endpoints.c b/src/mod/event_handlers/mod_kazoo/kazoo_endpoints.c index bec5b5e1ea3..cb2b28cb261 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_endpoints.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_endpoints.c @@ -56,20 +56,9 @@ static void kz_tweaks_variables_to_event(switch_core_session_t *session, switch_ } } -/* kazoo endpoint */ -switch_endpoint_interface_t *kz_endpoint_interface; -static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *session, - switch_event_t *var_event, - switch_caller_profile_t *outbound_profile, - switch_core_session_t **new_session, switch_memory_pool_t **pool, switch_originate_flag_t flags, - switch_call_cause_t *cancel_cause); -switch_io_routines_t kz_endpoint_io_routines = { - /*.outgoing_channel */ kz_endpoint_outgoing_channel -}; - static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *session, switch_event_t *var_event, - switch_caller_profile_t *outbound_profile, + switch_caller_profile_t *outbound_profile_in, switch_core_session_t **new_session, switch_memory_pool_t **pool, switch_originate_flag_t flags, switch_call_cause_t *cancel_cause) { @@ -97,14 +86,15 @@ static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *s char *cid_num_override = NULL; switch_event_t *event = NULL; switch_status_t status = SWITCH_STATUS_SUCCESS; + switch_caller_profile_t *outbound_profile = NULL; - if 
(zstr(outbound_profile->destination_number)) { + if (zstr(outbound_profile_in->destination_number)) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_WARNING, "NO DESTINATION NUMBER\n"); goto done; } - user = strdup(outbound_profile->destination_number); + user = strdup(outbound_profile_in->destination_number); if (!user) goto done; @@ -271,7 +261,7 @@ static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *s } else if(var_event) { const char* uuid_e_session = switch_event_get_header(var_event, "ent_originate_aleg_uuid"); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "CHECKING ORIGINATE-UUID : %s\n", uuid_e_session); - if (uuid_e_session && (e_session = switch_core_session_force_locate(uuid_e_session)) != NULL) { + if (uuid_e_session && (e_session = switch_core_session_locate(uuid_e_session)) != NULL) { a_session = e_session; switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "FOUND ORIGINATE-UUID : %s\n", uuid_e_session); } @@ -382,6 +372,32 @@ static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *s goto done; } + // + outbound_profile = outbound_profile_in; + /* + outbound_profile = switch_caller_profile_dup(outbound_profile_in->pool, outbound_profile_in); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(*new_session), SWITCH_LOG_DEBUG1, "CHECKING CALLER-ID\n"); + if ((x_params = switch_xml_child(x_user, "profile-variables"))) { + const char* val = NULL; + outbound_profile->soft = NULL; + for (x_param = switch_xml_child(x_params, "variable"); x_param; x_param = x_param->next) { + const char *pvar = switch_xml_attr(x_param, "name"); + const char *val = switch_xml_attr(x_param, "value"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(*new_session), SWITCH_LOG_DEBUG1, "setting profile var %s = %s\n", pvar, val); + switch_caller_profile_set_var(outbound_profile, pvar, val); + } + // * TODO * verify onnet/offnet + if((val=switch_caller_get_field_by_name(outbound_profile, "Endpoint-Caller-ID-Name"))) { + outbound_profile->callee_id_name = val; + outbound_profile->orig_caller_id_name = val; + } + if((val=switch_caller_get_field_by_name(outbound_profile, "Endpoint-Caller-ID-Number"))) { + outbound_profile->callee_id_number = val; + outbound_profile->orig_caller_id_number = val; + } + } + */ + status = switch_ivr_originate(session, new_session, &cause, d_dest, timelimit, NULL, cid_name_override, cid_num_override, outbound_profile, var_event, myflags, cancel_cause, NULL); @@ -409,17 +425,20 @@ static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *s } } + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(*new_session), SWITCH_LOG_DEBUG1, "CHECKING CALLER-ID\n"); if ((x_params = switch_xml_child(x_user, "profile-variables"))) { - switch_caller_profile_t *cp = NULL; + switch_caller_profile_t *cp = switch_channel_get_caller_profile(new_channel); const char* val = NULL; + cp->soft = NULL; for (x_param = switch_xml_child(x_params, "variable"); x_param; x_param = x_param->next) { const char *pvar = switch_xml_attr(x_param, "name"); const char *val = switch_xml_attr(x_param, "value"); switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(*new_session), SWITCH_LOG_DEBUG1, "setting profile var %s = %s\n", pvar, val); switch_channel_set_profile_var(new_channel, pvar, val); + //switch_caller_profile_set_var(cp, pvar, val); } - cp = switch_channel_get_caller_profile(new_channel); + // * TODO * verify onnet/offnet if((val=switch_caller_get_field_by_name(cp, "Endpoint-Caller-ID-Name"))) { cp->callee_id_name = val; 
cp->orig_caller_id_name = val; @@ -466,7 +485,15 @@ static switch_call_cause_t kz_endpoint_outgoing_channel(switch_core_session_t *s } +/* kazoo endpoint */ + + +switch_io_routines_t kz_endpoint_io_routines = { + /*.outgoing_channel */ kz_endpoint_outgoing_channel +}; + void add_kz_endpoints(switch_loadable_module_interface_t **module_interface) { + switch_endpoint_interface_t *kz_endpoint_interface; kz_endpoint_interface = (switch_endpoint_interface_t *) switch_loadable_module_create_interface(*module_interface, SWITCH_ENDPOINT_INTERFACE); kz_endpoint_interface->interface_name = "kz"; kz_endpoint_interface->io_routines = &kz_endpoint_io_routines; diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_event_stream.c b/src/mod/event_handlers/mod_kazoo/kazoo_event_stream.c index 5e5fb8dff4a..3d1f18326b2 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_event_stream.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_event_stream.c @@ -202,14 +202,17 @@ static void event_handler(switch_event_t *event) { ei_x_encode_version(ebuf); switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Target-Node", event_binding->stream->node->peer_nodename); + switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Switch-Nodename", kazoo_globals.ei_cnode.thisnodename); if(event_stream->node->legacy) { - switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Switch-Nodename", kazoo_globals.ei_cnode.thisnodename); res = encode_event_old(event, ebuf); } else { res = encode_event_new(event, ebuf); } + switch_event_del_header(event, "Switch-Nodename"); + switch_event_del_header(event, "Target-Node"); + if(!res) { ei_x_free(ebuf); switch_safe_free(ebuf); @@ -239,6 +242,7 @@ static void *SWITCH_THREAD_FUNC event_stream_loop(switch_thread_t *thread, void const char *ip_addr; void *pop; short event_stream_framing; + short event_stream_keepalive; short ok = 1; switch_atomic_inc(&kazoo_globals.threads); @@ -246,6 +250,7 @@ static void *SWITCH_THREAD_FUNC event_stream_loop(switch_thread_t *thread, void switch_assert(event_stream != NULL); event_stream_framing = event_stream->event_stream_framing; + event_stream_keepalive = event_stream->event_stream_keepalive; /* figure out what socket we just opened */ switch_socket_addr_get(&sa, SWITCH_FALSE, event_stream->acceptor); @@ -274,6 +279,12 @@ static void *SWITCH_THREAD_FUNC event_stream_loop(switch_thread_t *thread, void switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Couldn't set socket as non-blocking\n"); } + if (event_stream_keepalive) { + if (switch_socket_opt_set(newsocket, SWITCH_SO_KEEPALIVE, TRUE)) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Couldn't set socket keep-alive\n"); + } + } + if (switch_socket_opt_set(newsocket, SWITCH_SO_TCP_NODELAY, 1)) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Couldn't disable Nagle.\n"); } @@ -294,6 +305,8 @@ static void *SWITCH_THREAD_FUNC event_stream_loop(switch_thread_t *thread, void switch_get_addr(event_stream->local_ip, sizeof (event_stream->local_ip), sa); event_stream->connected = SWITCH_TRUE; + event_stream->connected_time = switch_micro_time_now(); + switch_mutex_unlock(event_stream->socket_mutex); switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Erlang event stream %p client %s:%u\n", (void *)event_stream, event_stream->remote_ip, event_stream->remote_port); @@ -302,7 +315,7 @@ static void *SWITCH_THREAD_FUNC event_stream_loop(switch_thread_t *thread, void } /* if there was an event waiting in our queue send it to the client */ - if 
(switch_queue_pop_timeout(event_stream->queue, &pop, 200000) == SWITCH_STATUS_SUCCESS) { + if (ei_queue_pop(event_stream->queue, &pop, event_stream->queue_timeout) == SWITCH_STATUS_SUCCESS) { ei_x_buff *ebuf = (ei_x_buff *) pop; if (event_stream->socket) { @@ -391,7 +404,7 @@ ei_event_stream_t *new_event_stream(ei_node_t *ei_node, const erlang_pid *from) /* from the memory pool, allocate the event stream structure */ if (!(event_stream = switch_core_alloc(pool, sizeof (*event_stream)))) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Out of memory: I may have Alzheimers but at least I dont have Alzheimers.\n"); - return NULL; + goto cleanup; } /* prepare the event stream */ @@ -401,34 +414,32 @@ ei_event_stream_t *new_event_stream(ei_node_t *ei_node, const erlang_pid *from) event_stream->connected = SWITCH_FALSE; event_stream->node = ei_node; event_stream->event_stream_framing = ei_node->event_stream_framing; + event_stream->event_stream_keepalive = ei_node->event_stream_keepalive; + event_stream->queue_timeout = ei_node->event_stream_queue_timeout; memcpy(&event_stream->pid, from, sizeof(erlang_pid)); switch_queue_create(&event_stream->queue, MAX_QUEUE_LEN, pool); /* create a socket for accepting the event stream client */ if (!(event_stream->acceptor = create_socket(pool))) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Like car accidents, most hardware problems are due to driver error.\n"); - /* TODO: clean up */ - return NULL; + goto cleanup; } if (switch_socket_opt_set(event_stream->acceptor, SWITCH_SO_NONBLOCK, TRUE)) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Hey, it compiles!\n"); - /* TODO: clean up */ - return NULL; + goto cleanup; } /* create a pollset so we can efficiently check for new client connections */ if (switch_pollset_create(&event_stream->pollset, 1000, pool, 0) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "My software never has bugs. 
It just develops random features.\n"); - /* TODO: clean up */ - return NULL; + goto cleanup; } switch_socket_create_pollfd(&event_stream->pollfd, event_stream->acceptor, SWITCH_POLLIN | SWITCH_POLLERR, NULL, pool); if (switch_pollset_add(event_stream->pollset, event_stream->pollfd) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "If you saw a heat wave, would you wave back?\n"); - /* TODO: clean up */ - return NULL; + goto cleanup; } switch_mutex_init(&event_stream->socket_mutex, SWITCH_MUTEX_DEFAULT, pool); @@ -453,6 +464,26 @@ ei_event_stream_t *new_event_stream(ei_node_t *ei_node, const erlang_pid *from) switch_thread_create(&thread, thd_attr, event_stream_loop, event_stream, event_stream->pool); return event_stream; + +cleanup: + + if (event_stream) { + /* remove the acceptor pollset */ + if (event_stream->pollset) { + switch_pollset_remove(event_stream->pollset, event_stream->pollfd); + } + + /* close any open sockets */ + if (event_stream->acceptor) { + close_socket(&event_stream->acceptor); + } + } + + /* clean up the memory */ + switch_core_destroy_memory_pool(&pool); + + return NULL; + } unsigned long get_stream_port(const ei_event_stream_t *event_stream) { diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_fetch_agent.c b/src/mod/event_handlers/mod_kazoo/kazoo_fetch_agent.c index 555fe942bfd..5023e393440 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_fetch_agent.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_fetch_agent.c @@ -147,6 +147,14 @@ static switch_xml_t fetch_handler(const char *section, const char *tag_name, con return xml; } + /* no profile, no work required */ + if (!profile) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "weird case where client is available but there's no profile for %s. 
try reloading mod_kazoo.\n" + ,section); + switch_thread_rwlock_unlock(agent->lock); + return xml; + } + if(event == NULL) { if (switch_event_create(&event, SWITCH_EVENT_GENERAL) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "error creating event for fetch handler\n"); @@ -154,6 +162,8 @@ static switch_xml_t fetch_handler(const char *section, const char *tag_name, con } } + switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "Switch-Nodename", kazoo_globals.ei_cnode.thisnodename); + /* prepare the reply collector */ switch_uuid_get(&uuid); switch_uuid_format(reply.uuid_str, &uuid); @@ -165,7 +175,7 @@ static switch_xml_t fetch_handler(const char *section, const char *tag_name, con for(i = 0; fetch_uuid_sources[i] != NULL; i++) { if((fetch_call_id = switch_event_get_header(event, fetch_uuid_sources[i])) != NULL) { switch_core_session_t *session = NULL; - if((session = switch_core_session_force_locate(fetch_call_id)) != NULL) { + if((session = switch_core_session_locate(fetch_call_id)) != NULL) { switch_channel_t *channel = switch_core_session_get_channel(session); uint32_t verbose = switch_channel_test_flag(channel, CF_VERBOSE_EVENTS); switch_channel_set_flag(channel, CF_VERBOSE_EVENTS); @@ -309,8 +319,9 @@ static switch_xml_t fetch_handler(const char *section, const char *tag_name, con ,reply.uuid_str ,(unsigned int) (switch_micro_time_now() - now) / 1000 ,reply.xml_str); - - xml = switch_xml_parse_str_dynamic(reply.xml_str, SWITCH_FALSE); + if ((xml = switch_xml_parse_str_dynamic(reply.xml_str, SWITCH_FALSE)) == NULL) { + switch_safe_free(reply.xml_str); + } } else { /* facepalm */ switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Request for %s XML (%s) timed-out after %dms\n" @@ -327,6 +338,8 @@ void bind_fetch_profile(ei_xml_agent_t *agent, kazoo_config_ptr fetch_handlers) switch_hash_index_t *hi; kazoo_fetch_profile_ptr val = NULL, ptr = NULL; + if (!fetch_handlers) return; + for (hi = switch_core_hash_first(fetch_handlers->hash); hi; hi = switch_core_hash_next(&hi)) { switch_core_hash_this(hi, NULL, NULL, (void**) &val); if (val && val->section == agent->section) { @@ -335,6 +348,7 @@ void bind_fetch_profile(ei_xml_agent_t *agent, kazoo_config_ptr fetch_handlers) } } agent->profile = ptr; + switch_safe_free(hi); } void rebind_fetch_profiles(kazoo_config_ptr fetch_handlers) diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_fields.h b/src/mod/event_handlers/mod_kazoo/kazoo_fields.h index 6c8111bfe44..8d17378768e 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_fields.h +++ b/src/mod/event_handlers/mod_kazoo/kazoo_fields.h @@ -55,6 +55,10 @@ struct kazoo_log_levels switch_log_level_t time_log_level; switch_log_level_t filtered_event_log_level; switch_log_level_t filtered_field_log_level; + switch_log_level_t trace_log_level; + switch_log_level_t debug_log_level; + switch_log_level_t error_log_level; + switch_log_level_t hashing_log_level; }; @@ -138,6 +142,7 @@ struct kazoo_field_t { switch_bool_t exclude_prefix; kazoo_field_type in_type; kazoo_json_field_type out_type; + int out_type_as_array; kazoo_filter_ptr filter; kazoo_definition_ptr ref; @@ -162,6 +167,7 @@ struct kazoo_event { char *name; kazoo_fields_ptr fields; kazoo_filter_ptr filter; + kazoo_loglevels_ptr logging; kazoo_event_t* next; }; diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_message.c b/src/mod/event_handlers/mod_kazoo/kazoo_message.c index a35dfab080f..e43d656b4ce 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_message.c +++ 
diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_fields.h b/src/mod/event_handlers/mod_kazoo/kazoo_fields.h
index 6c8111bfe44..8d17378768e 100644
--- a/src/mod/event_handlers/mod_kazoo/kazoo_fields.h
+++ b/src/mod/event_handlers/mod_kazoo/kazoo_fields.h
@@ -55,6 +55,10 @@ struct kazoo_log_levels
 	switch_log_level_t time_log_level;
 	switch_log_level_t filtered_event_log_level;
 	switch_log_level_t filtered_field_log_level;
+	switch_log_level_t trace_log_level;
+	switch_log_level_t debug_log_level;
+	switch_log_level_t error_log_level;
+	switch_log_level_t hashing_log_level;
 
 };
 
@@ -138,6 +142,7 @@ struct kazoo_field_t {
 	switch_bool_t exclude_prefix;
 	kazoo_field_type in_type;
 	kazoo_json_field_type out_type;
+	int out_type_as_array;
 	kazoo_filter_ptr filter;
 
 	kazoo_definition_ptr ref;
@@ -162,6 +167,7 @@ struct kazoo_event {
 	char *name;
 	kazoo_fields_ptr fields;
 	kazoo_filter_ptr filter;
+	kazoo_loglevels_ptr logging;
 	kazoo_event_t* next;
 };
"true" : "false"); break; case FILTER_COMPARE_PREFIX: @@ -112,10 +115,10 @@ static int inline filter_compare(switch_event_t* evt, kazoo_filter_ptr filter) return hasValue; } -static kazoo_filter_ptr inline filter_event(switch_event_t* evt, kazoo_filter_ptr filter) +static kazoo_filter_ptr inline filter_event(switch_event_t* evt, kazoo_filter_ptr filter, kazoo_logging_ptr logging) { while(filter) { - int hasValue = filter_compare(evt, filter); + int hasValue = filter_compare(evt, filter, logging); if(filter->type == FILTER_EXCLUDE) { if(hasValue) break; @@ -132,31 +135,37 @@ static void kazoo_event_init_json_fields(switch_event_t *event, cJSON *json) { switch_event_header_t *hp; for (hp = event->headers; hp; hp = hp->next) { - if (hp->idx) { - cJSON *a = cJSON_CreateArray(); - int i; + if (strncmp(hp->name, "_json_", 6)) { + if (hp->idx) { + cJSON *a = cJSON_CreateArray(); + int i; - for(i = 0; i < hp->idx; i++) { - cJSON_AddItemToArray(a, cJSON_CreateString(hp->array[i])); - } + for(i = 0; i < hp->idx; i++) { + cJSON_AddItemToArray(a, cJSON_CreateString(hp->array[i])); + } - cJSON_AddItemToObject(json, hp->name, a); + cJSON_AddItemToObject(json, hp->name, a); - } else { - cJSON_AddItemToObject(json, hp->name, cJSON_CreateString(hp->value)); + } else { + cJSON_AddItemToObject(json, hp->name, cJSON_CreateString(hp->value)); + } } } } static switch_status_t kazoo_event_init_json(kazoo_fields_ptr fields1, kazoo_fields_ptr fields2, switch_event_t* evt, cJSON** clone) { - switch_status_t status; + switch_status_t status = SWITCH_STATUS_SUCCESS; if( (fields2 && fields2->verbose) || (fields1 && fields1->verbose) || ( (!fields2) && (!fields1)) ) { - status = switch_event_serialize_json_obj(evt, clone); + *clone = cJSON_CreateObject(); + if((*clone) == NULL) { + status = SWITCH_STATUS_GENERR; + } else { + kazoo_event_init_json_fields(evt, *clone); + } } else { - status = SWITCH_STATUS_SUCCESS; *clone = cJSON_CreateObject(); if((*clone) == NULL) { status = SWITCH_STATUS_GENERR; @@ -185,7 +194,11 @@ static cJSON * kazoo_event_json_value(kazoo_json_field_type type, const char *va break; case JSON_RAW: - item = cJSON_CreateRaw(value); + item = cJSON_Parse(value); + if (!item) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "parse from raw error!\n"); + item = cJSON_CreateRaw(value); + } break; default: @@ -210,18 +223,24 @@ cJSON * kazoo_event_add_field_to_json(cJSON *dst, switch_event_t *src, kazoo_fie { switch_event_header_t *header; char *expanded; - uint i, n; + int i, n; cJSON *item = NULL; switch(field->in_type) { case FIELD_COPY: - if((header = switch_event_get_header_ptr(src, field->name)) != NULL) { + if (!strcmp(field->name, "_body")) { + item = kazoo_event_add_json_value(dst, field, field->as ? field->as : field->name, src->body); + } else if((header = switch_event_get_header_ptr(src, field->name)) != NULL) { if (header->idx) { item = cJSON_CreateArray(); for(i = 0; i < header->idx; i++) { cJSON_AddItemToArray(item, kazoo_event_json_value(field->out_type, header->array[i])); } kazoo_cJSON_AddItemToObject(dst, field->as ? field->as : field->name, item); + } else if (field->out_type_as_array) { + item = cJSON_CreateArray(); + cJSON_AddItemToArray(item, kazoo_event_json_value(field->out_type, header->value)); + kazoo_cJSON_AddItemToObject(dst, field->as ? field->as : field->name, item); } else { item = kazoo_event_add_json_value(dst, field, field->as ? 
@@ -185,7 +194,11 @@ static cJSON * kazoo_event_json_value(kazoo_json_field_type type, const char *va
 		break;
 
 	case JSON_RAW:
-		item = cJSON_CreateRaw(value);
+		item = cJSON_Parse(value);
+		if (!item) {
+			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "parse from raw error!\n");
+			item = cJSON_CreateRaw(value);
+		}
 		break;
 
 	default:
@@ -210,18 +223,24 @@ cJSON * kazoo_event_add_field_to_json(cJSON *dst, switch_event_t *src, kazoo_fie
 {
 	switch_event_header_t *header;
 	char *expanded;
-	uint i, n;
+	int i, n;
 	cJSON *item = NULL;
 
 	switch(field->in_type) {
 	case FIELD_COPY:
-		if((header = switch_event_get_header_ptr(src, field->name)) != NULL) {
+		if (!strcmp(field->name, "_body")) {
+			item = kazoo_event_add_json_value(dst, field, field->as ? field->as : field->name, src->body);
+		} else if((header = switch_event_get_header_ptr(src, field->name)) != NULL) {
 			if (header->idx) {
 				item = cJSON_CreateArray();
 				for(i = 0; i < header->idx; i++) {
 					cJSON_AddItemToArray(item, kazoo_event_json_value(field->out_type, header->array[i]));
 				}
 				kazoo_cJSON_AddItemToObject(dst, field->as ? field->as : field->name, item);
+			} else if (field->out_type_as_array) {
+				item = cJSON_CreateArray();
+				cJSON_AddItemToArray(item, kazoo_event_json_value(field->out_type, header->value));
+				kazoo_cJSON_AddItemToObject(dst, field->as ? field->as : field->name, item);
 			} else {
 				item = kazoo_event_add_json_value(dst, field, field->as ? field->as : field->name, header->value);
 			}
@@ -299,7 +318,7 @@ static switch_status_t kazoo_event_add_fields_to_json(kazoo_logging_ptr logging,
 	while(field) {
 		if(field->in_type == FIELD_REFERENCE) {
 			if(field->ref) {
-				if((filter = filter_event(src, field->ref->filter)) != NULL) {
+				if((filter = filter_event(src, field->ref->filter, logging)) != NULL) {
 					switch_log_printf(SWITCH_CHANNEL_LOG, logging->levels->filtered_field_log_level, "profile[%s] event %s, referenced field %s filtered by settings %s : %s\n", logging->profile_name, logging->event_name, field->ref->name, filter->name, filter->value);
 				} else {
 					kazoo_event_add_fields_to_json(logging, dst, src, field->ref->head);
@@ -308,7 +327,7 @@ static switch_status_t kazoo_event_add_fields_to_json(kazoo_logging_ptr logging,
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "profile[%s] event %s, referenced field %s not found\n", logging->profile_name, logging->event_name, field->name);
 			}
 		} else {
-			if((filter = filter_event(src, field->filter)) != NULL) {
+			if((filter = filter_event(src, field->filter, logging)) != NULL) {
 				switch_log_printf(SWITCH_CHANNEL_LOG, logging->levels->filtered_field_log_level, "profile[%s] event %s, field %s filtered by settings %s : %s\n", logging->profile_name, logging->event_name, field->name, filter->name, filter->value);
 			} else {
 				item = kazoo_event_add_field_to_json(dst, src, field);
@@ -351,12 +370,9 @@ kazoo_message_ptr kazoo_message_create_event(switch_event_t* evt, kazoo_event_pt
 	kazoo_logging_t logging;
 
 	logging.levels = profile->logging;
-	logging.event_name = switch_event_get_header_nil(evt, "Event-Name");
+	logging.event_name = evt->subclass_name ? evt->subclass_name : switch_event_get_header_nil(evt, "Event-Name");
 	logging.profile_name = profile->name;
 
-	switch_event_add_header_string(evt, SWITCH_STACK_BOTTOM, "Switch-Nodename", kazoo_globals.ei_cnode.thisnodename);
-
-
 	message = malloc(sizeof(kazoo_message_t));
 	if(message == NULL) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "error allocating memory for serializing event to json\n");
@@ -366,15 +382,19 @@ kazoo_message_ptr kazoo_message_create_event(switch_event_t* evt, kazoo_event_pt
 
 	if(profile->filter) {
 		// filtering
-		if((filtered = filter_event(evt, profile->filter)) != NULL) {
+		if((filtered = filter_event(evt, profile->filter, &logging)) != NULL) {
 			switch_log_printf(SWITCH_CHANNEL_LOG, logging.levels->filtered_event_log_level, "profile[%s] event %s filtered by profile settings %s : %s\n", logging.profile_name, logging.event_name, filtered->name, filtered->value);
 			kazoo_message_destroy(&message);
 			return NULL;
 		}
 	}
 
-	if(event && event->filter) {
-		if((filtered = filter_event(evt, event->filter)) != NULL) {
+	if (event->logging) {
+		logging.levels = event->logging;
+	}
+
+	if (event && event->filter) {
+		if((filtered = filter_event(evt, event->filter, &logging)) != NULL) {
 			switch_log_printf(SWITCH_CHANNEL_LOG, logging.levels->filtered_event_log_level, "profile[%s] event %s filtered by event settings %s : %s\n", logging.profile_name, logging.event_name, filtered->name, filtered->value);
 			kazoo_message_destroy(&message);
 			return NULL;
@@ -409,8 +429,6 @@ kazoo_message_ptr kazoo_message_create_fetch(switch_event_t* evt, kazoo_fetch_pr
 	logging.event_name = switch_event_get_header_nil(evt, "Event-Name");
 	logging.profile_name = profile->name;
 
-	switch_event_add_header_string(evt, SWITCH_STACK_BOTTOM, "Switch-Nodename", kazoo_globals.ei_cnode.thisnodename);
-
 	message = malloc(sizeof(kazoo_message_t));
 	if(message == NULL) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "error allocating memory for serializing event to json\n");
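The JSON_RAW hunk above now tries cJSON_Parse() first and only falls back to cJSON_CreateRaw() when the value is not valid JSON, so well-formed payloads are embedded as real JSON instead of opaque raw text. A standalone sketch of that parse-first fallback using plain cJSON; the field names and values are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include "cJSON.h"

static void add_raw_field(cJSON *dst, const char *name, const char *value)
{
    cJSON *item = cJSON_Parse(value);  /* try to treat value as JSON */

    if (!item) {
        item = cJSON_CreateRaw(value); /* fall back to verbatim text */
    }
    cJSON_AddItemToObject(dst, name, item);
}

int main(void)
{
    cJSON *obj = cJSON_CreateObject();
    char *out;

    add_raw_field(obj, "good", "{\"a\":1}"); /* parsed and embedded as an object */
    add_raw_field(obj, "bad", "{oops");      /* kept as raw text */

    out = cJSON_Print(obj);
    printf("%s\n", out);
    free(out);
    cJSON_Delete(obj);
    return 0;
}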
diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_node.c b/src/mod/event_handlers/mod_kazoo/kazoo_node.c
index a92fe49aa23..fae09bee2fb 100644
--- a/src/mod/event_handlers/mod_kazoo/kazoo_node.c
+++ b/src/mod/event_handlers/mod_kazoo/kazoo_node.c
@@ -106,6 +106,7 @@ static switch_status_t find_request(char *atom, int *request) {
 static void destroy_node_handler(ei_node_t *ei_node) {
 	int pending = 0;
 	void *pop;
+	switch_memory_pool_t *pool = ei_node->pool;
 
 	switch_clear_flag(ei_node, LFLAG_RUNNING);
 
@@ -145,7 +146,7 @@ static void destroy_node_handler(ei_node_t *ei_node) {
 
 	switch_mutex_destroy(ei_node->event_streams_mutex);
 
-	switch_core_destroy_memory_pool(&ei_node->pool);
+	switch_core_destroy_memory_pool(&pool);
 }
 
 static switch_status_t add_to_ei_nodes(ei_node_t *this_ei_node) {
@@ -207,15 +208,13 @@ SWITCH_DECLARE(switch_status_t) kazoo_api_execute(const char *cmd, const char *a
 	char *arg_used;
 	char *cmd_used;
 	int fire_event = 0;
-	char *arg_expanded;
+	char *arg_expanded = NULL;
 	switch_event_t* evt;
 
 	switch_assert(stream != NULL);
 	switch_assert(stream->data != NULL);
 	switch_assert(stream->write_function != NULL);
 
-	arg_expanded = (char *) arg;
-
 	switch_event_create(&evt, SWITCH_EVENT_GENERAL);
 	arg_expanded = switch_event_expand_headers(evt, arg);
 	switch_event_destroy(&evt);
@@ -319,15 +318,15 @@ static void *SWITCH_THREAD_FUNC bgapi3_exec(switch_thread_t *thread, void *obj)
 	ei_node_t *ei_node = acs->ei_node;
 	ei_send_msg_t *send_msg;
 
-	switch_malloc(send_msg, sizeof(*send_msg));
-	memcpy(&send_msg->pid, &acs->pid, sizeof(erlang_pid));
-
 	if(!switch_test_flag(ei_node, LFLAG_RUNNING) || !switch_test_flag(&kazoo_globals, LFLAG_RUNNING)) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Ignoring command while shuting down\n");
 		switch_atomic_dec(&ei_node->pending_bgapi);
 		return NULL;
 	}
 
+	switch_malloc(send_msg, sizeof(*send_msg));
+	memcpy(&send_msg->pid, &acs->pid, sizeof(erlang_pid));
+
 	ei_x_new_with_version(&send_msg->buf);
 	ei_x_encode_tuple_header(&send_msg->buf, 3);
 
@@ -464,6 +463,7 @@ static void log_sendmsg_request(char *uuid, switch_event_t *event)
 			}
 
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "log|%s|building xferext extension: %s %s\n", uuid, app_name, app_arg);
+			switch_safe_free(app_name);
 		}
 	}
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "log|%s|transfered call to xferext extension\n", uuid);
@@ -505,7 +505,7 @@ static switch_status_t build_event(switch_event_t *event, ei_x_buff * buf) {
 		if(!strcasecmp(key, "Call-ID")) {
 			switch_core_session_t *session = NULL;
 			if(!zstr(value)) {
-				if ((session = switch_core_session_force_locate(value)) != NULL) {
+				if ((session = switch_core_session_locate(value)) != NULL) {
 					switch_channel_t *channel = switch_core_session_get_channel(session);
 					switch_channel_event_set_data(channel, event);
 					switch_core_session_rwunlock(session);
@@ -692,20 +692,22 @@ static switch_status_t handle_request_command(ei_node_t *ei_node, erlang_pid *pi
 		return erlang_response_badarg(rbuf);
 	}
 
+	if (zstr_buf(uuid_str) || !(session = switch_core_session_locate(uuid_str))) {
+		return erlang_response_baduuid(rbuf);
+	}
+
 	switch_uuid_get(&cmd_uuid);
 	switch_uuid_format(cmd_uuid_str, &cmd_uuid);
 
 	switch_event_create(&event, SWITCH_EVENT_COMMAND);
 	if (build_event(event, buf) != SWITCH_STATUS_SUCCESS) {
+		switch_core_session_rwunlock(session);
 		return erlang_response_badarg(rbuf);
 	}
 
 	log_sendmsg_request(uuid_str, event);
 
 	switch_event_add_header_string(event, SWITCH_STACK_BOTTOM, "event-uuid", cmd_uuid_str);
 
-	if (zstr_buf(uuid_str) || !(session = switch_core_session_locate(uuid_str))) {
-		return erlang_response_baduuid(rbuf);
-	}
 	switch_core_session_queue_private_event(session, &event, SWITCH_FALSE);
 
 	switch_core_session_rwunlock(session);
@@ -767,16 +769,18 @@ static switch_status_t handle_request_sendmsg(ei_node_t *ei_node, erlang_pid *pi
 		return erlang_response_badarg(rbuf);
 	}
 
+	if (zstr_buf(uuid_str) || !(session = switch_core_session_locate(uuid_str))) {
+		return erlang_response_baduuid(rbuf);
+	}
+
 	switch_event_create(&event, SWITCH_EVENT_SEND_MESSAGE);
 	if (build_event(event, buf) != SWITCH_STATUS_SUCCESS) {
+		switch_core_session_rwunlock(session);
 		return erlang_response_badarg(rbuf);
 	}
 
 	log_sendmsg_request(uuid_str, event);
 
-	if (zstr_buf(uuid_str) || !(session = switch_core_session_locate(uuid_str))) {
-		return erlang_response_baduuid(rbuf);
-	}
 	switch_core_session_queue_private_event(session, &event, SWITCH_FALSE);
 
 	switch_core_session_rwunlock(session);
@@ -903,25 +907,25 @@ static switch_status_t handle_request_bgapi4(ei_node_t *ei_node, erlang_pid *pid
 }
 
 static switch_status_t handle_request_api4(ei_node_t *ei_node, erlang_pid *pid, ei_x_buff *buf, ei_x_buff *rbuf) {
-    char cmd[MAXATOMLEN + 1];
-    char *arg;
+	char cmd[MAXATOMLEN + 1];
+	char *arg;
 	switch_stream_handle_t stream = { 0 };
 	SWITCH_STANDARD_STREAM(stream);
 	switch_event_create(&stream.param_event, SWITCH_EVENT_API);
 
-    if (ei_decode_atom_safe(buf->buff, &buf->index, cmd)) {
-        return erlang_response_badarg(rbuf);
-    }
+	if (ei_decode_atom_safe(buf->buff, &buf->index, cmd)) {
+		return erlang_response_badarg(rbuf);
+	}
 
-    if (ei_decode_string_or_binary(buf->buff, &buf->index, &arg)) {
-        return erlang_response_badarg(rbuf);
-    }
+	if (ei_decode_string_or_binary(buf->buff, &buf->index, &arg)) {
+		return erlang_response_badarg(rbuf);
+	}
 
-    switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "exec: %s(%s)\n", cmd, arg);
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "exec: %s(%s)\n", cmd, arg);
 
-    if (rbuf) {
-        char *reply;
+	if (rbuf) {
+		char *reply;
 		switch_status_t status;
 
 		status = api_exec_stream(cmd, arg, &stream, &reply);
@@ -940,17 +944,17 @@ static switch_status_t handle_request_api4(ei_node_t *ei_node, erlang_pid *pid,
 			ei_encode_switch_event_headers(rbuf, stream.param_event);
 		}
 
-        switch_safe_free(reply);
-    }
+		switch_safe_free(reply);
+	}
 
 	if (stream.param_event) {
 		switch_event_fire(&stream.param_event);
 	}
 
-    switch_safe_free(arg);
+	switch_safe_free(arg);
 	switch_safe_free(stream.data);
 
-    return SWITCH_STATUS_SUCCESS;
+	return SWITCH_STATUS_SUCCESS;
 }
 
 static switch_status_t handle_request_json_api(ei_node_t *ei_node, erlang_pid *pid, ei_x_buff *buf, ei_x_buff *rbuf)
SWITCH_LOG_DEBUG, "Received net_kernel message tuple does not begin with the atom '$gen_call'\n"); - return SWITCH_STATUS_GENERR; + goto error; } ei_get_type(buf->buff, &buf->index, &type, &size); @@ -1330,7 +1339,7 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg /* {_, Sender, _}=Buff, is_tuple(Sender) */ if (type != ERL_SMALL_TUPLE_EXT) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Second element of the net_kernel tuple is an unexpected type\n"); - return SWITCH_STATUS_GENERR; + goto error; } ei_decode_tuple_header(buf->buff, &buf->index, &arity); @@ -1338,13 +1347,13 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg /* {_, _}=Sender */ if (arity != 2) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Second element of the net_kernel message has an unexpected arity\n"); - return SWITCH_STATUS_GENERR; + goto error; } /* {Pid, Ref}=Sender */ if (ei_decode_pid(buf->buff, &buf->index, &send_msg->pid) || ei_decode_ref(buf->buff, &buf->index, &ref)) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Unable to decode erlang pid or ref of the net_kernel tuple second element\n"); - return SWITCH_STATUS_GENERR; + goto error; } ei_get_type(buf->buff, &buf->index, &type, &size); @@ -1352,7 +1361,7 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg /* {_, _, Request}=Buff, is_tuple(Request) */ if (type != ERL_SMALL_TUPLE_EXT) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Third element of the net_kernel message is an unexpected type\n"); - return SWITCH_STATUS_GENERR; + goto error; } ei_decode_tuple_header(buf->buff, &buf->index, &arity); @@ -1360,27 +1369,31 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg /* {_, _}=Request */ if (arity != 2) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Third element of the net_kernel message has an unexpected arity\n"); - return SWITCH_STATUS_GENERR; + goto error; } /* {is_auth, _}=Request */ if (ei_decode_atom_safe(buf->buff, &buf->index, atom) || strncmp(atom, "is_auth", MAXATOMLEN)) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "The net_kernel message third element does not begin with the atom 'is_auth'\n"); - return SWITCH_STATUS_GENERR; + goto error; } /* To ! 
@@ -1292,15 +1302,14 @@ static switch_status_t handle_mod_kazoo_request(ei_node_t *ei_node, erlang_msg *
 static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg *msg, ei_x_buff *buf) {
 	int version, size, type, arity;
 	char atom[MAXATOMLEN + 1];
-	ei_send_msg_t *send_msg;
+	ei_send_msg_t *send_msg = NULL;
 	erlang_ref ref;
 
-	switch_malloc(send_msg, sizeof(*send_msg));
-
-	ei_x_new(&send_msg->buf);
-
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Received net_kernel message, attempting to reply\n");
 
+	switch_malloc(send_msg, sizeof(*send_msg));
+	ei_x_new_with_version(&send_msg->buf);
+
 	buf->index = 0;
 	ei_decode_version(buf->buff, &buf->index, &version);
 	ei_get_type(buf->buff, &buf->index, &type, &size);
@@ -1308,7 +1317,7 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg
 	/* is_tuple(Buff) */
 	if (type != ERL_SMALL_TUPLE_EXT) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Received net_kernel message of an unexpected type\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	ei_decode_tuple_header(buf->buff, &buf->index, &arity);
@@ -1316,13 +1325,13 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg
 	/* {_, _, _} = Buf */
 	if (arity != 3) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Received net_kernel tuple has an unexpected arity\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	/* {'$gen_call', _, _} = Buf */
 	if (ei_decode_atom_safe(buf->buff, &buf->index, atom) || strncmp(atom, "$gen_call", 9)) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Received net_kernel message tuple does not begin with the atom '$gen_call'\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	ei_get_type(buf->buff, &buf->index, &type, &size);
@@ -1330,7 +1339,7 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg
 	/* {_, Sender, _}=Buff, is_tuple(Sender) */
 	if (type != ERL_SMALL_TUPLE_EXT) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Second element of the net_kernel tuple is an unexpected type\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	ei_decode_tuple_header(buf->buff, &buf->index, &arity);
@@ -1338,13 +1347,13 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg
 	/* {_, _}=Sender */
 	if (arity != 2) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Second element of the net_kernel message has an unexpected arity\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	/* {Pid, Ref}=Sender */
 	if (ei_decode_pid(buf->buff, &buf->index, &send_msg->pid) || ei_decode_ref(buf->buff, &buf->index, &ref)) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Unable to decode erlang pid or ref of the net_kernel tuple second element\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	ei_get_type(buf->buff, &buf->index, &type, &size);
@@ -1352,7 +1361,7 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg
 	/* {_, _, Request}=Buff, is_tuple(Request) */
 	if (type != ERL_SMALL_TUPLE_EXT) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Third element of the net_kernel message is an unexpected type\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	ei_decode_tuple_header(buf->buff, &buf->index, &arity);
@@ -1360,27 +1369,31 @@ static switch_status_t handle_net_kernel_request(ei_node_t *ei_node, erlang_msg
 	/* {_, _}=Request */
 	if (arity != 2) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Third element of the net_kernel message has an unexpected arity\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	/* {is_auth, _}=Request */
 	if (ei_decode_atom_safe(buf->buff, &buf->index, atom) || strncmp(atom, "is_auth", MAXATOMLEN)) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "The net_kernel message third element does not begin with the atom 'is_auth'\n");
-		return SWITCH_STATUS_GENERR;
+		goto error;
 	}
 
 	/* To ! {Tag, Reply} */
-	ei_x_new_with_version(&send_msg->buf);
 	ei_x_encode_tuple_header(&send_msg->buf, 2);
 	ei_x_encode_ref(&send_msg->buf, &ref);
 	ei_x_encode_atom(&send_msg->buf, "yes");
 
 	if (switch_queue_trypush(ei_node->send_msgs, send_msg) != SWITCH_STATUS_SUCCESS) {
-		ei_x_free(&send_msg->buf);
-		switch_safe_free(send_msg);
+		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "unable to queue net kernel message\n");
+		goto error;
 	}
 
 	return SWITCH_STATUS_SUCCESS;
+
+error:
+	ei_x_free(&send_msg->buf);
+	switch_safe_free(send_msg);
+	return SWITCH_STATUS_GENERR;
 }
 
 static switch_status_t handle_erl_send(ei_node_t *ei_node, erlang_msg *msg, ei_x_buff *buf) {
@@ -1435,9 +1448,9 @@ static void *SWITCH_THREAD_FUNC receive_handler(switch_thread_t *thread, void *o
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Starting erlang receive handler %p: %s (%s:%d)\n", (void *)ei_node, ei_node->peer_nodename, ei_node->remote_ip, ei_node->remote_port);
 
 	while (switch_test_flag(ei_node, LFLAG_RUNNING) && switch_test_flag(&kazoo_globals, LFLAG_RUNNING)) {
-		void *pop;
+		void *pop = NULL;
 
-		if (switch_queue_pop_timeout(ei_node->received_msgs, &pop, 100000) == SWITCH_STATUS_SUCCESS) {
+		if (ei_queue_pop(ei_node->received_msgs, &pop, ei_node->receiver_queue_timeout) == SWITCH_STATUS_SUCCESS) {
 			ei_received_msg_t *received_msg = (ei_received_msg_t *) pop;
 			handle_erl_msg(ei_node, &received_msg->msg, &received_msg->buf);
 			ei_x_free(&received_msg->buf);
@@ -1469,13 +1482,13 @@ static void *SWITCH_THREAD_FUNC handle_node(switch_thread_t *thread, void *obj)
 	while (switch_test_flag(ei_node, LFLAG_RUNNING) && switch_test_flag(&kazoo_globals, LFLAG_RUNNING)) {
 		int status;
 		int send_msg_count = 0;
-		void *pop;
+		void *pop = NULL;
 
 		if (!received_msg) {
 			switch_malloc(received_msg, sizeof(*received_msg));
 			/* create a new buf for the erlang message and a rbuf for the reply */
 			if(kazoo_globals.receive_msg_preallocate > 0) {
-				received_msg->buf.buff = malloc(kazoo_globals.receive_msg_preallocate);
+				switch_malloc(received_msg->buf.buff, kazoo_globals.receive_msg_preallocate);
 				received_msg->buf.buffsz = kazoo_globals.receive_msg_preallocate;
 				received_msg->buf.index = 0;
 				if(received_msg->buf.buff == NULL) {
@@ -1485,10 +1498,12 @@ static void *SWITCH_THREAD_FUNC handle_node(switch_thread_t *thread, void *obj)
 			} else {
 				ei_x_new(&received_msg->buf);
 			}
+		} else {
+			received_msg->buf.index = 0;
 		}
 
 		while (++send_msg_count <= kazoo_globals.send_msg_batch
-			   && switch_queue_pop_timeout(ei_node->send_msgs, &pop, 20000) == SWITCH_STATUS_SUCCESS) {
+			   && ei_queue_pop(ei_node->send_msgs, &pop, ei_node->sender_queue_timeout) == SWITCH_STATUS_SUCCESS) {
 			ei_send_msg_t *send_msg = (ei_send_msg_t *) pop;
 			ei_helper_send(ei_node, &send_msg->pid, &send_msg->buf);
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Sent erlang message to %s <%d.%d.%d>\n"
@@ -1501,24 +1516,26 @@ static void *SWITCH_THREAD_FUNC handle_node(switch_thread_t *thread, void *obj)
 		}
 
 		/* wait for a erlang message, or timeout to check if the module is still running */
-		status = ei_xreceive_msg_tmo(ei_node->nodefd, &received_msg->msg, &received_msg->buf, kazoo_globals.receive_timeout);
+		status = ei_xreceive_msg_tmo(ei_node->nodefd, &received_msg->msg, &received_msg->buf, kazoo_globals.ei_receive_timeout);
 
 		switch (status) {
 		case ERL_TICK:
 			/* erlang nodes send ticks to eachother to validate they are still reachable, we dont have to do anything here */
+			fault_count = 0;
 			break;
 		case ERL_MSG:
 			fault_count = 0;
 
+			if (kazoo_globals.receive_msg_preallocate > 0 && received_msg->buf.buffsz > kazoo_globals.receive_msg_preallocate) {
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "increased received message buffer size to %d\n", received_msg->buf.buffsz);
+			}
+
 			if (switch_queue_trypush(ei_node->received_msgs, received_msg) != SWITCH_STATUS_SUCCESS) {
+				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "failed to push erlang received message from %s <%d.%d.%d> into queue\n", received_msg->msg.from.node, received_msg->msg.from.creation, received_msg->msg.from.num, received_msg->msg.from.serial);
 				ei_x_free(&received_msg->buf);
 				switch_safe_free(received_msg);
 			}
 
-			if (kazoo_globals.receive_msg_preallocate > 0 && received_msg->buf.buffsz > kazoo_globals.receive_msg_preallocate) {
-				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "increased received message buffer size to %d\n", received_msg->buf.buffsz);
-			}
-
 			received_msg = NULL;
 			break;
 		case ERL_ERROR:
@@ -1527,6 +1544,7 @@ static void *SWITCH_THREAD_FUNC handle_node(switch_thread_t *thread, void *obj)
 			case EAGAIN:
 				/* if ei_xreceive_msg_tmo just timed out, ignore it and let the while loop check if we are still running */
 				/* the erlang lib just wants us to try to receive again, so we will! */
+				fault_count = 0;
 				break;
 			case EMSGSIZE:
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Erlang communication fault with node %p %s (%s:%d): my spoon is too big\n", (void *)ei_node, ei_node->peer_nodename, ei_node->remote_ip, ei_node->remote_port);
@@ -1537,6 +1555,8 @@ static void *SWITCH_THREAD_FUNC handle_node(switch_thread_t *thread, void *obj)
 
 				if (fault_count >= kazoo_globals.io_fault_tolerance) {
 					switch_clear_flag(ei_node, LFLAG_RUNNING);
+				} else {
+					switch_sleep(kazoo_globals.io_fault_tolerance_sleep);
 				}
 
 				break;
@@ -1573,6 +1593,9 @@ static void *SWITCH_THREAD_FUNC handle_node(switch_thread_t *thread, void *obj)
 	destroy_node_handler(ei_node);
 
 	switch_atomic_dec(&kazoo_globals.threads);
+
+	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Shutdown Complete for erlang node handler %p: %s (%s:%d)\n", (void *)ei_node, ei_node->peer_nodename, ei_node->remote_ip, ei_node->remote_port);
+
 	return NULL;
 }
 
@@ -1608,6 +1631,10 @@ switch_status_t new_kazoo_node(int nodefd, ErlConnect *conn) {
 	ei_node->created_time = switch_micro_time_now();
 	ei_node->legacy = kazoo_globals.legacy_events;
 	ei_node->event_stream_framing = kazoo_globals.event_stream_framing;
+	ei_node->event_stream_keepalive = kazoo_globals.event_stream_keepalive;
+	ei_node->event_stream_queue_timeout = kazoo_globals.event_stream_queue_timeout;
+	ei_node->receiver_queue_timeout = kazoo_globals.node_receiver_queue_timeout;
+	ei_node->sender_queue_timeout = kazoo_globals.node_sender_queue_timeout;
 
 	/* store the IP and node name we are talking with */
 	switch_os_sock_put(&ei_node->socket, (switch_os_socket_t *)&nodefd, pool);
@@ -1630,7 +1657,7 @@ switch_status_t new_kazoo_node(int nodefd, ErlConnect *conn) {
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "New erlang connection from node %s (%s:%d) -> (%s:%d)\n", ei_node->peer_nodename, ei_node->remote_ip, ei_node->remote_port, ei_node->local_ip, ei_node->local_port);
 
-	for(i = 0; i < kazoo_globals.num_worker_threads; i++) {
+	for(i = 0; i < kazoo_globals.node_worker_threads; i++) {
 		switch_threadattr_create(&thd_attr, ei_node->pool);
 		switch_threadattr_detach_set(thd_attr, 1);
 		switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE);
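Several kazoo_node.c hunks above fix an ei_x_buff that was initialized twice (ei_x_new() immediately followed by ei_x_new_with_version(), leaking the first allocation) and add ei_x_free() on every early exit. A minimal sketch of the intended lifecycle using erl_interface's ei API; the encoded payload here is illustrative.

#include <ei.h>

/* Sketch only: one init per buffer, exactly one ei_x_free() per path. */
static int build_reply(ei_x_buff *buf)
{
    if (ei_x_new_with_version(buf) < 0) { /* one init, never two */
        return -1;
    }

    if (ei_x_encode_tuple_header(buf, 2) < 0 ||
        ei_x_encode_atom(buf, "ok") < 0 ||
        ei_x_encode_long(buf, 42) < 0) {
        ei_x_free(buf);                   /* error path still frees */
        return -1;
    }

    return 0;                             /* success: caller now owns buf */
}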
diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_tweaks.c b/src/mod/event_handlers/mod_kazoo/kazoo_tweaks.c
index 2afcdf1a6a2..bb7c792b889 100644
--- a/src/mod/event_handlers/mod_kazoo/kazoo_tweaks.c
+++ b/src/mod/event_handlers/mod_kazoo/kazoo_tweaks.c
@@ -64,7 +64,7 @@ static switch_status_t kz_tweaks_signal_bridge_on_hangup(switch_core_session_t *
 
 	const char *peer_uuid = switch_channel_get_variable(channel, "Bridge-B-Unique-ID");
 
-	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "tweak signal bridge on hangup: %s , %s\n", switch_core_session_get_uuid(session), peer_uuid);
+	switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "tweak signal bridge on hangup: %s , %s\n", switch_core_session_get_uuid(session), peer_uuid);
 
 	if (switch_event_create(&my_event, SWITCH_EVENT_CHANNEL_UNBRIDGE) == SWITCH_STATUS_SUCCESS) {
 		switch_event_add_header_string(my_event, SWITCH_STACK_BOTTOM, "Bridge-A-Unique-ID", switch_core_session_get_uuid(session));
@@ -96,88 +96,38 @@ static void kz_tweaks_handle_bridge_variables(switch_event_t *event)
 	switch_core_session_t *a_session = NULL, *b_session = NULL;
 	const char *a_leg = switch_event_get_header(event, "Bridge-A-Unique-ID");
 	const char *b_leg = switch_event_get_header(event, "Bridge-B-Unique-ID");
+	const char *reentry = switch_event_get_header(event, "Bridge-Event-Processed");
 	int i;
 
 	if (!kz_test_tweak(KZ_TWEAK_BRIDGE_VARIABLES)) return;
 
-	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "tweak bridge event handler: variables : %s , %s\n", a_leg, b_leg);
+	if(reentry) return;
 
-	if (a_leg && (a_session = switch_core_session_force_locate(a_leg)) != NULL) {
+	switch_log_printf(SWITCH_CHANNEL_UUID_LOG(a_leg), SWITCH_LOG_DEBUG, "tweak bridge event handler: variables : %s , %s\n", a_leg, b_leg);
+
+	if (a_leg && (a_session = switch_core_session_locate(a_leg)) != NULL) {
 		switch_channel_t *a_channel = switch_core_session_get_channel(a_session);
-		if(switch_channel_get_variable_dup(a_channel, bridge_variables[0], SWITCH_FALSE, -1) == NULL) {
-			if(b_leg && (b_session = switch_core_session_force_locate(b_leg)) != NULL) {
-				switch_channel_t *b_channel = switch_core_session_get_channel(b_session);
+		switch_channel_timetable_t *a_times = switch_channel_get_timetable(a_channel);
+		if(b_leg && (b_session = switch_core_session_locate(b_leg)) != NULL) {
+			switch_channel_t *b_channel = switch_core_session_get_channel(b_session);
+			switch_channel_timetable_t *b_times = switch_channel_get_timetable(b_channel);
+			if (a_times->created <= b_times->created) {
+				for(i = 0; bridge_variables[i] != NULL; i++) {
+					const char *val = switch_channel_get_variable_dup(a_channel, bridge_variables[i], SWITCH_FALSE, -1);
+					switch_channel_set_variable(b_channel, bridge_variables[i], val);
+				}
+			} else {
 				for(i = 0; bridge_variables[i] != NULL; i++) {
 					const char *val = switch_channel_get_variable_dup(b_channel, bridge_variables[i], SWITCH_FALSE, -1);
 					switch_channel_set_variable(a_channel, bridge_variables[i], val);
 				}
-				switch_core_session_rwunlock(b_session);
-			}
-		} else {
-			if(b_leg && (b_session = switch_core_session_force_locate(b_leg)) != NULL) {
-				switch_channel_t *b_channel = switch_core_session_get_channel(b_session);
-				if(switch_channel_get_variable_dup(b_channel, bridge_variables[0], SWITCH_FALSE, -1) == NULL) {
-					for(i = 0; bridge_variables[i] != NULL; i++) {
-						const char *val = switch_channel_get_variable_dup(a_channel, bridge_variables[i], SWITCH_FALSE, -1);
-						switch_channel_set_variable(b_channel, bridge_variables[i], val);
-					}
-				}
-				switch_core_session_rwunlock(b_session);
 			}
+			switch_core_session_rwunlock(b_session);
 		}
 		switch_core_session_rwunlock(a_session);
 	}
 	else {
-		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "NOT FOUND : %s\n", a_leg);
-	}
-
-}
-
-static void kz_tweaks_handle_bridge_replaces_aleg(switch_event_t *event)
-{
-	switch_event_t *my_event;
-
-	const char *replaced_call_id = switch_event_get_header(event, "variable_sip_replaces_call_id");
-	const char *a_leg_call_id = switch_event_get_header(event, "variable_sip_replaces_a-leg");
-	const char *peer_uuid = switch_event_get_header(event, "Unique-ID");
-	int processed = 0;
-
-	if (!kz_test_tweak(KZ_TWEAK_BRIDGE_REPLACES_ALEG)) return;
-
-
-	if(a_leg_call_id && replaced_call_id) {
-		const char *call_id = switch_event_get_header(event, "Bridge-B-Unique-ID");
-		switch_core_session_t *session = NULL;
-		if ((session = switch_core_session_locate(peer_uuid)) != NULL) {
-			switch_channel_t *channel = switch_core_session_get_channel(session);
-			processed = switch_true(switch_channel_get_variable_dup(channel, "Bridge-Event-Processed", SWITCH_FALSE, -1));
-			switch_channel_set_variable(channel, "Bridge-Event-Processed", "true");
-			switch_core_session_rwunlock(session);
-		}
-
-		if(processed) {
-			if(call_id) {
-				if((session = switch_core_session_locate(call_id)) != NULL) {
-					switch_channel_t *channel = switch_core_session_get_channel(session);
-					switch_channel_set_variable(channel, "Bridge-Event-Processed", "true");
-					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "creating channel_bridge event A - %s , B - %s\n", switch_core_session_get_uuid(session), peer_uuid);
-					if (switch_event_create(&my_event, SWITCH_EVENT_CHANNEL_BRIDGE) == SWITCH_STATUS_SUCCESS) {
-						switch_event_add_header_string(my_event, SWITCH_STACK_BOTTOM, "Bridge-A-Unique-ID", switch_core_session_get_uuid(session));
-						switch_event_add_header_string(my_event, SWITCH_STACK_BOTTOM, "Bridge-B-Unique-ID", peer_uuid);
-						switch_channel_event_set_data(channel, my_event);
-						switch_event_fire(&my_event);
-					}
-					switch_channel_set_variable(channel, "Bridge-B-Unique-ID", peer_uuid);
-					switch_channel_add_state_handler(channel, &kz_tweaks_signal_bridge_state_handlers);
-					switch_core_session_rwunlock(session);
-				} else {
-					switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "invalid session : %s\n", call_id);
-					DUMP_EVENT(event);
-				}
-			}
-		}
-
+		switch_log_printf(SWITCH_CHANNEL_UUID_LOG(a_leg), SWITCH_LOG_DEBUG1, "NOT FOUND : %s\n", a_leg);
 	}
-
 }
 
 static void kz_tweaks_handle_bridge_replaces_call_id(switch_event_t *event)
@@ -185,19 +135,22 @@ static void kz_tweaks_handle_bridge_replaces_call_id(switch_event_t *event)
 	switch_event_t *my_event;
 
 	const char *replaced_call_id = switch_event_get_header(event, "variable_sip_replaces_call_id");
-	const char *a_leg_call_id = switch_event_get_header(event, "variable_sip_replaces_a-leg");
 	const char *peer_uuid = switch_event_get_header(event, "Unique-ID");
+	const char *reentry = switch_event_get_header(event, "Bridge-Event-Processed");
 
 	if (!kz_test_tweak(KZ_TWEAK_BRIDGE_REPLACES_CALL_ID)) return;
 
-	if(a_leg_call_id && replaced_call_id) {
+	if(reentry) return;
+
+	if(replaced_call_id) {
 		switch_core_session_t *call_session = NULL;
 		const char *call_id = switch_event_get_header(event, "Bridge-B-Unique-ID");
-		if (call_id && (call_session = switch_core_session_force_locate(call_id)) != NULL) {
+		if (call_id && (call_session = switch_core_session_locate(call_id)) != NULL) {
 			switch_channel_t *call_channel = switch_core_session_get_channel(call_session);
 			if (switch_event_create(&my_event, SWITCH_EVENT_CHANNEL_BRIDGE) == SWITCH_STATUS_SUCCESS) {
 				switch_event_add_header_string(my_event, SWITCH_STACK_BOTTOM, "Bridge-A-Unique-ID", switch_core_session_get_uuid(call_session));
 				switch_event_add_header_string(my_event, SWITCH_STACK_BOTTOM, "Bridge-B-Unique-ID", peer_uuid);
+				switch_event_add_header_string(my_event, SWITCH_STACK_BOTTOM, "Bridge-Event-Processed", "true");
 				switch_channel_event_set_data(call_channel, my_event);
 				switch_event_fire(&my_event);
 			}
@@ -208,7 +161,6 @@ static void kz_tweaks_handle_bridge_replaces_call_id(switch_event_t *event)
 			switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG1, "NOT FOUND : %s\n", call_id);
 		}
 	}
-
 }
 
 static void kz_tweaks_channel_bridge_event_handler(switch_event_t *event)
@@ -216,7 +168,6 @@ static void kz_tweaks_channel_bridge_event_handler(switch_event_t *event)
 	if (!kz_test_tweak(KZ_TWEAK_BRIDGE)) return;
 
 	kz_tweaks_handle_bridge_replaces_call_id(event);
-	kz_tweaks_handle_bridge_replaces_aleg(event);
 	kz_tweaks_handle_bridge_variables(event);
 }
 
@@ -259,13 +210,13 @@ static void kz_tweaks_channel_transferor_event_handler(switch_event_t *event)
 	if (!kz_test_tweak(KZ_TWEAK_TRANSFERS)) return;
 
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "TRANSFEROR : %s , %s , %s, %s, %s , %s , %s \n", uuid, orig_call_id, dest_peer_uuid, dest_call_id, file, func, line);
-	if ((uuid_session = switch_core_session_force_locate(uuid)) != NULL) {
+	if ((uuid_session = switch_core_session_locate(uuid)) != NULL) {
 		switch_channel_t *uuid_channel = switch_core_session_get_channel(uuid_session);
 		const char* interaction_id = switch_channel_get_variable_dup(uuid_channel, INTERACTION_VARIABLE, SWITCH_TRUE, -1);
 		// set to uuid & peer_uuid
 		if(interaction_id != NULL) {
 			switch_core_session_t *session = NULL;
-			if(dest_call_id && (session = switch_core_session_force_locate(dest_call_id)) != NULL) {
+			if(dest_call_id && (session = switch_core_session_locate(dest_call_id)) != NULL) {
 				switch_channel_t *channel = switch_core_session_get_channel(session);
 				const char* prv_interaction_id = switch_channel_get_variable_dup(channel, INTERACTION_VARIABLE, SWITCH_TRUE, -1);
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "LOCATING UUID PRV : %s : %s\n", prv_interaction_id, interaction_id);
@@ -279,7 +230,7 @@ static void kz_tweaks_channel_transferor_event_handler(switch_event_t *event)
 			} else {
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "TRANSFEROR NO UUID SESSION: %s , %s , %s \n", uuid, dest_call_id, dest_peer_uuid);
 			}
-			if(dest_peer_uuid && (session = switch_core_session_force_locate(dest_peer_uuid)) != NULL) {
+			if(dest_peer_uuid && (session = switch_core_session_locate(dest_peer_uuid)) != NULL) {
 				switch_channel_t *channel = switch_core_session_get_channel(session);
 				const char* prv_interaction_id = switch_channel_get_variable_dup(channel, INTERACTION_VARIABLE, SWITCH_TRUE, -1);
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "LOCATING PEER UUID PRV : %s : %s\n", prv_interaction_id, interaction_id);
@@ -293,7 +244,7 @@ static void kz_tweaks_channel_transferor_event_handler(switch_event_t *event)
 			} else {
 				switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "TRANSFEROR NO PEER SESSION: %s , %s , %s \n", uuid, dest_call_id, dest_peer_uuid);
 			}
-			if(orig_call_id && (session = switch_core_session_force_locate(orig_call_id)) != NULL) {
+			if(orig_call_id && (session = switch_core_session_locate(orig_call_id)) != NULL) {
 				switch_channel_t *channel = switch_core_session_get_channel(session);
 				const char* prv_interaction_id = switch_channel_get_variable_dup(channel, INTERACTION_VARIABLE, SWITCH_TRUE, -1);
SWITCH_LOG_DEBUG, "LOCATING PEER UUID PRV : %s : %s\n", prv_interaction_id, interaction_id); @@ -482,7 +433,7 @@ static switch_status_t kz_tweaks_handle_replaces_call_id(switch_core_session_t * const char *replaced_call_id = switch_channel_get_variable(channel, "sip_replaces_call_id"); const char *core_uuid = switch_channel_get_variable(channel, "sip_h_X-FS-From-Core-UUID"); if((!core_uuid) && replaced_call_id) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "checking replaces header tweak for %s\n", replaced_call_id); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "checking replaces header tweak for %s\n", replaced_call_id); if ((replace_call_session = switch_core_session_locate(replaced_call_id))) { switch_channel_t *replaced_call_channel = switch_core_session_get_channel(replace_call_session); int i; diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_utils.c b/src/mod/event_handlers/mod_kazoo/kazoo_utils.c index 34b33c8ae1e..1986f817175 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_utils.c +++ b/src/mod/event_handlers/mod_kazoo/kazoo_utils.c @@ -386,7 +386,6 @@ SWITCH_DECLARE(char *) kz_event_expand_headers_check(switch_event_t *event, cons switch_safe_free(expanded_sub_val); sub_val = NULL; vname = NULL; - vtype = 0; br = 0; } @@ -438,21 +437,18 @@ SWITCH_DECLARE(char *) kz_event_expand_headers_pool(switch_memory_pool_t *pool, return dup; } -SWITCH_DECLARE(char *) kz_event_expand(const char *in) -{ - switch_event_t *event = NULL; - char *ret = NULL; - kz_switch_core_base_headers_for_expand(&event); - ret = kz_event_expand_headers_check(event, in, NULL, NULL, 0); - switch_event_destroy(&event); - return ret; -} - -SWITCH_DECLARE(char *) kz_expand(const char *in) +SWITCH_DECLARE(char *) kz_expand(const char *in, const char *uuid) { switch_event_t *event = NULL; char *ret = NULL; kz_switch_core_base_headers_for_expand(&event); + if (uuid != NULL) { + switch_core_session_t *nsession = NULL; + if ((nsession = switch_core_session_locate(uuid))) { + switch_channel_event_set_data(switch_core_session_get_channel(nsession), event); + switch_core_session_rwunlock(nsession); + } + } ret = kz_event_expand_headers_check(event, in, NULL, NULL, 0); switch_event_destroy(&event); return ret; @@ -463,7 +459,7 @@ SWITCH_DECLARE(char *) kz_expand_pool(switch_memory_pool_t *pool, const char *in char *expanded; char *dup = NULL; - if(!(expanded = kz_expand(in))) { + if(!(expanded = kz_expand(in, NULL))) { return NULL; } dup = switch_core_strdup(pool, expanded); @@ -584,12 +580,14 @@ void kz_event_decode(switch_event_t *event) switch_event_header_t *hp; int i; for (hp = event->headers; hp; hp = hp->next) { - if (hp->idx) { - for(i = 0; i < hp->idx; i++) { - switch_url_decode(hp->array[i]); + if (strncmp(hp->name, "_json_", 6)) { + if (hp->idx) { + for(i = 0; i < hp->idx; i++) { + switch_url_decode(hp->array[i]); + } + } else { + switch_url_decode(hp->value); } - } else { - switch_url_decode(hp->value); } } } diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_utils.h b/src/mod/event_handlers/mod_kazoo/kazoo_utils.h index c5ddc3e037f..cb5549c7600 100644 --- a/src/mod/event_handlers/mod_kazoo/kazoo_utils.h +++ b/src/mod/event_handlers/mod_kazoo/kazoo_utils.h @@ -26,9 +26,7 @@ SWITCH_DECLARE(char *) kz_event_expand_headers(switch_event_t *event, const char SWITCH_DECLARE(char *) kz_event_expand_headers_pool(switch_memory_pool_t *pool, switch_event_t *event, char *val); -SWITCH_DECLARE(char *) kz_event_expand(const char *in); - -SWITCH_DECLARE(char *) kz_expand(const 
diff --git a/src/mod/event_handlers/mod_kazoo/kazoo_utils.h b/src/mod/event_handlers/mod_kazoo/kazoo_utils.h
index c5ddc3e037f..cb5549c7600 100644
--- a/src/mod/event_handlers/mod_kazoo/kazoo_utils.h
+++ b/src/mod/event_handlers/mod_kazoo/kazoo_utils.h
@@ -26,9 +26,7 @@ SWITCH_DECLARE(char *) kz_event_expand_headers(switch_event_t *event, const char
 
 SWITCH_DECLARE(char *) kz_event_expand_headers_pool(switch_memory_pool_t *pool, switch_event_t *event, char *val);
 
-SWITCH_DECLARE(char *) kz_event_expand(const char *in);
-
-SWITCH_DECLARE(char *) kz_expand(const char *in);
+SWITCH_DECLARE(char *) kz_expand(const char *in, const char *uuid);
 
 SWITCH_DECLARE(char *) kz_expand_pool(switch_memory_pool_t *pool, const char *in);
 
diff --git a/src/mod/event_handlers/mod_kazoo/mod_kazoo.c b/src/mod/event_handlers/mod_kazoo/mod_kazoo.c
index 1be309c28d8..15b8fb469ce 100644
--- a/src/mod/event_handlers/mod_kazoo/mod_kazoo.c
+++ b/src/mod/event_handlers/mod_kazoo/mod_kazoo.c
@@ -38,7 +38,9 @@ kz_globals_t kazoo_globals = {0};
 
 SWITCH_MODULE_DEFINITION(mod_kazoo, mod_kazoo_load, mod_kazoo_shutdown, mod_kazoo_runtime);
 
-SWITCH_MODULE_LOAD_FUNCTION(mod_kazoo_load) {
+SWITCH_MODULE_LOAD_FUNCTION(mod_kazoo_load)
+{
+	kz_erl_init();
 	memset(&kazoo_globals, 0, sizeof(kazoo_globals));
 	kazoo_globals.pool = pool;
 
@@ -77,6 +79,9 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_kazoo_load) {
 	/* add tweaks */
 	kz_tweaks_start();
 
+	/* add our cdr */
+	kz_cdr_start();
+
 	/* indicate that the module should continue to be loaded */
 	return SWITCH_STATUS_SUCCESS;
 }
@@ -84,9 +89,10 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_kazoo_load) {
 SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_kazoo_shutdown) {
 	int sanity = 0;
 
-	remove_cli_api();
+	kz_cdr_stop();
+
 	kz_tweaks_stop();
 
 	/* stop taking new requests and start shuting down the threads */
@@ -128,6 +134,8 @@ SWITCH_MODULE_SHUTDOWN_FUNCTION(mod_kazoo_shutdown) {
 	switch_safe_free(kazoo_globals.ei_cookie);
 	switch_safe_free(kazoo_globals.ei_nodename);
 
+	kz_erl_shutdown();
+
 	return SWITCH_STATUS_SUCCESS;
 }
 
diff --git a/src/mod/event_handlers/mod_kazoo/mod_kazoo.h b/src/mod/event_handlers/mod_kazoo/mod_kazoo.h
index c9f16f48267..37fdd861e78 100644
--- a/src/mod/event_handlers/mod_kazoo/mod_kazoo.h
+++ b/src/mod/event_handlers/mod_kazoo/mod_kazoo.h
@@ -58,6 +58,9 @@ switch_status_t kz_json_api(const char * command, cJSON *args, cJSON **res);
 /* kazoo_endpoints.c */
 void add_kz_endpoints(switch_loadable_module_interface_t **module_interface);
 
+/* kazoo_cdr.c */
+void kz_cdr_start();
+void kz_cdr_stop();
 
 /* kazoo_tweaks.c */
 void kz_tweaks_start();
diff --git a/src/mod/event_handlers/mod_odbc_cdr/mod_odbc_cdr.c b/src/mod/event_handlers/mod_odbc_cdr/mod_odbc_cdr.c
index 1ca10a72d75..a2b00e193a4 100644
--- a/src/mod/event_handlers/mod_odbc_cdr/mod_odbc_cdr.c
+++ b/src/mod/event_handlers/mod_odbc_cdr/mod_odbc_cdr.c
@@ -224,7 +224,6 @@ static void write_cdr(const char *path, const char *log_line)
 		wrote += write(fd, "\n", 1);
 		wrote++;
 		close(fd);
-		fd = -1;
 	}
 }
 
@@ -283,7 +282,7 @@ static switch_status_t odbc_cdr_reporting(switch_core_session_t *session)
 
 		if (!table) {
 			switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "Table [%s] not found, ignoring leg\n", table_name);
-			skip_leg = SWITCH_TRUE;
+			continue;
 		}
 
 		if (table->log_leg == ODBC_CDR_LOG_A && is_b) {
diff --git a/src/mod/event_handlers/mod_rayo/Makefile.am b/src/mod/event_handlers/mod_rayo/Makefile.am
index 872329fde3d..f2b71780945 100644
--- a/src/mod/event_handlers/mod_rayo/Makefile.am
+++ b/src/mod/event_handlers/mod_rayo/Makefile.am
@@ -5,11 +5,15 @@ IKS_DIR=$(switch_srcdir)/libs/iksemel
 IKS_BUILDDIR=$(switch_builddir)/libs/iksemel
 IKS_LA=$(IKS_BUILDDIR)/src/libiksemel.la
 
+noinst_LTLIBRARIES = librayomod.la
+librayomod_la_SOURCES = mod_rayo.c iks_helpers.c nlsml.c rayo_components.c rayo_cpa_component.c rayo_cpa_detector.c rayo_elements.c rayo_fax_components.c
+librayomod_la_SOURCES += rayo_input_component.c rayo_output_component.c rayo_prompt_component.c rayo_record_component.c sasl.c srgs.c xmpp_streams.c rayo_exec_component.c
+librayomod_la_CFLAGS = $(AM_CFLAGS) -I$(switch_builddir)/libs/iksemel/include $(PCRE_CFLAGS)
+
 mod_LTLIBRARIES = mod_rayo.la
-mod_rayo_la_SOURCES = mod_rayo.c iks_helpers.c nlsml.c rayo_components.c rayo_cpa_component.c rayo_cpa_detector.c rayo_elements.c rayo_fax_components.c
-mod_rayo_la_SOURCES += rayo_input_component.c rayo_output_component.c rayo_prompt_component.c rayo_record_component.c sasl.c srgs.c xmpp_streams.c rayo_exec_component.c
+mod_rayo_la_SOURCES =
 mod_rayo_la_CFLAGS = $(AM_CFLAGS) -I$(IKS_DIR)/include $(PCRE_CFLAGS)
-mod_rayo_la_LIBADD = $(switch_builddir)/libfreeswitch.la $(IKS_LA) $(PCRE_LIBS)
+mod_rayo_la_LIBADD = $(switch_builddir)/libfreeswitch.la $(IKS_LA) $(PCRE_LIBS) librayomod.la
 mod_rayo_la_LDFLAGS = -avoid-version -module -no-undefined -shared
 
 BUILT_SOURCES=$(IKS_LA)
@@ -18,4 +22,23 @@ $(IKS_LA): $(IKS_BUILDDIR) $(IKS_DIR) $(IKS_DIR)/.update
 	@cd $(IKS_BUILDDIR) && $(MAKE)
 	@$(TOUCH_TARGET)
 
-SUBDIRS=. test
+noinst_PROGRAMS = test/test_iks test/test_nlsml test/test_srgs
+
+test_test_iks_SOURCES = test/test_iks.c
+test_test_iks_CFLAGS = $(AM_CFLAGS) -I. -I$(switch_builddir)/libs/iksemel/include $(PCRE_CFLAGS) -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\"
+test_test_iks_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS)
+test_test_iks_LDADD = librayomod.la $(IKS_LA) $(PCRE_LIBS)
+
+test_test_nlsml_SOURCES = test/test_nlsml.c
+test_test_nlsml_CFLAGS = $(AM_CFLAGS) -I. -I$(switch_builddir)/libs/iksemel/include $(PCRE_CFLAGS) -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\"
+test_test_nlsml_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS)
+test_test_nlsml_LDADD = librayomod.la $(IKS_LA) $(PCRE_LIBS)
+
+test_test_srgs_SOURCES = test/test_srgs.c
+test_test_srgs_CFLAGS = $(AM_CFLAGS) -I. -I$(switch_builddir)/libs/iksemel/include $(PCRE_CFLAGS) -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\"
+test_test_srgs_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS)
+test_test_srgs_LDADD = librayomod.la $(IKS_LA) $(PCRE_LIBS)
+
+
+TESTS = $(noinst_PROGRAMS)
+
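The Makefile.am change above moves the rayo sources into a noinst convenience library (librayomod.la) so the same objects can be linked into both the loadable module and the three test programs, which Automake now runs via TESTS. For reference, a single test case in those programs follows the shape below; the FST_TEST_BEGIN/FST_TEST_END, fst_requires, fst_check_string_equals, and switch_safe_free names are used verbatim in the test diffs further down, while the case name here is illustrative and the enclosing core/suite begin-end boilerplate from FreeSWITCH's test framework is omitted.

/* Fragment: one test case in FreeSWITCH's fst style. */
FST_TEST_BEGIN(example_case)
{
    char *s = strdup("hello");      /* needs <string.h> in a full file */

    fst_requires(s);                      /* hard precondition: aborts the test */
    fst_check_string_equals("hello", s);  /* soft assertion: records and continues */
    switch_safe_free(s);                  /* tests free what they allocate */
}
FST_TEST_END()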
diff --git a/src/mod/event_handlers/mod_rayo/mod_rayo.c b/src/mod/event_handlers/mod_rayo/mod_rayo.c
index 25ab87b9758..b2e3ef27346 100644
--- a/src/mod/event_handlers/mod_rayo/mod_rayo.c
+++ b/src/mod/event_handlers/mod_rayo/mod_rayo.c
@@ -4854,7 +4854,7 @@ static int alias_api(struct rayo_cmd_alias *alias, char *args, switch_stream_han
 	cmd = strdup(alias->cmd);
 	for (i = 1; i < argc; i++) {
 		char *cmd_new;
-		char to_replace[4] = { 0 };
+		char to_replace[12] = { 0 };
 
 		sprintf(to_replace, "$%i", i);
 		cmd_new = switch_string_replace(cmd, to_replace, argv[i]);
 		free(cmd);
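The alias_api() hunk above grows to_replace from 4 to 12 bytes: the placeholder is "$" followed by the decimal digits of a non-negative int, which can reach 10 digits (2147483647), so the worst case needs 1 + 10 + 1 = 12 bytes including the terminator. A small standalone check; snprintf() is used here to make the bound explicit, where the original uses sprintf().

#include <stdio.h>

int main(void)
{
    /* "$" (1) + up to 10 digits for a non-negative 32-bit int + NUL (1) = 12. */
    char to_replace[12] = { 0 };
    int i = 2147483647; /* worst case for a non-negative int */

    snprintf(to_replace, sizeof(to_replace), "$%i", i);
    printf("%s\n", to_replace); /* prints $2147483647 */
    return 0;
}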
fst_check_string_equals("37c69b1cf07a3f67c04a5ef5902fa5114f2c76fe4a2686482ba5b89323075643", iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "example.org", "D60000229F")); - fst_check(NULL == iks_server_dialback_key("", "xmpp.example.com", "example.org", "D60000229F")); - fst_check(NULL == iks_server_dialback_key("s3cr3tf0rd14lb4ck", "", "example.org", "D60000229F")); - fst_check(NULL == iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "", "D60000229F")); - fst_check(NULL == iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "example.org", "")); - fst_check(NULL == iks_server_dialback_key(NULL, "xmpp.example.com", "example.org", "D60000229F")); - fst_check(NULL == iks_server_dialback_key("s3cr3tf0rd14lb4ck", NULL, "example.org", "D60000229F")); - fst_check(NULL == iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", NULL, "D60000229F")); - fst_check(NULL == iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "example.org", NULL)); + char *dialback_key; + + dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "example.org", "D60000229F"); + fst_check_string_equals("37c69b1cf07a3f67c04a5ef5902fa5114f2c76fe4a2686482ba5b89323075643", dialback_key); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("", "xmpp.example.com", "example.org", "D60000229F"))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", "", "example.org", "D60000229F"))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "", "D60000229F"))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "example.org", ""))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key(NULL, "xmpp.example.com", "example.org", "D60000229F"))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", NULL, "example.org", "D60000229F"))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", NULL, "D60000229F"))); + switch_safe_free(dialback_key); + fst_check(NULL == (dialback_key = iks_server_dialback_key("s3cr3tf0rd14lb4ck", "xmpp.example.com", "example.org", NULL))); + switch_safe_free(dialback_key); } FST_TEST_END() diff --git a/src/mod/event_handlers/mod_rayo/test/test_nlsml.c b/src/mod/event_handlers/mod_rayo/test/test_nlsml.c index df5948f05a3..7319a04f4c2 100644 --- a/src/mod/event_handlers/mod_rayo/test/test_nlsml.c +++ b/src/mod/event_handlers/mod_rayo/test/test_nlsml.c @@ -301,6 +301,7 @@ FST_TEST_BEGIN(create_dtmf_match) result_str = iks_string(NULL, result); fst_check_string_equals(nlsml_dtmf_result, result_str); iks_free(result_str); + iks_delete(result); } FST_TEST_END() @@ -316,6 +317,7 @@ FST_TEST_BEGIN(create_dtmf_instance) result_str = iks_string(NULL, result); fst_check_string_equals(nlsml_dtmf_instance_result, result_str); iks_free(result_str); + iks_delete(result); } FST_TEST_END() @@ -325,8 +327,13 @@ FST_TEST_END() FST_TEST_BEGIN(normalize) { iks *result = nlsml_normalize(nlsml_good); + char *result_str; + fst_requires(result); - fst_check_string_equals(nlsml_good_normalized, iks_string(NULL, result)); + result_str = iks_string(NULL, result); + 
diff --git a/src/mod/event_handlers/mod_rayo/test/test_nlsml.c b/src/mod/event_handlers/mod_rayo/test/test_nlsml.c
index df5948f05a3..7319a04f4c2 100644
--- a/src/mod/event_handlers/mod_rayo/test/test_nlsml.c
+++ b/src/mod/event_handlers/mod_rayo/test/test_nlsml.c
@@ -301,6 +301,7 @@ FST_TEST_BEGIN(create_dtmf_match)
 	result_str = iks_string(NULL, result);
 	fst_check_string_equals(nlsml_dtmf_result, result_str);
 	iks_free(result_str);
+	iks_delete(result);
 }
 FST_TEST_END()
 
@@ -316,6 +317,7 @@ FST_TEST_BEGIN(create_dtmf_instance)
 	result_str = iks_string(NULL, result);
 	fst_check_string_equals(nlsml_dtmf_instance_result, result_str);
 	iks_free(result_str);
+	iks_delete(result);
 }
 FST_TEST_END()
 
@@ -325,8 +327,13 @@ FST_TEST_END()
 FST_TEST_BEGIN(normalize)
 {
 	iks *result = nlsml_normalize(nlsml_good);
+	char *result_str;
+
 	fst_requires(result);
-	fst_check_string_equals(nlsml_good_normalized, iks_string(NULL, result));
+	result_str = iks_string(NULL, result);
+	fst_check_string_equals(nlsml_good_normalized, result_str);
+	iks_free(result_str);
+	iks_delete(result);
 }
 FST_TEST_END()
 
diff --git a/src/mod/event_handlers/mod_rayo/test/test_srgs.c b/src/mod/event_handlers/mod_rayo/test/test_srgs.c
index 97284639bb3..9bd6fc1eb5d 100644
--- a/src/mod/event_handlers/mod_rayo/test/test_srgs.c
+++ b/src/mod/event_handlers/mod_rayo/test/test_srgs.c
@@ -1200,6 +1200,7 @@ FST_TEST_BEGIN(repeat_item_range_ambiguous_grammar)
 	fst_check(SMT_MATCH == srgs_grammar_match(grammar, "1", &interpretation));
 	fst_check(SMT_MATCH == srgs_grammar_match(grammar, "12", &interpretation));
 	fst_check(SMT_MATCH_END == srgs_grammar_match(grammar, "123", &interpretation));
+	srgs_parser_destroy(parser);
 }
 FST_TEST_END()
 
@@ -1217,6 +1218,7 @@ FST_TEST_BEGIN(repeat_item_range_optional_pound_grammar)
 	fst_check(SMT_MATCH == srgs_grammar_match(grammar, "12", &interpretation));
 	fst_check(SMT_MATCH_END == srgs_grammar_match(grammar, "12#", &interpretation));
 	fst_check(SMT_MATCH_END == srgs_grammar_match(grammar, "123", &interpretation));
+	srgs_parser_destroy(parser);
 }
 FST_TEST_END()
 
@@ -1253,6 +1255,7 @@ FST_TEST_BEGIN(w3c_example_grammar)
 
 	fst_requires((grammar = srgs_parse(parser, w3c_example_grammar)));
 	fst_check(srgs_grammar_to_jsgf(grammar));
+	srgs_parser_destroy(parser);
 }
 FST_TEST_END()
 
@@ -1266,6 +1269,7 @@ FST_TEST_BEGIN(metadata_grammar)
 
 	fst_requires((grammar = srgs_parse(parser, metadata_grammar)));
 	fst_check(srgs_grammar_to_jsgf(grammar));
+	srgs_parser_destroy(parser);
 }
 FST_TEST_END()
 
diff --git a/src/mod/formats/mod_imagick/mod_imagick.c b/src/mod/formats/mod_imagick/mod_imagick.c
index d008632b74f..64ee6762e9f 100644
--- a/src/mod/formats/mod_imagick/mod_imagick.c
+++ b/src/mod/formats/mod_imagick/mod_imagick.c
@@ -147,7 +147,6 @@ static switch_status_t imagick_file_open(switch_file_handle_t *handle, const cha
 {
 	pdf_file_context_t *context;
 	char *ext;
-	unsigned int flags = 0;
 	char range_path[1024];
 
 	if ((ext = strrchr((char *)path, '.')) == 0) {
@@ -173,10 +172,6 @@ static switch_status_t imagick_file_open(switch_file_handle_t *handle, const cha
 		return SWITCH_STATUS_GENERR;
 	}
 
-	if (switch_test_flag(handle, SWITCH_FILE_FLAG_READ)) {
-		flags |= SWITCH_FOPEN_READ;
-	}
-
 	if (ext && !strcmp(ext, "gif")) {
 		context->autoplay = 1;
 	}
diff --git a/src/mod/formats/mod_native_file/mod_native_file.c b/src/mod/formats/mod_native_file/mod_native_file.c
index 585109351e8..2a40396a4f3 100644
--- a/src/mod/formats/mod_native_file/mod_native_file.c
+++ b/src/mod/formats/mod_native_file/mod_native_file.c
@@ -97,6 +97,7 @@ static switch_status_t native_file_file_open(switch_file_handle_t *handle, const
 	handle->speed = 0;
 	handle->private_info = context;
 	handle->flags |= SWITCH_FILE_NATIVE;
+	handle->flags |= SWITCH_FILE_NOMUX;
 
 	switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Opening File [%s] %dhz\n", path, handle->samplerate);
 	return SWITCH_STATUS_SUCCESS;
$(switch_builddir)/libfreeswitch.la +mod_opusfile_la_SOURCES = +mod_opusfile_la_CFLAGS = $(AM_CFLAGS) $(freeswitch_LDFLAGS) $(OPUSFILE_DECODE_CFLAGS) +mod_opusfile_la_LIBADD = libopusfilemod.la $(switch_builddir)/libfreeswitch.la mod_opusfile_la_LDFLAGS = -avoid-version -module -no-undefined -shared if HAVE_OPUSFILE_DECODE mod_opusfile_la_CFLAGS += $(OPUSFILE_DECODE_CFLAGS) mod_opusfile_la_LIBADD += $(OPUSFILE_DECODE_LIBS) + +noinst_PROGRAMS = test/test_opusfile + +test_test_opusfile_SOURCES = test/test_opusfile.c +test_test_opusfile_CFLAGS = $(AM_CFLAGS) -I./ -I../ -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" $(OPUSFILE_DECODE_CFLAGS) +test_test_opusfile_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) $(OPUSFILE_DECODE_LIBS) +test_test_opusfile_LDADD = libopusfilemod.la $(switch_builddir)/libfreeswitch.la + +TESTS = $(noinst_PROGRAMS) + endif if HAVE_OPUSFILE_ENCODE diff --git a/src/mod/formats/mod_opusfile/mod_opusfile.c b/src/mod/formats/mod_opusfile/mod_opusfile.c index 418628b8e4b..25246a7ced2 100644 --- a/src/mod/formats/mod_opusfile/mod_opusfile.c +++ b/src/mod/formats/mod_opusfile/mod_opusfile.c @@ -148,7 +148,7 @@ static switch_status_t switch_opusfile_decode(opus_file_context *context, void * if (globals.debug) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "[OGG/OPUS Decoder]: EOF reached [%d]\n", ret); } - context->eof = TRUE; + context->eof = SWITCH_TRUE; break; } else /* (ret > 0)*/ { /*The number of samples read per channel on success*/ @@ -161,7 +161,6 @@ static switch_status_t switch_opusfile_decode(opus_file_context *context, void * } } switch_mutex_unlock(context->audio_mutex); - context->eof = FALSE; // for next page return SWITCH_STATUS_SUCCESS; } @@ -266,6 +265,8 @@ static switch_status_t switch_opusfile_open(switch_file_handle_t *handle, const if(context->pcm_offset!=0){ switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "[OGG/OPUS File] Non-zero starting PCM offset: [%li]\n", (long)context->pcm_offset); } + + context->eof = SWITCH_FALSE; context->pcm_print_offset = context->pcm_offset - DEFAULT_RATE; context->bitrate = 0; context->buffer_seconds = 1; @@ -282,7 +283,6 @@ static switch_status_t switch_opusfile_open(switch_file_handle_t *handle, const switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "[OGG/OPUS File] Channels: %i\n", head->channel_count); if (head->input_sample_rate) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "[OGG/OPUS File] Original sampling rate: %lu Hz\n", (unsigned long)head->input_sample_rate); - handle->samplerate = context->samplerate = head->input_sample_rate; } } if (op_seekable(context->of)) { @@ -365,20 +365,29 @@ static switch_status_t switch_opusfile_read(switch_file_handle_t *handle, void * if (!handle->handler) { if (switch_opusfile_decode(context, data, bytes, handle->real_channels) == SWITCH_STATUS_FALSE) { - context->eof = 1; + context->eof = SWITCH_TRUE; } } switch_mutex_lock(context->audio_mutex); rb = switch_buffer_read(context->audio_buffer, data, bytes); switch_mutex_unlock(context->audio_mutex); + if (globals.debug) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "[OGG/OPUS File] rb: [%"SWITCH_SIZE_T_FMT"] bytes: [%"SWITCH_SIZE_T_FMT"]\n", rb, bytes); + } + if (!rb && (context->eof)) { + if (globals.debug) { + // should be same as returned by op_pcm_total() + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "[OGG/OPUS 
File] EOF. sample count: [%"SWITCH_SIZE_T_FMT"]\n", handle->sample_count); + } + *len = 0; return SWITCH_STATUS_FALSE; } if (rb) { *len = rb / sizeof(int16_t) / handle->real_channels; if (globals.debug) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "[OGG/OPUS File] rb: [%d] *len: [%d]\n", (int)rb, (int)*len); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "[OGG/OPUS File] rb: [%"SWITCH_SIZE_T_FMT"] *len: [%"SWITCH_SIZE_T_FMT"]\n", rb, *len); } } else { newbytes = (2 * handle->samplerate * handle->real_channels) * context->buffer_seconds; diff --git a/src/mod/formats/mod_opusfile/test/freeswitch.xml b/src/mod/formats/mod_opusfile/test/freeswitch.xml new file mode 100644 index 00000000000..80591e86ac9 --- /dev/null +++ b/src/mod/formats/mod_opusfile/test/freeswitch.xml @@ -0,0 +1,21 @@ + + +
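The switch_opusfile_read() hunk above changes when EOF is reported: the decoder can flag end-of-stream while decoded samples still sit in context->audio_buffer, so the read path now drains the buffer first and only returns SWITCH_STATUS_FALSE (with *len = 0) once the buffer is empty and the eof flag is set. Below is a minimal standalone C sketch of that drain-before-EOF pattern; reader_t and decode_more() are illustrative stand-ins, not FreeSWITCH API.

/* drain_eof_sketch.c - report EOF only after buffered samples are consumed */
#include <stdio.h>
#include <string.h>

typedef struct {
	short buf[4096];
	size_t buffered; /* samples currently buffered */
	int eof;         /* decoder reached end of stream */
} reader_t;

/* stand-in for the decode step; 0 samples means the stream ended */
static size_t decode_more(reader_t *r)
{
	(void)r;
	return 0; /* pretend the decoder just hit EOF */
}

/* returns samples delivered; 0 means the stream is really finished */
static size_t reader_read(reader_t *r, short *out, size_t want)
{
	if (r->buffered < want && !r->eof) {
		size_t got = decode_more(r);
		if (!got) {
			r->eof = 1; /* remember EOF, but keep serving buffered data */
		}
		r->buffered += got;
	}
	if (!r->buffered && r->eof) {
		return 0; /* only now report EOF, like the !rb && context->eof test */
	}
	if (want > r->buffered) {
		want = r->buffered;
	}
	memcpy(out, r->buf, want * sizeof(short));
	memmove(r->buf, r->buf + want, (r->buffered - want) * sizeof(short));
	r->buffered -= want;
	return want;
}

int main(void)
{
	reader_t r = { { 0 }, 160, 0 };
	short out[320];
	printf("read 1: %u samples\n", (unsigned)reader_read(&r, out, 320)); /* 160: buffer drains */
	printf("read 2: %u samples\n", (unsigned)reader_read(&r, out, 320)); /* 0: EOF */
	return 0;
}

The test_opusfile.c file added below relies on this ordering: it pulls the whole file through switch_core_file_read() and compares the returned sample count against the duration advertised in the OGG/OPUS header.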
diff --git a/src/mod/formats/mod_opusfile/test/sounds/hi.opus b/src/mod/formats/mod_opusfile/test/sounds/hi.opus new file mode 100644 index 00000000000..ae13c23cca7 Binary files /dev/null and b/src/mod/formats/mod_opusfile/test/sounds/hi.opus differ diff --git a/src/mod/formats/mod_opusfile/test/test_opusfile.c b/src/mod/formats/mod_opusfile/test/test_opusfile.c new file mode 100644 index 00000000000..0cb5e8b00be --- /dev/null +++ b/src/mod/formats/mod_opusfile/test/test_opusfile.c @@ -0,0 +1,123 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2018, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * + * The Initial Developer of the Original Code is + * Anthony Minessale II + * Portions created by the Initial Developer are Copyright (C) + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Dragos Oancea + * + * + * test_opusfile.c -- tests mod_opusfile + * + */ +#include <switch.h> +#include <stdlib.h> + +#include <test/switch_test.h> + + +FST_CORE_BEGIN(".") +{ + FST_SUITE_BEGIN(test_opusfile) + { + FST_SETUP_BEGIN() + { + fst_requires_module("mod_loopback"); + fst_requires_module("mod_opusfile"); + } + FST_SETUP_END() + + FST_TEARDOWN_BEGIN() + { + } + FST_TEARDOWN_END() + + FST_TEST_BEGIN(opusfile_read) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + /* + $ mediainfo hi.opus + General + Complete name : hi.opus + Format : OGG + File size : 8.55 KiB + Duration : 2s 157ms + Overall bit rate : 32.5 Kbps + Writing application : opusenc from opus-tools 0.1.10 + + Audio + ID : 277454932 (0x1089A054) + Format : Opus + Duration : 2s 157ms + Channel(s) : 1 channel + Channel positions : Front: C + Sampling rate : 16.0 KHz + Compression mode : Lossy + Writing library : libopus 1.2~alpha2 + */ + static char filename[] = "sounds/hi.opus"; // duration in samples: 103200 + char path[4096]; + switch_file_handle_t fh = { 0 }; + int16_t *audiobuf; + switch_size_t len; + + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, filename); + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", 2, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + status = switch_core_file_open(&fh, path, 1, 48000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + len = 128 * 1024 * sizeof(*audiobuf); + switch_zmalloc(audiobuf, len); + + status = switch_core_file_read(&fh, audiobuf, &len); + fst_check(status == SWITCH_STATUS_SUCCESS); + + /* [INFO] mod_opusfile.c:292 [OGG/OPUS File] Duration (samples): 103200 */ + /* compare the read sample count with the one in the OGG/OPUS header. 
*/ + fst_check(len == 103200); + + status = switch_core_file_close(&fh); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(audiobuf); + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + switch_core_session_rwunlock(session); + + switch_sleep(1000000); + } + + FST_TEST_END() + } + FST_SUITE_END() +} +FST_CORE_END() diff --git a/src/mod/formats/mod_png/Makefile.am b/src/mod/formats/mod_png/Makefile.am new file mode 100644 index 00000000000..9585065aa22 --- /dev/null +++ b/src/mod/formats/mod_png/Makefile.am @@ -0,0 +1,8 @@ +include $(top_srcdir)/build/modmake.rulesam +MODNAME=mod_png + +mod_LTLIBRARIES = mod_png.la +mod_png_la_SOURCES = mod_png.c +mod_png_la_CFLAGS = $(AM_CFLAGS) +mod_png_la_LIBADD = $(switch_builddir)/libfreeswitch.la +mod_png_la_LDFLAGS = -avoid-version -module -no-undefined -shared diff --git a/src/mod/formats/mod_png/mod_png.c b/src/mod/formats/mod_png/mod_png.c index 67ea0904956..7508dc4fd70 100644 --- a/src/mod/formats/mod_png/mod_png.c +++ b/src/mod/formats/mod_png/mod_png.c @@ -60,7 +60,6 @@ static switch_status_t png_file_open(switch_file_handle_t *handle, const char *p { png_file_context_t *context; char *ext; - unsigned int flags = 0; if ((ext = strrchr((char *)path, '.')) == 0) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Invalid Format\n"); @@ -76,10 +75,6 @@ static switch_status_t png_file_open(switch_file_handle_t *handle, const char *p return SWITCH_STATUS_GENERR; } - if (switch_test_flag(handle, SWITCH_FILE_FLAG_READ)) { - flags |= SWITCH_FOPEN_READ; - } - memset(context, 0, sizeof(png_file_context_t)); context->max = 10000; switch_mutex_init(&context->mutex, SWITCH_MUTEX_NESTED, handle->memory_pool); diff --git a/src/mod/formats/mod_portaudio_stream/mod_portaudio_stream.c b/src/mod/formats/mod_portaudio_stream/mod_portaudio_stream.c index 1d2882ca747..e0878a0fe78 100644 --- a/src/mod/formats/mod_portaudio_stream/mod_portaudio_stream.c +++ b/src/mod/formats/mod_portaudio_stream/mod_portaudio_stream.c @@ -248,7 +248,7 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void { portaudio_stream_source_t *source = obj; portaudio_stream_context_t *cp; - int samples = 0; + int samples; int bused, bytesToWrite; @@ -282,7 +282,6 @@ static void *SWITCH_THREAD_FUNC read_stream_thread(switch_thread_t *thread, void source->ready = 0; } else { while (globals.running && !source->stopped) { - samples = 0; switch_mutex_lock(source->device_lock); samples = ReadAudioStream(source->audio_stream, source->databuf, source->read_codec.implementation->samples_per_packet, 0, &source->timer); diff --git a/src/mod/formats/mod_shout/mod_shout.c b/src/mod/formats/mod_shout/mod_shout.c index 99e99ebe3c7..6799b383914 100644 --- a/src/mod/formats/mod_shout/mod_shout.c +++ b/src/mod/formats/mod_shout/mod_shout.c @@ -736,7 +736,7 @@ static switch_status_t shout_file_open(switch_file_handle_t *handle, const char mpg123_getformat(context->mh, &rate, &channels, &encoding); if (!channels || !rate) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error opening %s (invalid rate or channel count)\n", path); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Error opening %s (invalid rate or channel count)\n", path); goto error; } diff --git a/src/mod/formats/mod_sndfile/Makefile.am b/src/mod/formats/mod_sndfile/Makefile.am index e94b9cb1007..27f77440e31 100644 --- 
a/src/mod/formats/mod_sndfile/Makefile.am +++ b/src/mod/formats/mod_sndfile/Makefile.am @@ -3,12 +3,30 @@ MODNAME=mod_sndfile if HAVE_SNDFILE +noinst_LTLIBRARIES = libsndfilemod.la +libsndfilemod_la_SOURCES = mod_sndfile.c +libsndfilemod_la_CFLAGS = $(AM_CFLAGS) $(SNDFILE_CFLAGS) + mod_LTLIBRARIES = mod_sndfile.la -mod_sndfile_la_SOURCES = mod_sndfile.c +mod_sndfile_la_SOURCES = mod_sndfile_la_CFLAGS = $(AM_CFLAGS) $(SNDFILE_CFLAGS) -mod_sndfile_la_LIBADD = $(switch_builddir)/libfreeswitch.la $(SNDFILE_LIBS) +mod_sndfile_la_LIBADD = libsndfilemod.la $(switch_builddir)/libfreeswitch.la $(SNDFILE_LIBS) mod_sndfile_la_LDFLAGS = -avoid-version -module -no-undefined -shared +noinst_PROGRAMS = test/test_sndfile test/test_sndfile_conf + +test_test_sndfile_SOURCES = test/test_sndfile.c +test_test_sndfile_CFLAGS = $(AM_CFLAGS) -I./ -I../ -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" +test_test_sndfile_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) +test_test_sndfile_LDADD = libsndfilemod.la + +test_test_sndfile_conf_SOURCES = test/test_sndfile_conf.c +test_test_sndfile_conf_CFLAGS = $(AM_CFLAGS) -I./ -I../ -DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" +test_test_sndfile_conf_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) +test_test_sndfile_conf_LDADD = libsndfilemod.la + +TESTS = $(noinst_PROGRAMS) + else install: error all: error diff --git a/src/mod/formats/mod_sndfile/mod_sndfile.c b/src/mod/formats/mod_sndfile/mod_sndfile.c index a03e28af5c3..5d2962a8c4d 100644 --- a/src/mod/formats/mod_sndfile/mod_sndfile.c +++ b/src/mod/formats/mod_sndfile/mod_sndfile.c @@ -39,6 +39,9 @@ SWITCH_MODULE_DEFINITION(mod_sndfile, mod_sndfile_load, mod_sndfile_shutdown, NU static struct { switch_hash_t *format_hash; + int debug; + char *allowed_extensions[100]; + int allowed_extensions_count; } globals; struct format_map { @@ -56,6 +59,16 @@ typedef struct sndfile_context sndfile_context; static switch_status_t sndfile_perform_open(sndfile_context *context, const char *path, int mode, switch_file_handle_t *handle); +static void reverse_channel_count(switch_file_handle_t *handle) { + /* for recording stereo conferences and stereo calls in audio file formats that support only 1 channel. + * "{force_channels=1}" does something similar, but here switch_core_file_open() was already called and we + * have the handle and we change the count before _read_ or _write_ are called (where muxing is done). 
*/ + if (handle->channels > 1) { + handle->real_channels = handle->channels; + handle->channels = handle->mm.channels = 1; + } +} + static switch_status_t sndfile_file_open(switch_file_handle_t *handle, const char *path) { sndfile_context *context; @@ -119,41 +132,107 @@ static switch_status_t sndfile_file_open(switch_file_handle_t *handle, const cha if (!strcmp(ext, "raw")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_PCM_16; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 8000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "r8")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_PCM_16; - context->sfinfo.samplerate = 8000; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 8000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "r16")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_PCM_16; - context->sfinfo.samplerate = 16000; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 16000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "r24")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_PCM_24; - context->sfinfo.samplerate = 24000; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 24000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "r32")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_PCM_32; - context->sfinfo.samplerate = 32000; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 32000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "gsm")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_GSM610; context->sfinfo.channels = 1; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } context->sfinfo.samplerate = 8000; } else if (!strcmp(ext, "ul") || !strcmp(ext, "ulaw")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_ULAW; - context->sfinfo.channels = 1; - context->sfinfo.samplerate = 8000; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 8000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "al") || !strcmp(ext, "alaw")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_ALAW; - context->sfinfo.channels = 1; - context->sfinfo.samplerate = 8000; + if (mode & SFM_READ) { + context->sfinfo.samplerate = 8000; + context->sfinfo.channels = 1; + } } else if (!strcmp(ext, "vox")) { context->sfinfo.format = SF_FORMAT_RAW | SF_FORMAT_VOX_ADPCM; context->sfinfo.channels = 1; context->sfinfo.samplerate = 8000; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } } else if (!strcmp(ext, "adpcm")) { context->sfinfo.format = SF_FORMAT_WAV | SF_FORMAT_IMA_ADPCM; context->sfinfo.channels = 1; context->sfinfo.samplerate = 8000; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } } else if (!strcmp(ext, "oga") || !strcmp(ext, "ogg")) { context->sfinfo.format = SF_FORMAT_OGG | SF_FORMAT_VORBIS; - context->sfinfo.samplerate = handle->samplerate; + if (mode & SFM_READ) { + context->sfinfo.samplerate = handle->samplerate; + } + } else if (!strcmp(ext, "wve")) { + context->sfinfo.format = SF_FORMAT_WVE | SF_FORMAT_ALAW; + context->sfinfo.channels = 1; + context->sfinfo.samplerate = 8000; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } + } else if (!strcmp(ext, "htk")) { + context->sfinfo.format = SF_FORMAT_HTK | SF_FORMAT_PCM_16; + context->sfinfo.channels = 1; + context->sfinfo.samplerate = 8000; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } + } else if (!strcmp(ext, "iff")) { + context->sfinfo.format = SF_FORMAT_AIFF | SF_FORMAT_PCM_16; + context->sfinfo.channels = 1; + context->sfinfo.samplerate = 
8000; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } + } else if (!strcmp(ext, "xi")) { + context->sfinfo.format = SF_FORMAT_XI | SF_FORMAT_DPCM_16; + context->sfinfo.channels = 1; + context->sfinfo.samplerate = 44100; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } + } else if (!strcmp(ext, "sds")) { + context->sfinfo.format = SF_FORMAT_SDS | SF_FORMAT_PCM_16; + context->sfinfo.channels = 1; + context->sfinfo.samplerate = 8000; + if (mode & SFM_WRITE) { + reverse_channel_count(handle); + } } if ((mode & SFM_WRITE) && sf_format_check(&context->sfinfo) == 0) { @@ -199,12 +278,15 @@ static switch_status_t sndfile_file_open(switch_file_handle_t *handle, const cha if (!context->handle) { if (sndfile_perform_open(context, path, mode, handle) != SWITCH_STATUS_SUCCESS) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error Opening File [%s] [%s]\n", path, sf_strerror(context->handle)); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "Error Opening File [%s] [%s]\n", path, sf_strerror(context->handle)); status = SWITCH_STATUS_GENERR; goto end; } } - //switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Opening File [%s] rate %dhz\n", path, context->sfinfo.samplerate); + if (globals.debug) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, + "Opening File [%s] rate [%dhz] channels: [%d]\n", path, context->sfinfo.samplerate, (uint8_t) context->sfinfo.channels); + } handle->samples = (unsigned int) context->sfinfo.frames; handle->samplerate = context->sfinfo.samplerate; handle->channels = (uint8_t) context->sfinfo.channels; @@ -365,6 +447,20 @@ static switch_status_t sndfile_file_get_string(switch_file_handle_t *handle, swi return SWITCH_STATUS_FALSE; } +static switch_bool_t exten_is_allowed(const char *exten) { + int i; + if (!globals.allowed_extensions[0]) { + // defaults to allowing all extensions if param "allowed-extensions" not set in cfg + return SWITCH_TRUE; + } + for (i = 0 ; i < globals.allowed_extensions_count; i++) { + if (exten && globals.allowed_extensions[i] && !strcasecmp(globals.allowed_extensions[i], exten)) { + return SWITCH_TRUE; + } + } + return SWITCH_FALSE; +} + /* Registration */ static char **supported_formats; @@ -407,6 +503,9 @@ static switch_status_t setup_formats(switch_memory_pool_t *pool) skip = 0; info.format = m; sf_command(NULL, SFC_GET_FORMAT_MAJOR, &info, sizeof(info)); + if (!exten_is_allowed(info.extension)) { + continue; + } switch_log_printf(SWITCH_CHANNEL_LOG_CLEAN, SWITCH_LOG_INFO, "%s (extension \"%s\")\n", info.name, info.extension); for (x = 0; x < len; x++) { if (supported_formats[x] == info.extension) { @@ -470,7 +569,9 @@ static switch_status_t setup_formats(switch_memory_pool_t *pool) } } for (m = 0; m < exlen; m++) { - supported_formats[len++] = extras[m]; + if (exten_is_allowed(extras[m])) { + supported_formats[len++] = extras[m]; + } } switch_log_printf(SWITCH_CHANNEL_LOG_CLEAN, SWITCH_LOG_NOTICE, "================================================================================\n"); @@ -478,12 +579,49 @@ return SWITCH_STATUS_SUCCESS; } +#define SNDFILE_DEBUG_SYNTAX "<on|off>" +SWITCH_STANDARD_API(mod_sndfile_debug) +{ + if (zstr(cmd)) { + stream->write_function(stream, "-USAGE: %s\n", SNDFILE_DEBUG_SYNTAX); + } else { + if (!strcasecmp(cmd, "on")) { + globals.debug = 1; + stream->write_function(stream, "Sndfile Debug: on\n"); + } else if (!strcasecmp(cmd, "off")) { + globals.debug = 0; + stream->write_function(stream, 
"Sndfile Debug: off\n"); + } else { + stream->write_function(stream, "-USAGE: %s\n", SNDFILE_DEBUG_SYNTAX); + } + } + return SWITCH_STATUS_SUCCESS; +} + SWITCH_MODULE_LOAD_FUNCTION(mod_sndfile_load) { switch_file_interface_t *file_interface; + switch_api_interface_t *commands_api_interface; + char *cf = "sndfile.conf"; + switch_xml_t cfg, xml, settings, param; + + memset(&globals, 0, sizeof(globals)); switch_core_hash_init(&globals.format_hash); + if ((xml = switch_xml_open_cfg(cf, &cfg, NULL))) { + if ((settings = switch_xml_child(cfg, "settings"))) { + for (param = switch_xml_child(settings, "param"); param; param = param->next) { + char *var = (char *) switch_xml_attr_soft(param, "name"); + char *val = (char *) switch_xml_attr_soft(param, "value"); + if (!strcasecmp(var, "allowed-extensions") && val) { + globals.allowed_extensions_count = switch_separate_string(val, ',', globals.allowed_extensions, (sizeof(globals.allowed_extensions) / sizeof(globals.allowed_extensions[0]))); + } + } + } + switch_xml_free(xml); + } + if (setup_formats(pool) != SWITCH_STATUS_SUCCESS) { return SWITCH_STATUS_FALSE; } @@ -502,6 +640,11 @@ SWITCH_MODULE_LOAD_FUNCTION(mod_sndfile_load) file_interface->file_set_string = sndfile_file_set_string; file_interface->file_get_string = sndfile_file_get_string; + SWITCH_ADD_API(commands_api_interface, "sndfile_debug", "Set sndfile debug", mod_sndfile_debug, SNDFILE_DEBUG_SYNTAX); + + switch_console_set_complete("add sndfile_debug on"); + switch_console_set_complete("add sndfile_debug off"); + /* indicate that the module should continue to be loaded */ return SWITCH_STATUS_SUCCESS; } diff --git a/src/mod/formats/mod_sndfile/test/sounds/hello_stereo.wav b/src/mod/formats/mod_sndfile/test/sounds/hello_stereo.wav new file mode 100644 index 00000000000..83e83889482 Binary files /dev/null and b/src/mod/formats/mod_sndfile/test/sounds/hello_stereo.wav differ diff --git a/src/mod/formats/mod_sndfile/test/sounds/hi.wav b/src/mod/formats/mod_sndfile/test/sounds/hi.wav new file mode 100644 index 00000000000..fec0d6b0734 Binary files /dev/null and b/src/mod/formats/mod_sndfile/test/sounds/hi.wav differ diff --git a/src/mod/formats/mod_sndfile/test/test_conf/freeswitch.xml b/src/mod/formats/mod_sndfile/test/test_conf/freeswitch.xml new file mode 100644 index 00000000000..c29aed89913 --- /dev/null +++ b/src/mod/formats/mod_sndfile/test/test_conf/freeswitch.xml @@ -0,0 +1,27 @@ + + +
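The body of test_conf/freeswitch.xml was lost above (only the diff's + markers survive), but judging from the extension arrays in test_sndfile_conf.c later in this diff, it presumably sets mod_sndfile's new "allowed-extensions" param to a comma-separated value along the lines of "wav,raw,r8,r16"; that value is an assumption here. A plain-C sketch of the split-and-match logic the module applies to such a value, with switch_separate_string() modeled by strtok_r() and all names illustrative:

/* allowed_ext_sketch.c - comma-list parse plus case-insensitive lookup */
#include <stdio.h>
#include <string.h>
#include <strings.h> /* strcasecmp */

#define MAX_EXTS 100

static char *allowed[MAX_EXTS];
static int allowed_count = 0;

/* split val in place on ',' (rough model of switch_separate_string) */
static void parse_allowed(char *val)
{
	char *save = NULL, *tok;
	for (tok = strtok_r(val, ",", &save);
	     tok && allowed_count < MAX_EXTS;
	     tok = strtok_r(NULL, ",", &save)) {
		allowed[allowed_count++] = tok;
	}
}

static int exten_is_allowed(const char *ext)
{
	int i;
	if (!allowed_count) {
		return 1; /* param unset: allow every extension, as in the module */
	}
	for (i = 0; i < allowed_count; i++) {
		if (ext && !strcasecmp(allowed[i], ext)) {
			return 1;
		}
	}
	return 0;
}

int main(void)
{
	char val[] = "wav,raw,r8,r16"; /* assumed test value, not from the source */
	parse_allowed(val);
	printf("wav allowed: %d, gsm allowed: %d\n",
	       exten_is_allowed("wav"), exten_is_allowed("gsm"));
	return 0;
}

The allow-all default matters for compatibility: installs that never configure the param keep registering every libsndfile format, exactly as before this change.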
diff --git a/src/mod/formats/mod_sndfile/test/test_conf/freeswitch.xml.fsxml b/src/mod/formats/mod_sndfile/test/test_conf/freeswitch.xml.fsxml new file mode 100644 index 00000000000..c29aed89913 --- /dev/null +++ b/src/mod/formats/mod_sndfile/test/test_conf/freeswitch.xml.fsxml @@ -0,0 +1,27 @@ + + +
diff --git a/src/mod/formats/mod_sndfile/test/test_formats_and_muxing/freeswitch.xml b/src/mod/formats/mod_sndfile/test/test_formats_and_muxing/freeswitch.xml new file mode 100644 index 00000000000..39178db499e --- /dev/null +++ b/src/mod/formats/mod_sndfile/test/test_formats_and_muxing/freeswitch.xml @@ -0,0 +1,21 @@ + + +
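test_sndfile.c below loops every extension in its list through record, reopen, and read, which exercises the reverse_channel_count() path added earlier in this diff: for formats that can hold only one channel, the handle keeps the call's channel count in real_channels and the core muxes the interleaved stereo frames down to mono before they reach the format's write function. A rough standalone sketch of such a downmix step follows; it uses a simple average and is illustrative only, not the core's muxing code.

/* downmix_sketch.c - interleaved stereo to mono by averaging L and R */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static void mux_stereo_to_mono(const int16_t *in, int16_t *out, size_t frames)
{
	size_t i;
	for (i = 0; i < frames; i++) {
		/* widen to 32 bits so the sum cannot overflow int16_t */
		out[i] = (int16_t)(((int32_t)in[2 * i] + (int32_t)in[2 * i + 1]) / 2);
	}
}

int main(void)
{
	int16_t stereo[8] = { 100, 200, -100, 300, 0, 0, 32000, 32000 };
	int16_t mono[4];
	size_t i;

	mux_stereo_to_mono(stereo, mono, 4);
	for (i = 0; i < 4; i++) {
		printf("%d ", mono[i]); /* 150 100 0 32000 */
	}
	printf("\n");
	return 0;
}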
diff --git a/src/mod/formats/mod_sndfile/test/test_sndfile.c b/src/mod/formats/mod_sndfile/test/test_sndfile.c new file mode 100644 index 00000000000..a9783ffa15e --- /dev/null +++ b/src/mod/formats/mod_sndfile/test/test_sndfile.c @@ -0,0 +1,451 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2018, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * + * The Initial Developer of the Original Code is + * Anthony Minessale II + * Portions created by the Initial Developer are Copyright (C) + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Dragos Oancea + * + * test_sndfile.c -- tests mod_sndfile + * + */ +#include <switch.h> +#include <stdlib.h> + +#include <test/switch_test.h> + +/* Media files used: + * + * 1. hi.wav + * + * General + * Complete name : test/sounds/hi.wav + * Format : Wave + * File size : 67.2 KiB + * Duration : 2 s 150 ms + * Overall bit rate mode : Constant + * Overall bit rate : 256 kb/s + * + * Audio + * Format : PCM + * Format settings, Endianness : Little + * Format settings, Sign : Signed + * Codec ID : 1 + * Duration : 2 s 150 ms + * Bit rate mode : Constant + * Bit rate : 256 kb/s + * Channel(s) : 1 channel + * Sampling rate : 16.0 kHz + * Bit depth : 16 bits + * Stream size : 67.2 KiB (100%) + * + * + * 2. 
hello_stereo.wav + * + * + * General + * Complete name : sounds/hello_stereo.wav + * Format : Wave + * File size : 220 KiB + * Duration : 1 s 277 ms + * Overall bit rate mode : Constant + * Overall bit rate : 1 412 kb/s + * + * Audio + * Format : PCM + * Format settings, Endianness : Little + * Format settings, Sign : Signed + * Codec ID : 1 + * Duration : 1 s 277 ms + * Bit rate mode : Constant + * Bit rate : 1 411.2 kb/s + * Channel(s) : 2 channels + * Sampling rate : 44.1 kHz + * Bit depth : 16 bits + * Stream size : 220 KiB (100%) + * + +*/ + +char *extensions[] = { + "aiff", "au", "avr", "caf", + "flac", "htk", "iff", "mat", + "mpc", "paf", "pvf", "rf64", + "sd2", "sds", "sf", "voc", + "w64", "wav", "wve", "xi", + "raw", "r8", "r16", "r24", + "r32", "ul", "ulaw", "al", + "alaw", "gsm", "vox", "oga", "ogg"}; + +FST_CORE_BEGIN("test_formats_and_muxing") +{ + FST_SUITE_BEGIN(test_sndfile) + { + FST_SETUP_BEGIN() + { + fst_requires_module("mod_loopback"); + fst_requires_module("mod_sndfile"); + } + FST_SETUP_END() + + FST_TEARDOWN_BEGIN() + { + } + FST_TEARDOWN_END() + + FST_TEST_BEGIN(sndfile_write_read_mono) + { + /* play mono, record mono, open mono */ + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + static char play_filename[] = "../sounds/hi.wav"; + char path[4096]; + switch_file_handle_t fh = { 0 }; + int16_t *audiobuf; + switch_size_t len, rd; + char *recording; + int i, exlen, timeout_sec = 2, duration = 3000; /*ms*/ + switch_stream_handle_t stream = { 0 }; + + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, play_filename); + + SWITCH_STANDARD_STREAM(stream); + + switch_api_execute("sndfile_debug", "on", session, &stream); + + switch_safe_free(stream.data); + + exlen = (sizeof(extensions) / sizeof(extensions[0])); + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", timeout_sec, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + for (i = 0; i < exlen; i++) { + + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Testing media file extension: [%s]\n", extensions[i]); + + recording = switch_mprintf("/tmp/%s.%s", switch_core_session_get_uuid(session), extensions[i]); + status = switch_ivr_record_session(session, recording, duration, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + status = switch_ivr_play_file(session, NULL, path, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_sleep(1000 * duration); // wait for audio to finish playing + + switch_ivr_stop_record_session(session, "all"); + + status = switch_core_file_open(&fh, recording, 1, 8000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + rd = 320; // samples + len = rd * sizeof(*audiobuf); + switch_zmalloc(audiobuf, len); + + status = switch_core_file_read(&fh, audiobuf, &rd); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_check(rd == 320); // check that we read the wanted number of samples + + status = switch_core_file_close(&fh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(audiobuf); + + unlink(recording); + + switch_safe_free(recording); + + switch_sleep(1000000); + } + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + 
switch_core_session_rwunlock(session); + } + + FST_TEST_END() + + FST_TEST_BEGIN(sndfile_write_read_m2s) + { + /* play mono file, record mono, open stereo */ + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + static char play_filename[] = "../sounds/hi.wav"; + char path[4096]; + switch_file_handle_t fh = { 0 }; + int16_t *audiobuf; + switch_size_t len, rd; + char *recording; + int i, exlen, channels = 2, timeout_sec = 2, duration = 3000; /*ms*/ + switch_stream_handle_t stream = { 0 }; + + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, play_filename); + + SWITCH_STANDARD_STREAM(stream); + + switch_api_execute("sndfile_debug", "on", session, &stream); + + switch_safe_free(stream.data); + + exlen = (sizeof(extensions) / sizeof(extensions[0])); + + for (i = 0; i < exlen; i++) { + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", timeout_sec, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Testing media file extension: [%s]\n", extensions[i]); + + recording = switch_mprintf("/tmp/%s.%s", switch_core_session_get_uuid(session), extensions[i]); + + status = switch_ivr_record_session(session, recording, duration, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + status = switch_ivr_play_file(session, NULL, path, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_sleep(1000 * duration); // wait for audio to finish playing + + switch_ivr_stop_record_session(session, "all"); + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + switch_core_session_rwunlock(session); + + status = switch_core_file_open(&fh, recording, channels, 8000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + rd = 320; // samples + len = rd * sizeof(*audiobuf) * channels; + switch_zmalloc(audiobuf, len); + + status = switch_core_file_read(&fh, audiobuf, &rd); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_check(rd == 320); // check that we read the wanted number of samples + + status = switch_core_file_close(&fh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(audiobuf); + + unlink(recording); + + switch_safe_free(recording); + + switch_sleep(1000000); + } + } + FST_TEST_END() + + FST_TEST_BEGIN(sndfile_write_read_s2m) + { + /* play stereo wav, record stereo, open stereo file */ + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + static char play_filename[] = "../sounds/hello_stereo.wav"; + char path[4096]; + switch_file_handle_t fh = { 0 }; + int16_t *audiobuf; + switch_size_t len, rd; + char *recording, *rec_path; + int i, exlen, channels = 2, timeout_sec = 2, duration = 3000; /*ms*/ + switch_stream_handle_t stream = { 0 }; + + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, play_filename); + + SWITCH_STANDARD_STREAM(stream); + + switch_api_execute("sndfile_debug", "on", session, &stream); + + switch_safe_free(stream.data); + + exlen = (sizeof(extensions) / sizeof(extensions[0])); + + for (i = 0; i < exlen; i++) { + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", timeout_sec, 
NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Testing media file extension: [%s]\n", extensions[i]); + + rec_path = switch_mprintf("/tmp/%s.%s", switch_core_session_get_uuid(session), extensions[i]); + recording = switch_mprintf("{force_channels=2}%s", rec_path); + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_set_variable(channel, "enable_file_write_buffering", "true"); + + status = switch_ivr_record_session(session, recording, duration, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + status = switch_ivr_play_file(session, NULL, path, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_sleep(1000 * duration); // wait for audio to finish playing + + switch_ivr_stop_record_session(session, "all"); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + switch_core_session_rwunlock(session); + + status = switch_core_file_open(&fh, recording, channels, 8000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + rd = 320; // samples + len = rd * sizeof(*audiobuf) * channels; + switch_zmalloc(audiobuf, len); + + status = switch_core_file_read(&fh, audiobuf, &rd); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_check(rd == 320); // check that we read the wanted number of samples + + status = switch_core_file_close(&fh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(audiobuf); + + unlink(rec_path); + + switch_safe_free(rec_path); + switch_safe_free(recording); + + switch_sleep(1000000); + } + } + + FST_TEST_END() + + FST_TEST_BEGIN(sndfile_write_read_stereo) + { + /* play stereo wav, record stereo, open stereo file */ + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + static char play_filename[] = "../sounds/hello_stereo.wav"; + char path[4096]; + switch_file_handle_t fh = { 0 }; + int16_t *audiobuf; + switch_size_t len, rd; + char *recording, *rec_path; + int i, exlen, channels = 2, timeout_sec = 2, duration = 3000; /*ms*/ + switch_stream_handle_t stream = { 0 }; + + sprintf(path, "%s%s%s", SWITCH_GLOBAL_dirs.conf_dir, SWITCH_PATH_SEPARATOR, play_filename); + + SWITCH_STANDARD_STREAM(stream); + + switch_api_execute("sndfile_debug", "on", session, &stream); + + switch_safe_free(stream.data); + + exlen = (sizeof(extensions) / sizeof(extensions[0])); + + for (i = 0; i < exlen; i++) { + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", timeout_sec, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Testing media file extension: [%s]\n", extensions[i]); + + rec_path = switch_mprintf("/tmp/%s.%s", switch_core_session_get_uuid(session), extensions[i]); + recording = switch_mprintf("{force_channels=2}%s", rec_path); + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_set_variable(channel, "RECORD_STEREO", "true"); + switch_channel_set_variable(channel, "enable_file_write_buffering", "true"); + + status = switch_ivr_record_session(session, recording, duration, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + status = switch_ivr_play_file(session, NULL, path, 
NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_sleep(1000 * duration); // wait for audio to finish playing + + switch_ivr_stop_record_session(session, "all"); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + switch_core_session_rwunlock(session); + + status = switch_core_file_open(&fh, recording, channels, 8000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + rd = 320; // samples + len = rd * sizeof(*audiobuf) * channels; + switch_zmalloc(audiobuf, len); + + status = switch_core_file_read(&fh, audiobuf, &rd); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_check(rd == 320); // check that we read the wanted number of samples + + status = switch_core_file_close(&fh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(audiobuf); + + unlink(rec_path); + + switch_safe_free(rec_path); + switch_safe_free(recording); + + switch_sleep(1000000); + } + } + FST_TEST_END() + + FST_TEST_BEGIN(unload_mod_sndfile) + { + const char *err = NULL; + switch_sleep(1000000); + fst_check(switch_loadable_module_unload_module((char *)"../.libs", (char *)"mod_sndfile", SWITCH_TRUE, &err) == SWITCH_STATUS_SUCCESS); + } + FST_TEST_END() + } + FST_SUITE_END() +} +FST_CORE_END() diff --git a/src/mod/formats/mod_sndfile/test/test_sndfile_conf.c b/src/mod/formats/mod_sndfile/test/test_sndfile_conf.c new file mode 100644 index 00000000000..51505919222 --- /dev/null +++ b/src/mod/formats/mod_sndfile/test/test_sndfile_conf.c @@ -0,0 +1,154 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2005-2018, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * + * The Initial Developer of the Original Code is + * Anthony Minessale II + * Portions created by the Initial Developer are Copyright (C) + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Dragos Oancea + * + * test_sndfile_conf.c -- tests mod_sndfile config + * + */ +#include <switch.h> +#include <stdlib.h> + +#include <test/switch_test.h> + +char *extensions_will_fail[] = { // not allowed through conf file. + "ul", "gsm", "vox", "ogg" +}; + +char *extensions_will_succeed[] = { // allowed through conf file. 
+ "wav", "raw", "r8", "r16" +}; + +FST_CORE_BEGIN("test_conf") +{ + FST_SUITE_BEGIN(test_sndfile) + { + FST_SETUP_BEGIN() + { + fst_requires_module("mod_loopback"); + fst_requires_module("mod_sndfile"); + } + FST_SETUP_END() + + FST_TEARDOWN_BEGIN() + { + } + FST_TEARDOWN_END() + + FST_TEST_BEGIN(sndfile_exten_not_allowed) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + char *recording; + int i, exlen, timeout_sec = 2; + switch_stream_handle_t stream = { 0 }; + + SWITCH_STANDARD_STREAM(stream); + + switch_api_execute("sndfile_debug", "on", session, &stream); + + switch_safe_free(stream.data); + + exlen = (sizeof(extensions_will_fail) / sizeof(extensions_will_fail[0])); + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", timeout_sec, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + for (i = 0; i < exlen; i++) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Testing media file extension: [%s]\n", extensions_will_fail[i]); + + recording = switch_mprintf("/tmp/%s.%s", switch_core_session_get_uuid(session), extensions_will_fail[i]); + status = switch_ivr_record_session(session, recording, 3000, NULL); + fst_check(status == SWITCH_STATUS_GENERR); + if (status == SWITCH_STATUS_SUCCESS) { + // not expected + unlink(recording); + } + + switch_safe_free(recording); + } + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + switch_core_session_rwunlock(session); + + switch_sleep(1000000); + } + FST_TEST_END() + FST_TEST_BEGIN(sndfile_exten_allowed) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + char *recording; + int i, exlen, timeout_sec = 2; + switch_stream_handle_t stream = { 0 }; + + SWITCH_STANDARD_STREAM(stream); + + switch_api_execute("sndfile_debug", "on", session, &stream); + + switch_safe_free(stream.data); + + exlen = (sizeof(extensions_will_succeed) / sizeof(extensions_will_succeed[0])); + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", timeout_sec, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + for (i = 0; i < exlen; i++) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Testing media file extension: [%s]\n", extensions_will_succeed[i]); + + recording = switch_mprintf("/tmp/%s.%s", switch_core_session_get_uuid(session), extensions_will_succeed[i]); + status = switch_ivr_record_session(session, recording, 3000, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + unlink(recording); + + switch_safe_free(recording); + } + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_channel_hangup(channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(!switch_channel_ready(channel)); + + switch_core_session_rwunlock(session); + + switch_sleep(1000000); + } + FST_TEST_END() + } + FST_SUITE_END() +} +FST_CORE_END() diff --git a/src/mod/formats/mod_vlc/mod_vlc.c b/src/mod/formats/mod_vlc/mod_vlc.c index 783d398c74e..af83330d07d 100644 --- a/src/mod/formats/mod_vlc/mod_vlc.c +++ b/src/mod/formats/mod_vlc/mod_vlc.c @@ -62,7 +62,7 @@ typedef int (*imem_get_t)(void *data, const char *cookie, typedef void 
(*imem_release_t)(void *data, const char *cookie, size_t, void *); /* Change value to -vvv for vlc related debug. Be careful since vlc is at least as verbose as FS about logging */ -const char *vlc_args[] = {"-v"}; +const char *vlc_args[] = {"-vvv"}; //const char *vlc_args[] = {"--network-caching=0"}; //--sout-mux-caching @@ -274,6 +274,8 @@ static void vlc_media_av_state_callback(const libvlc_event_t * event, void * dat if (!(vcontext = acontext->vcontext)) { return; } + } else { + return; } switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Got a libvlc_MediaStateChanged callback. New state: %d\n", new_state); @@ -357,6 +359,8 @@ void vlc_file_play_audio_callback(void *data, const void *samples, unsigned coun if (!(vcontext = acontext->vcontext)) { return; } + } else { + return; } switch_mutex_lock(vcontext->audio_mutex); @@ -585,15 +589,17 @@ static switch_status_t av_init_handle(switch_file_handle_t *handle, switch_image int32_t offset = 250; const char *tmp; vlc_file_context_t *acontext = handle->private_info; - const char * opts[25] = { - *vlc_args, - switch_core_sprintf(acontext->pool, "--sout=%s", acontext->path) - }; + const char * opts[25] = {0}; int argc = 2; vlc_video_context_t *vcontext; + if (!acontext) { + return SWITCH_STATUS_FALSE; + } + + opts[0] = *vlc_args; + opts[1] = switch_core_sprintf(acontext->pool, "--sout=%s", acontext->path); - vcontext = acontext->vcontext; pool = acontext->pool; @@ -683,10 +689,6 @@ static switch_status_t vlc_file_av_open(switch_file_handle_t *handle, const char vlc_file_context_t *acontext = handle->private_info; vlc_video_context_t *vcontext; - if (acontext) { - vcontext = acontext->vcontext; - } - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "VLC open %s for reading\n", acontext->path); vcontext = switch_core_alloc(acontext->pool, sizeof(vlc_video_context_t)); @@ -766,6 +768,10 @@ static switch_status_t vlc_file_open(switch_file_handle_t *handle, const char *p context = switch_core_alloc(handle->memory_pool, sizeof(*context)); context->pool = handle->memory_pool; context->vlc_handle = libvlc_new(sizeof(vlc_args)/sizeof(char *), vlc_args); + if (!context->vlc_handle) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "VLC error: cannot initialise VLC handle\n"); + return SWITCH_STATUS_GENERR; + } libvlc_log_set(context->vlc_handle, log_cb, NULL); realpath = path; @@ -967,9 +973,11 @@ static switch_status_t vlc_file_av_read(switch_file_handle_t *handle, void *data if (!(vcontext = acontext->vcontext)) { return SWITCH_STATUS_FALSE; } + } else { + return SWITCH_STATUS_FALSE; } - if (vcontext->err) { + if (vcontext && vcontext->err) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "VLC ended\n"); return SWITCH_STATUS_GENERR; } @@ -1126,6 +1134,8 @@ static switch_status_t vlc_file_read_video(switch_file_handle_t *handle, switch_ if (!(vcontext = acontext->vcontext)) { return SWITCH_STATUS_FALSE; } + } else { + return SWITCH_STATUS_FALSE; } if (vcontext->err) { @@ -2247,9 +2257,7 @@ static switch_status_t channel_on_consume_media(switch_core_session_t *session) static switch_status_t channel_on_destroy(switch_core_session_t *session) { - vlc_private_t *tech_pvt = switch_core_session_get_private(session); - - switch_assert(tech_pvt && tech_pvt->context); + vlc_private_t *tech_pvt; if ((tech_pvt = switch_core_session_get_private(session))) { @@ -2264,6 +2272,8 @@ static switch_status_t channel_on_destroy(switch_core_session_t *session) switch_media_handle_destroy(session); } + switch_assert(tech_pvt && 
tech_pvt->context); + switch_yield(50000); if (tech_pvt->context->mp) libvlc_media_player_stop(tech_pvt->context->mp); diff --git a/src/mod/languages/mod_java/freeswitch_java.cpp b/src/mod/languages/mod_java/freeswitch_java.cpp index d71d212b45b..9c7aa74f13d 100644 --- a/src/mod/languages/mod_java/freeswitch_java.cpp +++ b/src/mod/languages/mod_java/freeswitch_java.cpp @@ -207,7 +207,7 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t jstring digits = NULL; jint res; jstring callbackResult = NULL; - switch_status_t status; + switch_status_t status = SWITCH_STATUS_FALSE; if (cb_state.function == NULL) { @@ -229,7 +229,6 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t onDTMF = env->GetMethodID(klass, "onDTMF", "(Ljava/lang/Object;ILjava/lang/String;)Ljava/lang/String;"); if (onDTMF == NULL) { - status = SWITCH_STATUS_FALSE; goto done; } @@ -238,7 +237,6 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t digits = env->NewStringUTF((char*)input); if (digits == NULL) { - status = SWITCH_STATUS_FALSE; goto done; } callbackResult = (jstring) env->CallObjectMethod((jobject)cb_state.function, onDTMF, digits, itype, (jstring)cb_state.funcargs); @@ -248,8 +246,6 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t status = process_callback_result((char*) callbackResultUTF); env->ReleaseStringUTFChars(callbackResult, callbackResultUTF); } - else - status = SWITCH_STATUS_FALSE; } else if (itype == SWITCH_INPUT_TYPE_EVENT) { @@ -265,32 +261,27 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t env->FindClass("org/freeswitch/Event"); if (Event == NULL) { - status = SWITCH_STATUS_FALSE; goto cleanup; } constructor = env->GetMethodID(Event, "<init>", "()V"); if (constructor == NULL) { - status = SWITCH_STATUS_FALSE; goto cleanup; } event = env->CallStaticObjectMethod(Event, constructor); if (event == NULL) { - status = SWITCH_STATUS_FALSE; goto cleanup; } setBody = env->GetMethodID(Event, "setBody", "(Ljava/lang/String;)V"); if (setBody == NULL) { - status = SWITCH_STATUS_FALSE; goto cleanup; } body = env->NewStringUTF(switch_event->body); if (body == NULL) { - status = SWITCH_STATUS_FALSE; goto cleanup; } env->CallVoidMethod(event, setBody, body); @@ -300,7 +291,6 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t addHeader = env->GetMethodID(Event, "addHeader", "(Ljava/lang/String;Ljava/lang/String;)V"); if (addHeader == NULL) { - status = SWITCH_STATUS_FALSE; goto cleanup; } for (header = switch_event->headers; header; header = header->next) @@ -324,7 +314,6 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t env->DeleteLocalRef(value); if (env->ExceptionOccurred()) { - status = SWITCH_STATUS_FALSE; goto cleanup; } } @@ -336,8 +325,6 @@ switch_status_t JavaSession::run_dtmf_callback(void *input, switch_input_type_t status = process_callback_result((char*) callbackResultUTF); env->ReleaseStringUTFChars(callbackResult, callbackResultUTF); } - else - status = SWITCH_STATUS_FALSE; cleanup: if (Event != NULL) diff --git a/src/mod/languages/mod_java/modjava.c b/src/mod/languages/mod_java/modjava.c index 3290d49c57c..a334af302d1 100644 --- a/src/mod/languages/mod_java/modjava.c +++ b/src/mod/languages/mod_java/modjava.c @@ -323,7 +323,6 @@ static switch_status_t create_java_vm(JavaVMOption *options, int optionCount, vm { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Unable to find 
'org.freeswitch.Launcher' class!\n"); (*env)->ExceptionDescribe(env); - status = SWITCH_STATUS_FALSE; } // store a global reference for use in the launch_java() function diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/API.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/API.java index 7d0824dbefc..d9686f9620c 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/API.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/API.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/CoreSession.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/CoreSession.java index d111c2f8902..07571c683ad 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/CoreSession.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/CoreSession.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/DTMF.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/DTMF.java index 71d6198f441..bf189103c9f 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/DTMF.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/DTMF.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/Event.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/Event.java index f3a06a2177e..fd2c933b201 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/Event.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/Event.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/EventConsumer.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/EventConsumer.java index 9a69a6c5e69..18af38ff9d3 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/EventConsumer.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/EventConsumer.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. 
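The run_dtmf_callback() hunks above are a pure error-handling cleanup: status is initialized to SWITCH_STATUS_FALSE at its declaration, so the many early goto done / goto cleanup branches no longer have to assign it before jumping, and only the success path sets it. A small self-contained C sketch of the idiom; the types and helper here are stand-ins, not the mod_java code:

/* status_goto_sketch.c - initialize failure status once, set success late */
#include <stdio.h>

typedef enum { STATUS_SUCCESS = 0, STATUS_FALSE = 1 } status_t;

/* stand-in for a JNI lookup that may fail */
static int lookup_method(const char *name)
{
	return name && name[0] != '\0';
}

static status_t run_callback(const char *method, const char *arg)
{
	status_t status = STATUS_FALSE; /* single initialization */

	if (!lookup_method(method)) {
		goto done; /* no per-branch "status = STATUS_FALSE;" needed */
	}
	if (!arg) {
		goto done;
	}
	status = STATUS_SUCCESS; /* only the success path assigns */
done:
	return status;
}

int main(void)
{
	printf("%d %d %d\n",
	       run_callback("onDTMF", "123"), /* 0 */
	       run_callback("", "123"),       /* 1 */
	       run_callback("onDTMF", NULL)); /* 1 */
	return 0;
}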
diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/IVRMenu.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/IVRMenu.java index 84ca8671b93..c61ce9e4e1f 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/IVRMenu.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/IVRMenu.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/JavaSession.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/JavaSession.java index 82573cbff90..4c71702a15c 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/JavaSession.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/JavaSession.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_JavaVM.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_JavaVM.java index 87d2efd11f8..9947a8d9358 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_JavaVM.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_JavaVM.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_int.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_int.java index b7aa92cec29..ca2ece7145c 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_int.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_int.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_p_switch_event_node_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_p_switch_event_node_t.java index bf20923d8fd..d7b9cf39ad1 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_p_switch_event_node_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_p_switch_event_node_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. 
diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_call_cause_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_call_cause_t.java index c668c339ba6..e205409f41e 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_call_cause_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_call_cause_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_state_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_state_t.java index b0443427aeb..d562494db96 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_state_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_state_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_t.java index eea4e2e1bd6..e72db712f5e 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_channel_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_core_session_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_core_session_t.java index fa595a73868..ac7971800ee 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_core_session_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_core_session_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_t.java index 060199a96f3..ef5d70a2a95 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). 
- * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_types_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_types_t.java index be40b44b6c0..f74001fb26a 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_types_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_event_types_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_args_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_args_t.java index 3fdb4a7d207..92af42a7372 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_args_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_args_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_type_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_type_t.java index 8bfe879d51c..6a8d3bc6b60 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_type_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_input_type_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_priority_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_priority_t.java index 713ca09d3d3..2685896bd68 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_priority_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_priority_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. 
diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_queue_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_queue_t.java index 10e2d38666f..b228fc67ae0 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_queue_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_queue_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_state_handler_table_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_state_handler_table_t.java index 989d3d403eb..74843c8821b 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_state_handler_table_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_state_handler_table_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_status_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_status_t.java index dc28f2e1f8a..59b15e06d41 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_status_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_status_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_stream_handle_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_stream_handle_t.java index 798349044c1..9d85b66c988 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_stream_handle_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_switch_stream_handle_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_uint32_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_uint32_t.java index 6e6ab7d8c39..91282b05fc8 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_uint32_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_uint32_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). 
- * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_void.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_void.java index 20f58354475..f6fce4f50dd 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_void.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/SWIGTYPE_p_void.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/Stream.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/Stream.java index 23eab06f398..0fc677caa46 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/Stream.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/Stream.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitch.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitch.java index 3d59d14638e..69ef167199e 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitch.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitch.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitchJNI.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitchJNI.java index e2448cfa7b5..cceea3402a3 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitchJNI.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/freeswitchJNI.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/input_callback_state_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/input_callback_state_t.java index f5bbcd62469..0ceaddd0254 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/input_callback_state_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/input_callback_state_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. 
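Beyond the version banners, the regenerated Lua wrapper below (hack.diff and mod_lua_wrap.cpp) switches the JSON results from lua_pushstring on a raw char* to lua_pushlstring on a std::string. Because lua_pushlstring takes an explicit length and Lua copies the bytes immediately, embedded NULs survive and the temporary string may be destroyed right after the call. A minimal sketch of that push pattern, assuming only a lua_State:

#include <string>
extern "C" {
#include <lua.h>
}

// Push a C++ string onto the Lua stack by pointer + length; Lua makes
// its own copy, so `result` can safely go out of scope immediately
// after this call returns.
static void push_result(lua_State *L, const std::string &result) {
    lua_pushlstring(L, result.data(), result.size());
}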
diff --git a/src/mod/languages/mod_java/src/org/freeswitch/swig/session_flag_t.java b/src/mod/languages/mod_java/src/org/freeswitch/swig/session_flag_t.java index e4b6d10e880..eb4fe761c24 100644 --- a/src/mod/languages/mod_java/src/org/freeswitch/swig/session_flag_t.java +++ b/src/mod/languages/mod_java/src/org/freeswitch/swig/session_flag_t.java @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * Do not make changes to this file unless you know what you are doing--modify * the SWIG interface file instead. diff --git a/src/mod/languages/mod_java/switch_swig_wrap.cpp b/src/mod/languages/mod_java/switch_swig_wrap.cpp index b54bb6cdeb0..f1b004dece7 100644 --- a/src/mod/languages/mod_java/switch_swig_wrap.cpp +++ b/src/mod/languages/mod_java/switch_swig_wrap.cpp @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * This file is not intended to be easily readable and contains a number of * coding conventions designed to improve portability and efficiency. Do not make diff --git a/src/mod/languages/mod_lua/Makefile.am b/src/mod/languages/mod_lua/Makefile.am index 5b8e1ba3749..663cda16773 100644 --- a/src/mod/languages/mod_lua/Makefile.am +++ b/src/mod/languages/mod_lua/Makefile.am @@ -4,13 +4,19 @@ include $(top_srcdir)/build/modmake.rulesam MODNAME=mod_lua AM_CFLAGS += $(CFLAGS) -D_GNU_SOURCE + +noinst_LTLIBRARIES = libluamod.la +libluamod_la_SOURCES = mod_lua.cpp freeswitch_lua.cpp mod_lua_wrap.cpp +libluamod_la_CXXFLAGS = $(AM_CPPFLAGS) $(LUA_CFLAGS) $(CPPFLAGS) +libluamod_la_CFLAGS = $(AM_CPPFLAGS) $(LUA_CFLAGS) $(CPPFLAGS) + mod_LTLIBRARIES = mod_lua.la mod_lua_la_SOURCES = mod_lua.cpp freeswitch_lua.cpp mod_lua_wrap.cpp mod_lua_la_CXXFLAGS = $(AM_CPPFLAGS) $(LUA_CFLAGS) $(CPPFLAGS) mod_lua_la_CFLAGS = $(AM_CPPFLAGS) $(LUA_CFLAGS) $(CPPFLAGS) -mod_lua_la_LIBADD = $(switch_builddir)/libfreeswitch.la +mod_lua_la_LIBADD = $(switch_builddir)/libfreeswitch.la mod_lua_la_LDFLAGS = -avoid-version -module -no-undefined -shared -lm $(AM_LDFLAGS) $(SOLINK) $(LUA_LIBS) reswig: swigclean mod_lua_wrap.cpp @@ -23,4 +29,12 @@ mod_lua_wrap.cpp: mod_lua_extra.c echo "#include \"mod_lua_extra.c\"" >> mod_lua_wrap.cpp patch -s -p0 -i hack.diff -SUBDIRS=. test +noinst_PROGRAMS = test/test_mod_lua + +test_test_mod_lua_SOURCES = test/test_mod_lua.c +test_test_mod_lua_CFLAGS = $(AM_CFLAGS) -I. 
-DSWITCH_TEST_BASE_DIR_FOR_CONF=\"${abs_builddir}/test\" -DSWITCH_TEST_BASE_DIR_OVERRIDE=\"${abs_builddir}/test\" +test_test_mod_lua_LDFLAGS = $(AM_LDFLAGS) -avoid-version -no-undefined $(freeswitch_LDFLAGS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) +test_test_mod_lua_LDADD = libluamod.la $(LUA_LIBS) + +TESTS = $(noinst_PROGRAMS) + diff --git a/src/mod/languages/mod_lua/freeswitch.i b/src/mod/languages/mod_lua/freeswitch.i index 3485c8168cb..501b50fb48f 100644 --- a/src/mod/languages/mod_lua/freeswitch.i +++ b/src/mod/languages/mod_lua/freeswitch.i @@ -1,6 +1,7 @@ %module freeswitch %include ../../../../swig_common.i //%include "cstring.i" +%include std_string.i /** * tell swig to treat these variables as mutable so they @@ -44,7 +45,6 @@ $1 = default_swiglua_fn; } - %ignore SwitchToMempool; %newobject EventConsumer::pop; %newobject Session; @@ -56,10 +56,7 @@ %newobject API::executeString; %newobject CoreSession::playAndDetectSpeech; %newobject JSON; -%newobject JSON::encode; %newobject JSON::decode; -%newobject JSON::execute; -%newobject JSON::execute2; %include "typemaps.i" %apply int *OUTPUT { int *len }; @@ -130,11 +127,11 @@ class JSON { JSON(); ~JSON(); cJSON *decode(const char *str); - char *encode(SWIGLUA_TABLE lua_table); + std::string encode(SWIGLUA_TABLE lua_table); cJSON *execute(const char *); cJSON *execute(SWIGLUA_TABLE table); - char *execute2(const char *); - char *execute2(SWIGLUA_TABLE table); + std::string execute2(const char *); + std::string execute2(SWIGLUA_TABLE table); void encode_empty_table_as_object(bool flag); void return_unformatted_json(bool flag); }; diff --git a/src/mod/languages/mod_lua/freeswitch_lua.cpp b/src/mod/languages/mod_lua/freeswitch_lua.cpp index 134906357e4..50d07783369 100644 --- a/src/mod/languages/mod_lua/freeswitch_lua.cpp +++ b/src/mod/languages/mod_lua/freeswitch_lua.cpp @@ -600,7 +600,7 @@ void JSON::LuaTable2cJSON(lua_State *L, int index, cJSON **json) // Stack is now the same as it was on entry to this function } -char *JSON::encode(SWIGLUA_TABLE lua_table) +std::string JSON::encode(SWIGLUA_TABLE lua_table) { lua_State *L = lua_table.L; cJSON *json = NULL; @@ -613,8 +613,10 @@ char *JSON::encode(SWIGLUA_TABLE lua_table) } char *s = _return_unformatted_json ? cJSON_PrintUnformatted(json) : cJSON_Print(json); + std::string result = std::string(s); + free(s); cJSON_Delete(json); - return s; + return result; } int JSON::cJSON2LuaTable(lua_State *L, cJSON *json) { @@ -724,16 +726,22 @@ cJSON *JSON::execute(SWIGLUA_TABLE table) return reply; } -char *JSON::execute2(const char *str) +std::string JSON::execute2(const char *str) { - cJSON *reply = execute(str); - - return _return_unformatted_json ? cJSON_PrintUnformatted(reply) : cJSON_Print(reply); + cJSON *reply = execute(str); + char *s = _return_unformatted_json ? cJSON_PrintUnformatted(reply) : cJSON_Print(reply); + std::string result = std::string(s); + free(s); + cJSON_Delete(reply); + return result; } -char *JSON::execute2(SWIGLUA_TABLE table) +std::string JSON::execute2(SWIGLUA_TABLE table) { - cJSON *reply = execute(table); - - return _return_unformatted_json ? cJSON_PrintUnformatted(reply) : cJSON_Print(reply); + cJSON *reply = execute(table); + char *s = _return_unformatted_json ? 
cJSON_PrintUnformatted(reply) : cJSON_Print(reply); + std::string result = std::string(s); + free(s); + cJSON_Delete(reply); + return result; } diff --git a/src/mod/languages/mod_lua/freeswitch_lua.h b/src/mod/languages/mod_lua/freeswitch_lua.h index e7acf3c99ec..c357916607a 100644 --- a/src/mod/languages/mod_lua/freeswitch_lua.h +++ b/src/mod/languages/mod_lua/freeswitch_lua.h @@ -8,6 +8,7 @@ extern "C" { #include "mod_lua_extra.h" } #include +#include #ifndef lua_pushglobaltable #define lua_pushglobaltable(L) lua_pushvalue(L,LUA_GLOBALSINDEX) @@ -89,11 +90,11 @@ namespace LUA { JSON(); ~JSON(); cJSON *decode(const char *); - char *encode(SWIGLUA_TABLE table); + std::string encode(SWIGLUA_TABLE table); cJSON *execute(const char *); cJSON *execute(SWIGLUA_TABLE table); - char *execute2(const char *); - char *execute2(SWIGLUA_TABLE table); + std::string execute2(const char *); + std::string execute2(SWIGLUA_TABLE table); void encode_empty_table_as_object(bool flag); void return_unformatted_json(bool flag); static int cJSON2LuaTable(lua_State *L, cJSON *json); diff --git a/src/mod/languages/mod_lua/hack.diff b/src/mod/languages/mod_lua/hack.diff index 308ef189e99..907efb0d7e9 100644 --- a/src/mod/languages/mod_lua/hack.diff +++ b/src/mod/languages/mod_lua/hack.diff @@ -1,6 +1,6 @@ --- mod_lua_wrap.cpp.old 2015-06-16 12:27:19.024000000 -0500 +++ mod_lua_wrap.cpp 2015-06-16 12:34:51.540000000 -0500 -@@ -3065,7 +3065,7 @@ +@@ -4242,7 +4242,7 @@ static int _wrap_Stream_read(lua_State* L) { } result = (char *)(arg1)->read(arg2); @@ -9,7 +9,7 @@ lua_pushnumber(L, (lua_Number) *arg2); SWIG_arg++; return SWIG_arg; -@@ -6855,7 +6855,7 @@ +@@ -8304,7 +8304,7 @@ static int _wrap_new_Session__SWIG_0(lua_State* L) { SWIG_check_num_args("LUA::Session::Session",0,0) result = (LUA::Session *)new LUA::Session(); @@ -18,7 +18,7 @@ return SWIG_arg; if(0) SWIG_fail; -@@ -6882,7 +6882,7 @@ +@@ -8331,7 +8331,7 @@ static int _wrap_new_Session__SWIG_1(lua_State* L) { } result = (LUA::Session *)new LUA::Session(arg1,arg2); @@ -27,7 +27,7 @@ return SWIG_arg; if(0) SWIG_fail; -@@ -6902,7 +6902,7 @@ +@@ -8351,7 +8351,7 @@ static int _wrap_new_Session__SWIG_2(lua_State* L) { if(!SWIG_lua_isnilstring(L,1)) SWIG_fail_arg("LUA::Session::Session",1,"char *"); arg1 = (char *)lua_tostring(L, 1); result = (LUA::Session *)new LUA::Session(arg1); @@ -36,7 +36,7 @@ return SWIG_arg; if(0) SWIG_fail; -@@ -6926,7 +6926,7 @@ +@@ -8375,7 +8375,7 @@ static int _wrap_new_Session__SWIG_3(lua_State* L) { } result = (LUA::Session *)new LUA::Session(arg1); @@ -45,7 +45,7 @@ return SWIG_arg; if(0) SWIG_fail; -@@ -9217,6 +9217,7 @@ static int _wrap_Dbh_test_reactive__SWIG_0(lua_State* L) { +@@ -9485,6 +9485,7 @@ static int _wrap_Dbh_test_reactive__SWIG_0(lua_State* L) { arg2 = (char *)lua_tostring(L, 2); arg3 = (char *)lua_tostring(L, 3); arg4 = (char *)lua_tostring(L, 4); @@ -53,7 +53,7 @@ result = (bool)(arg1)->test_reactive(arg2,arg3,arg4); lua_pushboolean(L,(int)(result!=0)); SWIG_arg++; return SWIG_arg; -@@ -9404,6 +9405,7 @@ static int _wrap_Dbh_query(lua_State* L) { +@@ -9672,6 +9673,7 @@ static int _wrap_Dbh_query(lua_State* L) { (&arg3)->idx = 3; } } @@ -61,7 +61,7 @@ result = (bool)(arg1)->query(arg2,arg3); lua_pushboolean(L,(int)(result!=0)); SWIG_arg++; return SWIG_arg; -@@ -9427,7 +9429,7 @@ static int _wrap_Dbh_affected_rows(lua_State* L) { +@@ -9695,7 +9697,7 @@ static int _wrap_Dbh_affected_rows(lua_State* L) { if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_LUA__Dbh,0))){ 
SWIG_fail_ptr("Dbh_affected_rows",1,SWIGTYPE_p_LUA__Dbh); } @@ -70,7 +70,7 @@ result = (int)(arg1)->affected_rows(); lua_pushnumber(L, (lua_Number) result); SWIG_arg++; return SWIG_arg; -@@ -9451,7 +9453,7 @@ static int _wrap_Dbh_last_error(lua_State* L) { +@@ -9719,7 +9721,7 @@ static int _wrap_Dbh_last_error(lua_State* L) { if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_LUA__Dbh,0))){ SWIG_fail_ptr("Dbh_last_error",1,SWIGTYPE_p_LUA__Dbh); } @@ -79,7 +79,7 @@ result = (char *)(arg1)->last_error(); lua_pushstring(L,(const char *)result); SWIG_arg++; return SWIG_arg; -@@ -9474,7 +9476,7 @@ static int _wrap_Dbh_clear_error(lua_State* L) { +@@ -9742,7 +9744,7 @@ static int _wrap_Dbh_clear_error(lua_State* L) { if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_LUA__Dbh,0))){ SWIG_fail_ptr("Dbh_clear_error",1,SWIGTYPE_p_LUA__Dbh); } @@ -88,7 +88,7 @@ (arg1)->clear_error(); return SWIG_arg; -@@ -9502,6 +9504,7 @@ static int _wrap_Dbh_load_extension(lua_State* L) { +@@ -9770,6 +9772,7 @@ static int _wrap_Dbh_load_extension(lua_State* L) { } arg2 = (char *)lua_tostring(L, 2); @@ -96,7 +96,7 @@ result = (int)(arg1)->load_extension((char const *)arg2); lua_pushnumber(L, (lua_Number) result); SWIG_arg++; return SWIG_arg; -@@ -9601,6 +9604,7 @@ static int _wrap_JSON_decode(lua_State* L) { +@@ -9869,6 +9872,7 @@ static int _wrap_JSON_decode(lua_State* L) { } arg2 = (char *)lua_tostring(L, 2); @@ -104,15 +104,15 @@ result = (cJSON *)(arg1)->decode((char const *)arg2); { SWIG_arg += LUA::JSON::cJSON2LuaTable(L, result); -@@ -9634,6 +9638,7 @@ static int _wrap_JSON_encode(lua_State* L) { +@@ -9902,6 +9906,7 @@ static int _wrap_JSON_encode(lua_State* L) { (&arg2)->L = L; (&arg2)->idx = 2; } + switch_assert(arg1); - result = (char *)(arg1)->encode(arg2); - lua_pushstring(L,(const char *)result); SWIG_arg++; + result = (arg1)->encode(arg2); + lua_pushlstring(L,(&result)->data(),(&result)->size()); SWIG_arg++; return SWIG_arg; -@@ -9661,6 +9666,7 @@ static int _wrap_JSON_execute__SWIG_0(lua_State* L) { +@@ -9929,6 +9934,7 @@ static int _wrap_JSON_execute__SWIG_0(lua_State* L) { } arg2 = (char *)lua_tostring(L, 2); @@ -120,7 +120,7 @@ result = (cJSON *)(arg1)->execute((char const *)arg2); { SWIG_arg += LUA::JSON::cJSON2LuaTable(L, result); -@@ -9694,6 +9700,7 @@ static int _wrap_JSON_execute__SWIG_1(lua_State* L) { +@@ -9962,6 +9968,7 @@ static int _wrap_JSON_execute__SWIG_1(lua_State* L) { (&arg2)->L = L; (&arg2)->idx = 2; } @@ -128,23 +128,23 @@ result = (cJSON *)(arg1)->execute(arg2); { SWIG_arg += LUA::JSON::cJSON2LuaTable(L, result); -@@ -9778,6 +9785,7 @@ static int _wrap_JSON_execute2__SWIG_0(lua_State* L) { +@@ -10046,6 +10053,7 @@ static int _wrap_JSON_execute2__SWIG_0(lua_State* L) { } arg2 = (char *)lua_tostring(L, 2); + switch_assert(arg1); - result = (char *)(arg1)->execute2((char const *)arg2); - lua_pushstring(L,(const char *)result); SWIG_arg++; + result = (arg1)->execute2((char const *)arg2); + lua_pushlstring(L,(&result)->data(),(&result)->size()); SWIG_arg++; return SWIG_arg; -@@ -9808,6 +9816,7 @@ static int _wrap_JSON_execute2__SWIG_1(lua_State* L) { +@@ -10076,6 +10084,7 @@ static int _wrap_JSON_execute2__SWIG_1(lua_State* L) { (&arg2)->L = L; (&arg2)->idx = 2; } + switch_assert(arg1); - result = (char *)(arg1)->execute2(arg2); - lua_pushstring(L,(const char *)result); SWIG_arg++; + result = (arg1)->execute2(arg2); + lua_pushlstring(L,(&result)->data(),(&result)->size()); SWIG_arg++; return SWIG_arg; -@@ -9888,6 +9897,7 @@ static int 
_wrap_JSON_encode_empty_table_as_object(lua_State* L) { +@@ -10156,6 +10165,7 @@ static int _wrap_JSON_encode_empty_table_as_object(lua_State* L) { } arg2 = (lua_toboolean(L, 2)!=0); @@ -152,7 +152,7 @@ (arg1)->encode_empty_table_as_object(arg2); return SWIG_arg; -@@ -9914,6 +9924,7 @@ static int _wrap_JSON_return_unformatted_json(lua_State* L) { +@@ -10182,6 +10192,7 @@ static int _wrap_JSON_return_unformatted_json(lua_State* L) { } arg2 = (lua_toboolean(L, 2)!=0); diff --git a/src/mod/languages/mod_lua/mod_lua_wrap.cpp b/src/mod/languages/mod_lua/mod_lua_wrap.cpp index ea42d2134f4..a54f1faf68e 100644 --- a/src/mod/languages/mod_lua/mod_lua_wrap.cpp +++ b/src/mod/languages/mod_lua/mod_lua_wrap.cpp @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * This file is not intended to be easily readable and contains a number of * coding conventions designed to improve portability and efficiency. Do not make @@ -2691,23 +2691,24 @@ SWIG_Lua_dostring(lua_State *L, const char *str) { #define SWIGTYPE_p_lua_State swig_types[13] #define SWIGTYPE_p_p_switch_event_node_t swig_types[14] #define SWIGTYPE_p_session_flag_t swig_types[15] -#define SWIGTYPE_p_switch_call_cause_t swig_types[16] -#define SWIGTYPE_p_switch_channel_state_t swig_types[17] -#define SWIGTYPE_p_switch_channel_t swig_types[18] -#define SWIGTYPE_p_switch_core_session_t swig_types[19] -#define SWIGTYPE_p_switch_event_t swig_types[20] -#define SWIGTYPE_p_switch_event_types_t swig_types[21] -#define SWIGTYPE_p_switch_input_args_t swig_types[22] -#define SWIGTYPE_p_switch_input_type_t swig_types[23] -#define SWIGTYPE_p_switch_priority_t swig_types[24] -#define SWIGTYPE_p_switch_queue_t swig_types[25] -#define SWIGTYPE_p_switch_state_handler_table_t swig_types[26] -#define SWIGTYPE_p_switch_status_t swig_types[27] -#define SWIGTYPE_p_switch_stream_handle_t swig_types[28] -#define SWIGTYPE_p_uint32_t swig_types[29] -#define SWIGTYPE_p_void swig_types[30] -static swig_type_info *swig_types[32]; -static swig_module_info swig_module = {swig_types, 31, 0, 0, 0, 0}; +#define SWIGTYPE_p_std__string swig_types[16] +#define SWIGTYPE_p_switch_call_cause_t swig_types[17] +#define SWIGTYPE_p_switch_channel_state_t swig_types[18] +#define SWIGTYPE_p_switch_channel_t swig_types[19] +#define SWIGTYPE_p_switch_core_session_t swig_types[20] +#define SWIGTYPE_p_switch_event_t swig_types[21] +#define SWIGTYPE_p_switch_event_types_t swig_types[22] +#define SWIGTYPE_p_switch_input_args_t swig_types[23] +#define SWIGTYPE_p_switch_input_type_t swig_types[24] +#define SWIGTYPE_p_switch_priority_t swig_types[25] +#define SWIGTYPE_p_switch_queue_t swig_types[26] +#define SWIGTYPE_p_switch_state_handler_table_t swig_types[27] +#define SWIGTYPE_p_switch_status_t swig_types[28] +#define SWIGTYPE_p_switch_stream_handle_t swig_types[29] +#define SWIGTYPE_p_uint32_t swig_types[30] +#define SWIGTYPE_p_void swig_types[31] +static swig_type_info *swig_types[33]; +static swig_module_info swig_module = {swig_types, 32, 0, 0, 0, 0}; #define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name) #define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name) @@ -2724,6 +2725,17 @@ typedef struct{} LANGUAGE_OBJ; } +#include + + +SWIGINTERN int SWIG_lua_isnilstring(lua_State *L, int idx) { + int ret = lua_isstring(L, idx); + if (!ret) + ret = lua_isnil(L, idx); + return 
ret; +} + + #include "switch.h" #include "switch_cpp.h" #include "freeswitch_lua.h" @@ -2885,17 +2897,273 @@ SWIGINTERN void SWIG_write_ptr_array(lua_State* L,void **array,int size,swig_typ } } - -SWIGINTERN int SWIG_lua_isnilstring(lua_State *L, int idx) { - int ret = lua_isstring(L, idx); - if (!ret) - ret = lua_isnil(L, idx); - return ret; -} - #ifdef __cplusplus extern "C" { #endif +static int _wrap_new_string__SWIG_0(lua_State* L) { + int SWIG_arg = 0; + std::string *result = 0 ; + + SWIG_check_num_args("std::string::string",0,0) + result = (std::string *)new std::string(); + SWIG_NewPointerObj(L,result,SWIGTYPE_p_std__string,1); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_new_string__SWIG_1(lua_State* L) { + int SWIG_arg = 0; + char *arg1 = (char *) 0 ; + std::string *result = 0 ; + + SWIG_check_num_args("std::string::string",1,1) + if(!SWIG_lua_isnilstring(L,1)) SWIG_fail_arg("std::string::string",1,"char const *"); + arg1 = (char *)lua_tostring(L, 1); + result = (std::string *)new std::string((char const *)arg1); + SWIG_NewPointerObj(L,result,SWIGTYPE_p_std__string,1); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_new_string(lua_State* L) { + int argc; + int argv[2]={ + 1,2 + }; + + argc = lua_gettop(L); + if (argc == 0) { + return _wrap_new_string__SWIG_0(L); + } + if (argc == 1) { + int _v; + { + _v = SWIG_lua_isnilstring(L,argv[0]); + } + if (_v) { + return _wrap_new_string__SWIG_1(L); + } + } + + SWIG_Lua_pusherrstring(L,"Wrong arguments for overloaded function 'new_string'\n" + " Possible C/C++ prototypes are:\n" + " std::string::string()\n" + " std::string::string(char const *)\n"); + lua_error(L);return 0; +} + + +static int _wrap_string_size(lua_State* L) { + int SWIG_arg = 0; + std::string *arg1 = (std::string *) 0 ; + unsigned int result; + + SWIG_check_num_args("std::string::size",1,1) + if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("std::string::size",1,"std::string const *"); + + if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_std__string,0))){ + SWIG_fail_ptr("string_size",1,SWIGTYPE_p_std__string); + } + + result = (unsigned int)((std::string const *)arg1)->size(); + lua_pushnumber(L, (lua_Number) result); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_string_length(lua_State* L) { + int SWIG_arg = 0; + std::string *arg1 = (std::string *) 0 ; + unsigned int result; + + SWIG_check_num_args("std::string::length",1,1) + if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("std::string::length",1,"std::string const *"); + + if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_std__string,0))){ + SWIG_fail_ptr("string_length",1,SWIGTYPE_p_std__string); + } + + result = (unsigned int)((std::string const *)arg1)->length(); + lua_pushnumber(L, (lua_Number) result); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_string_empty(lua_State* L) { + int SWIG_arg = 0; + std::string *arg1 = (std::string *) 0 ; + bool result; + + SWIG_check_num_args("std::string::empty",1,1) + if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("std::string::empty",1,"std::string const *"); + + if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_std__string,0))){ + SWIG_fail_ptr("string_empty",1,SWIGTYPE_p_std__string); + } + + result = (bool)((std::string const *)arg1)->empty(); + 
lua_pushboolean(L,(int)(result!=0)); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_string_c_str(lua_State* L) { + int SWIG_arg = 0; + std::string *arg1 = (std::string *) 0 ; + char *result = 0 ; + + SWIG_check_num_args("std::string::c_str",1,1) + if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("std::string::c_str",1,"std::string const *"); + + if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_std__string,0))){ + SWIG_fail_ptr("string_c_str",1,SWIGTYPE_p_std__string); + } + + result = (char *)((std::string const *)arg1)->c_str(); + lua_pushstring(L,(const char *)result); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_string_data(lua_State* L) { + int SWIG_arg = 0; + std::string *arg1 = (std::string *) 0 ; + char *result = 0 ; + + SWIG_check_num_args("std::string::data",1,1) + if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("std::string::data",1,"std::string const *"); + + if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_std__string,0))){ + SWIG_fail_ptr("string_data",1,SWIGTYPE_p_std__string); + } + + result = (char *)((std::string const *)arg1)->data(); + lua_pushstring(L,(const char *)result); SWIG_arg++; + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static int _wrap_string_assign(lua_State* L) { + int SWIG_arg = 0; + std::string *arg1 = (std::string *) 0 ; + char *arg2 = (char *) 0 ; + + SWIG_check_num_args("std::string::assign",2,2) + if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("std::string::assign",1,"std::string *"); + if(!SWIG_lua_isnilstring(L,2)) SWIG_fail_arg("std::string::assign",2,"char const *"); + + if (!SWIG_IsOK(SWIG_ConvertPtr(L,1,(void**)&arg1,SWIGTYPE_p_std__string,0))){ + SWIG_fail_ptr("string_assign",1,SWIGTYPE_p_std__string); + } + + arg2 = (char *)lua_tostring(L, 2); + (arg1)->assign((char const *)arg2); + + return SWIG_arg; + + if(0) SWIG_fail; + +fail: + lua_error(L); + return SWIG_arg; +} + + +static void swig_delete_string(void *obj) { +std::string *arg1 = (std::string *) obj; +delete arg1; +} +static int _proxy__wrap_new_string(lua_State *L) { + assert(lua_istable(L,1)); + lua_pushcfunction(L,_wrap_new_string); + assert(!lua_isnil(L,-1)); + lua_replace(L,1); /* replace our table with real constructor */ + lua_call(L,lua_gettop(L)-1,1); + return 1; +} +static swig_lua_attribute swig_string_attributes[] = { + {0,0,0} +}; +static swig_lua_method swig_string_methods[]= { + { "size", _wrap_string_size}, + { "length", _wrap_string_length}, + { "empty", _wrap_string_empty}, + { "c_str", _wrap_string_c_str}, + { "data", _wrap_string_data}, + { "assign", _wrap_string_assign}, + {0,0} +}; +static swig_lua_method swig_string_meta[] = { + {0,0} +}; + +static swig_lua_attribute swig_string_Sf_SwigStatic_attributes[] = { + {0,0,0} +}; +static swig_lua_const_info swig_string_Sf_SwigStatic_constants[]= { + {0,0,0,0,0,0} +}; +static swig_lua_method swig_string_Sf_SwigStatic_methods[]= { + {0,0} +}; +static swig_lua_class* swig_string_Sf_SwigStatic_classes[]= { + 0 +}; + +static swig_lua_namespace swig_string_Sf_SwigStatic = { + "string", + swig_string_Sf_SwigStatic_methods, + swig_string_Sf_SwigStatic_attributes, + swig_string_Sf_SwigStatic_constants, + swig_string_Sf_SwigStatic_classes, + 0 +}; +static swig_lua_class *swig_string_bases[] = {0}; +static const char *swig_string_base_names[] = {0}; +static swig_lua_class _wrap_class_string = { "string", "string", 
&SWIGTYPE_p_std__string,_proxy__wrap_new_string, swig_delete_string, swig_string_methods, swig_string_attributes, &swig_string_Sf_SwigStatic, swig_string_meta, swig_string_bases, swig_string_base_names }; + static int _wrap_setGlobalVariable(lua_State* L) { int SWIG_arg = 0; char *arg1 = (char *) 0 ; @@ -9624,7 +9892,7 @@ static int _wrap_JSON_encode(lua_State* L) { int SWIG_arg = 0; LUA::JSON *arg1 = (LUA::JSON *) 0 ; SWIGLUA_TABLE arg2 ; - char *result = 0 ; + std::string result; SWIG_check_num_args("LUA::JSON::encode",2,2) if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("LUA::JSON::encode",1,"LUA::JSON *"); @@ -9639,8 +9907,8 @@ static int _wrap_JSON_encode(lua_State* L) { (&arg2)->idx = 2; } switch_assert(arg1); - result = (char *)(arg1)->encode(arg2); - lua_pushstring(L,(const char *)result); SWIG_arg++; + result = (arg1)->encode(arg2); + lua_pushlstring(L,(&result)->data(),(&result)->size()); SWIG_arg++; return SWIG_arg; if(0) SWIG_fail; @@ -9774,7 +10042,7 @@ static int _wrap_JSON_execute2__SWIG_0(lua_State* L) { int SWIG_arg = 0; LUA::JSON *arg1 = (LUA::JSON *) 0 ; char *arg2 = (char *) 0 ; - char *result = 0 ; + std::string result; SWIG_check_num_args("LUA::JSON::execute2",2,2) if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("LUA::JSON::execute2",1,"LUA::JSON *"); @@ -9786,8 +10054,8 @@ static int _wrap_JSON_execute2__SWIG_0(lua_State* L) { arg2 = (char *)lua_tostring(L, 2); switch_assert(arg1); - result = (char *)(arg1)->execute2((char const *)arg2); - lua_pushstring(L,(const char *)result); SWIG_arg++; + result = (arg1)->execute2((char const *)arg2); + lua_pushlstring(L,(&result)->data(),(&result)->size()); SWIG_arg++; return SWIG_arg; if(0) SWIG_fail; @@ -9802,7 +10070,7 @@ static int _wrap_JSON_execute2__SWIG_1(lua_State* L) { int SWIG_arg = 0; LUA::JSON *arg1 = (LUA::JSON *) 0 ; SWIGLUA_TABLE arg2 ; - char *result = 0 ; + std::string result; SWIG_check_num_args("LUA::JSON::execute2",2,2) if(!SWIG_isptrtype(L,1)) SWIG_fail_arg("LUA::JSON::execute2",1,"LUA::JSON *"); @@ -9817,8 +10085,8 @@ static int _wrap_JSON_execute2__SWIG_1(lua_State* L) { (&arg2)->idx = 2; } switch_assert(arg1); - result = (char *)(arg1)->execute2(arg2); - lua_pushstring(L,(const char *)result); SWIG_arg++; + result = (arg1)->execute2(arg2); + lua_pushlstring(L,(&result)->data(),(&result)->size()); SWIG_arg++; return SWIG_arg; if(0) SWIG_fail; @@ -10017,6 +10285,7 @@ static swig_lua_method swig_SwigModule_methods[]= { {0,0} }; static swig_lua_class* swig_SwigModule_classes[]= { +&_wrap_class_string, &_wrap_class_IVRMenu, &_wrap_class_API, &_wrap_class_input_callback_state_t, @@ -10067,6 +10336,7 @@ static swig_type_info _swigt__p_int = {"_p_int", "int *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_lua_State = {"_p_lua_State", "lua_State *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_p_switch_event_node_t = {"_p_p_switch_event_node_t", "switch_event_node_t **", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_session_flag_t = {"_p_session_flag_t", "enum session_flag_t *|session_flag_t *", 0, 0, (void*)0, 0}; +static swig_type_info _swigt__p_std__string = {"_p_std__string", "std::string *", 0, 0, (void*)&_wrap_class_string, 0}; static swig_type_info _swigt__p_switch_call_cause_t = {"_p_switch_call_cause_t", "switch_call_cause_t *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_switch_channel_state_t = {"_p_switch_channel_state_t", "switch_channel_state_t *", 0, 0, (void*)0, 0}; static swig_type_info _swigt__p_switch_channel_t = {"_p_switch_channel_t", "switch_channel_t *", 0, 0, (void*)0, 0}; @@ 
-10100,6 +10370,7 @@ static swig_type_info *swig_type_initial[] = { &_swigt__p_lua_State, &_swigt__p_p_switch_event_node_t, &_swigt__p_session_flag_t, + &_swigt__p_std__string, &_swigt__p_switch_call_cause_t, &_swigt__p_switch_channel_state_t, &_swigt__p_switch_channel_t, @@ -10133,6 +10404,7 @@ static swig_cast_info _swigc__p_int[] = { {&_swigt__p_int, 0, 0, 0},{0, 0, 0, 0 static swig_cast_info _swigc__p_lua_State[] = { {&_swigt__p_lua_State, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_p_switch_event_node_t[] = { {&_swigt__p_p_switch_event_node_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_session_flag_t[] = { {&_swigt__p_session_flag_t, 0, 0, 0},{0, 0, 0, 0}}; +static swig_cast_info _swigc__p_std__string[] = { {&_swigt__p_std__string, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_switch_call_cause_t[] = { {&_swigt__p_switch_call_cause_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_switch_channel_state_t[] = { {&_swigt__p_switch_channel_state_t, 0, 0, 0},{0, 0, 0, 0}}; static swig_cast_info _swigc__p_switch_channel_t[] = { {&_swigt__p_switch_channel_t, 0, 0, 0},{0, 0, 0, 0}}; @@ -10166,6 +10438,7 @@ static swig_cast_info *swig_cast_initial[] = { _swigc__p_lua_State, _swigc__p_p_switch_event_node_t, _swigc__p_session_flag_t, + _swigc__p_std__string, _swigc__p_switch_call_cause_t, _swigc__p_switch_channel_state_t, _swigc__p_switch_channel_t, diff --git a/src/mod/languages/mod_lua/test/Makefile.am b/src/mod/languages/mod_lua/test/Makefile.am deleted file mode 100644 index 8db9324b98d..00000000000 --- a/src/mod/languages/mod_lua/test/Makefile.am +++ /dev/null @@ -1,4 +0,0 @@ -noinst_PROGRAMS = test_mod_lua -AM_CFLAGS = $(SWITCH_AM_CFLAGS) -AM_LDFLAGS = $(switch_builddir)/libfreeswitch.la -avoid-version -no-undefined $(SWITCH_AM_LDFLAGS) -TESTS = $(noinst_PROGRAMS) diff --git a/src/mod/languages/mod_managed/freeswitch_wrap.cxx b/src/mod/languages/mod_managed/freeswitch_wrap.cxx index 07efc7d898d..7ecadca8264 100644 --- a/src/mod/languages/mod_managed/freeswitch_wrap.cxx +++ b/src/mod/languages/mod_managed/freeswitch_wrap.cxx @@ -18030,6 +18030,18 @@ SWIGEXPORT unsigned long SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_default_rat } +SWIGEXPORT unsigned long SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_core_max_audio_channels___(unsigned long jarg1) { + unsigned long jresult ; + uint32_t arg1 ; + uint32_t result; + + arg1 = (uint32_t)jarg1; + result = (uint32_t)switch_core_max_audio_channels(arg1); + jresult = (unsigned long)result; + return jresult; +} + + SWIGEXPORT int SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_core_add_registration___(char * jarg1, char * jarg2, char * jarg3, char * jarg4, unsigned long jarg5, char * jarg6, char * jarg7, char * jarg8, char * jarg9) { int jresult ; char *arg1 = (char *) 0 ; @@ -22226,6 +22238,20 @@ SWIGEXPORT int SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_split_user_domain___( } +SWIGEXPORT void * SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_calloc___(unsigned long jarg1, unsigned long jarg2) { + void * jresult ; + size_t arg1 ; + size_t arg2 ; + void *result = 0 ; + + arg1 = (size_t)jarg1; + arg2 = (size_t)jarg2; + result = (void *)switch_calloc(arg1,arg2); + jresult = (void *)result; + return jresult; +} + + SWIGEXPORT char * SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_uuid_str___(char * jarg1, void * jarg2) { char * jresult ; char *arg1 = (char *) 0 ; @@ -23121,6 +23147,16 @@ SWIGEXPORT char * SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_html_strip___(char } +SWIGEXPORT unsigned long SWIGSTDCALL 
CSharp_FreeSWITCHfNative_switch_getpid___() { + unsigned long jresult ; + unsigned long result; + + result = (unsigned long)switch_getpid(); + jresult = (unsigned long)result; + return jresult; +} + + SWIGEXPORT void SWIGSTDCALL CSharp_FreeSWITCHfNative_profile_node_t_var_set___(void * jarg1, char * jarg2) { profile_node_s *arg1 = (profile_node_s *) 0 ; char *arg2 = (char *) 0 ; @@ -41953,6 +41989,14 @@ SWIGEXPORT int SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_ivr_signal_bridge___( } +SWIGEXPORT void SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_ivr_check_hold___(void * jarg1) { + switch_core_session_t *arg1 = (switch_core_session_t *) 0 ; + + arg1 = (switch_core_session_t *)jarg1; + switch_ivr_check_hold(arg1); +} + + SWIGEXPORT int SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_ivr_session_transfer___(void * jarg1, char * jarg2, char * jarg3, char * jarg4) { int jresult ; switch_core_session_t *arg1 = (switch_core_session_t *) 0 ; @@ -45807,6 +45851,18 @@ SWIGEXPORT void SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_rtp_video_loss___(vo } +SWIGEXPORT void * SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_rtp_get_core_session___(void * jarg1) { + void * jresult ; + switch_rtp_t *arg1 = (switch_rtp_t *) 0 ; + switch_core_session_t *result = 0 ; + + arg1 = (switch_rtp_t *)jarg1; + result = (switch_core_session_t *)switch_rtp_get_core_session(arg1); + jresult = (void *)result; + return jresult; +} + + SWIGEXPORT void SWIGSTDCALL CSharp_FreeSWITCHfNative_switch_log_node_t_data_set___(void * jarg1, char * jarg2) { switch_log_node_t *arg1 = (switch_log_node_t *) 0 ; char *arg2 = (char *) 0 ; diff --git a/src/mod/languages/mod_managed/managed/swig.cs b/src/mod/languages/mod_managed/managed/swig.cs index 884806bcab4..98e9ce5b5d9 100644 --- a/src/mod/languages/mod_managed/managed/swig.cs +++ b/src/mod/languages/mod_managed/managed/swig.cs @@ -10021,6 +10021,11 @@ public static uint switch_default_rate(string name, uint number) { return ret; } + public static uint switch_core_max_audio_channels(uint limit) { + uint ret = freeswitchPINVOKE.switch_core_max_audio_channels(limit); + return ret; + } + public static switch_status_t switch_core_add_registration(string user, string realm, string token, string url, uint expires, string network_ip, string network_port, string network_proto, string metadata) { switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_core_add_registration(user, realm, token, url, expires, network_ip, network_port, network_proto, metadata); return ret; @@ -11288,6 +11293,12 @@ public static int switch_split_user_domain(string arg0, ref string user, ref str return ret; } + public static SWIGTYPE_p_void switch_calloc(uint nmemb, uint size) { + global::System.IntPtr cPtr = freeswitchPINVOKE.switch_calloc(nmemb, size); + SWIGTYPE_p_void ret = (cPtr == global::System.IntPtr.Zero) ? 
null : new SWIGTYPE_p_void(cPtr, false); + return ret; + } + public static string switch_uuid_str(string buf, SWIGTYPE_p_switch_size_t len) { string ret = freeswitchPINVOKE.switch_uuid_str(buf, SWIGTYPE_p_switch_size_t.getCPtr(len)); if (freeswitchPINVOKE.SWIGPendingException.Pending) throw freeswitchPINVOKE.SWIGPendingException.Retrieve(); @@ -11397,6 +11408,11 @@ public static string switch_html_strip(string str) { return ret; } + public static uint switch_getpid() { + uint ret = freeswitchPINVOKE.switch_getpid(); + return ret; + } + public static switch_caller_extension switch_caller_extension_new(SWIGTYPE_p_switch_core_session session, string extension_name, string extension_number) { global::System.IntPtr cPtr = freeswitchPINVOKE.switch_caller_extension_new(SWIGTYPE_p_switch_core_session.getCPtr(session), extension_name, extension_number); switch_caller_extension ret = (cPtr == global::System.IntPtr.Zero) ? null : new switch_caller_extension(cPtr, false); @@ -13084,6 +13100,10 @@ public static switch_status_t switch_ivr_signal_bridge(SWIGTYPE_p_switch_core_se return ret; } + public static void switch_ivr_check_hold(SWIGTYPE_p_switch_core_session session) { + freeswitchPINVOKE.switch_ivr_check_hold(SWIGTYPE_p_switch_core_session.getCPtr(session)); + } + public static switch_status_t switch_ivr_session_transfer(SWIGTYPE_p_switch_core_session session, string extension, string dialplan, string context) { switch_status_t ret = (switch_status_t)freeswitchPINVOKE.switch_ivr_session_transfer(SWIGTYPE_p_switch_core_session.getCPtr(session), extension, dialplan, context); return ret; @@ -14068,6 +14088,12 @@ public static void switch_rtp_video_loss(SWIGTYPE_p_switch_rtp rtp_session) { freeswitchPINVOKE.switch_rtp_video_loss(SWIGTYPE_p_switch_rtp.getCPtr(rtp_session)); } + public static SWIGTYPE_p_switch_core_session switch_rtp_get_core_session(SWIGTYPE_p_switch_rtp rtp_session) { + global::System.IntPtr cPtr = freeswitchPINVOKE.switch_rtp_get_core_session(SWIGTYPE_p_switch_rtp.getCPtr(rtp_session)); + SWIGTYPE_p_switch_core_session ret = (cPtr == global::System.IntPtr.Zero) ? null : new SWIGTYPE_p_switch_core_session(cPtr, false); + return ret; + } + public static SWIGTYPE_p_cJSON switch_log_node_to_json(switch_log_node_t node, int log_level, switch_log_json_format_t json_format, switch_event chan_vars) { global::System.IntPtr cPtr = freeswitchPINVOKE.switch_log_node_to_json(switch_log_node_t.getCPtr(node), log_level, switch_log_json_format_t.getCPtr(json_format), switch_event.getCPtr(chan_vars)); SWIGTYPE_p_cJSON ret = (cPtr == global::System.IntPtr.Zero) ? 
null : new SWIGTYPE_p_cJSON(cPtr, false); @@ -19596,6 +19622,9 @@ static SWIGStringHelper() { [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_default_rate___")] public static extern uint switch_default_rate(string jarg1, uint jarg2); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_core_max_audio_channels___")] + public static extern uint switch_core_max_audio_channels(uint jarg1); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_core_add_registration___")] public static extern int switch_core_add_registration(string jarg1, string jarg2, string jarg3, string jarg4, uint jarg5, string jarg6, string jarg7, string jarg8, string jarg9); @@ -20520,6 +20549,9 @@ static SWIGStringHelper() { [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_split_user_domain___")] public static extern int switch_split_user_domain(string jarg1, ref string jarg2, ref string jarg3); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_calloc___")] + public static extern global::System.IntPtr switch_calloc(uint jarg1, uint jarg2); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_uuid_str___")] public static extern string switch_uuid_str(string jarg1, global::System.Runtime.InteropServices.HandleRef jarg2); @@ -20724,6 +20756,9 @@ static SWIGStringHelper() { [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_html_strip___")] public static extern string switch_html_strip(string jarg1); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_getpid___")] + public static extern uint switch_getpid(); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_profile_node_t_var_set___")] public static extern void profile_node_t_var_set(global::System.Runtime.InteropServices.HandleRef jarg1, string jarg2); @@ -25338,6 +25373,9 @@ static SWIGStringHelper() { [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_ivr_signal_bridge___")] public static extern int switch_ivr_signal_bridge(global::System.Runtime.InteropServices.HandleRef jarg1, global::System.Runtime.InteropServices.HandleRef jarg2); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_ivr_check_hold___")] + public static extern void switch_ivr_check_hold(global::System.Runtime.InteropServices.HandleRef jarg1); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_ivr_session_transfer___")] public static extern int switch_ivr_session_transfer(global::System.Runtime.InteropServices.HandleRef jarg1, string jarg2, string jarg3, string jarg4); @@ -26166,6 +26204,9 @@ static SWIGStringHelper() { [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_rtp_video_loss___")] public static extern void switch_rtp_video_loss(global::System.Runtime.InteropServices.HandleRef jarg1); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_rtp_get_core_session___")] + public static extern 
global::System.IntPtr switch_rtp_get_core_session(global::System.Runtime.InteropServices.HandleRef jarg1); + [global::System.Runtime.InteropServices.DllImport("mod_managed", EntryPoint="CSharp_FreeSWITCHfNative_switch_log_node_t_data_set___")] public static extern void switch_log_node_t_data_set(global::System.Runtime.InteropServices.HandleRef jarg1, string jarg2); diff --git a/src/mod/languages/mod_perl/freeswitch.pm b/src/mod/languages/mod_perl/freeswitch.pm index 2e8ca46ef46..3d511c9225f 100644 --- a/src/mod/languages/mod_perl/freeswitch.pm +++ b/src/mod/languages/mod_perl/freeswitch.pm @@ -1,5 +1,5 @@ # This file was automatically generated by SWIG (http://www.swig.org). -# Version 3.0.10 +# Version 3.0.12 # # Do not make changes to this file unless you know what you are doing--modify # the SWIG interface file instead. diff --git a/src/mod/languages/mod_perl/mod_perl_wrap.cpp b/src/mod/languages/mod_perl/mod_perl_wrap.cpp index be5d700ea89..a6b48e7670a 100644 --- a/src/mod/languages/mod_perl/mod_perl_wrap.cpp +++ b/src/mod/languages/mod_perl/mod_perl_wrap.cpp @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------------- * This file was automatically generated by SWIG (http://www.swig.org). - * Version 3.0.10 + * Version 3.0.12 * * This file is not intended to be easily readable and contains a number of * coding conventions designed to improve portability and efficiency. Do not make @@ -1572,7 +1572,7 @@ static swig_module_info swig_module = {swig_types, 30, 0, 0, 0, 0}; #define SWIG_name "freeswitchc::boot_freeswitch" #define SWIG_prefix "freeswitchc::" -#define SWIGVERSION 0x030010 +#define SWIGVERSION 0x030012 #define SWIG_VERSION SWIGVERSION @@ -1629,7 +1629,7 @@ SWIG_AsCharPtrAndSize(SV *obj, char** cptr, size_t* psize, int *alloc) if (cptr) { if (alloc) { if (*alloc == SWIG_NEWOBJ) { - *cptr = reinterpret_cast< char* >(memcpy((new char[size]), cstr, sizeof(char)*(size))); + *cptr = reinterpret_cast< char* >(memcpy(new char[size], cstr, sizeof(char)*(size))); } else { *cptr = cstr; *alloc = SWIG_OLDOBJ; @@ -3091,7 +3091,7 @@ XS(_wrap_input_callback_state_t_funcargs_set) { if (arg1->funcargs) delete[] arg1->funcargs; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->funcargs = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->funcargs = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->funcargs = 0; } @@ -3752,7 +3752,7 @@ XS(_wrap_Event_serialized_string_set) { if (arg1->serialized_string) delete[] arg1->serialized_string; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->serialized_string = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->serialized_string = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->serialized_string = 0; } @@ -4644,7 +4644,7 @@ XS(_wrap_EventConsumer_e_callback_set) { if (arg1->e_callback) delete[] arg1->e_callback; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->e_callback = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->e_callback = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< 
const char * >(arg2), sizeof(char)*(size))); } else { arg1->e_callback = 0; } @@ -4716,7 +4716,7 @@ XS(_wrap_EventConsumer_e_subclass_name_set) { if (arg1->e_subclass_name) delete[] arg1->e_subclass_name; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->e_subclass_name = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->e_subclass_name = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->e_subclass_name = 0; } @@ -4788,7 +4788,7 @@ XS(_wrap_EventConsumer_e_cb_arg_set) { if (arg1->e_cb_arg) delete[] arg1->e_cb_arg; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->e_cb_arg = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->e_cb_arg = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->e_cb_arg = 0; } @@ -5705,7 +5705,7 @@ XS(_wrap_CoreSession_uuid_set) { if (arg1->uuid) delete[] arg1->uuid; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->uuid = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->uuid = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->uuid = 0; } @@ -5777,7 +5777,7 @@ XS(_wrap_CoreSession_tts_name_set) { if (arg1->tts_name) delete[] arg1->tts_name; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->tts_name = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->tts_name = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->tts_name = 0; } @@ -5849,7 +5849,7 @@ XS(_wrap_CoreSession_voice_name_set) { if (arg1->voice_name) delete[] arg1->voice_name; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->voice_name = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->voice_name = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->voice_name = 0; } @@ -10219,7 +10219,7 @@ XS(_wrap_Session_suuid_set) { if (arg1->suuid) delete[] arg1->suuid; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->suuid = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->suuid = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->suuid = 0; } @@ -10291,7 +10291,7 @@ XS(_wrap_Session_cb_function_set) { if (arg1->cb_function) delete[] arg1->cb_function; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->cb_function = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->cb_function = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->cb_function = 
0; } @@ -10363,7 +10363,7 @@ XS(_wrap_Session_cb_arg_set) { if (arg1->cb_arg) delete[] arg1->cb_arg; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->cb_arg = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->cb_arg = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->cb_arg = 0; } @@ -10435,7 +10435,7 @@ XS(_wrap_Session_hangup_func_str_set) { if (arg1->hangup_func_str) delete[] arg1->hangup_func_str; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->hangup_func_str = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->hangup_func_str = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->hangup_func_str = 0; } @@ -10507,7 +10507,7 @@ XS(_wrap_Session_hangup_func_arg_set) { if (arg1->hangup_func_arg) delete[] arg1->hangup_func_arg; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->hangup_func_arg = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->hangup_func_arg = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->hangup_func_arg = 0; } diff --git a/src/mod/languages/mod_python/freeswitch.py b/src/mod/languages/mod_python/freeswitch.py index e6c43d3a458..ac9f655ad9b 100644 --- a/src/mod/languages/mod_python/freeswitch.py +++ b/src/mod/languages/mod_python/freeswitch.py @@ -1,13 +1,9 @@ # This file was automatically generated by SWIG (http://www.swig.org). -# Version 3.0.10 +# Version 3.0.12 # # Do not make changes to this file unless you know what you are doing--modify # the SWIG interface file instead. 
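The wrapper regeneration in this patch produces dozens of *_set functions that differ only in which struct member they assign; the only textual change from SWIG 3.0.12 is a dropped pair of redundant parentheses around new char[size]. For readers skimming the generated code, here is a minimal sketch of the idiom every one of these setters expands to (set_string_member is a hypothetical name, not part of the generated sources):

#include <cstring>

/* Deep-copy assignment of a C-string struct member, as in the SWIG setters:
 * release the old buffer, then copy the incoming string, terminating NUL
 * included, into a fresh new[] allocation that the struct now owns. */
static void set_string_member(char **member, const char *value)
{
    delete[] *member;                    /* delete[] on a null pointer is a no-op */
    if (value) {
        size_t size = strlen(value) + 1; /* count the trailing NUL */
        *member = static_cast<char *>(memcpy(new char[size], value, size));
    } else {
        *member = 0;
    }
}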
-
-
-
-
 from sys import version_info as _swig_python_version_info
 if _swig_python_version_info >= (2, 7, 0):
     def swig_import_helper():
@@ -30,17 +26,18 @@ def swig_import_helper():
         except ImportError:
             import _freeswitch
             return _freeswitch
-        if fp is not None:
-            try:
-                _mod = imp.load_module('_freeswitch', fp, pathname, description)
-            finally:
+        try:
+            _mod = imp.load_module('_freeswitch', fp, pathname, description)
+        finally:
+            if fp is not None:
                 fp.close()
-            return _mod
+        return _mod
     _freeswitch = swig_import_helper()
     del swig_import_helper
 else:
     import _freeswitch
 del _swig_python_version_info
+
 try:
     _swig_property = property
 except NameError:
diff --git a/src/mod/languages/mod_python/mod_python.c b/src/mod/languages/mod_python/mod_python.c
index 17da5f263ff..c9ae2b895d4 100644
--- a/src/mod/languages/mod_python/mod_python.c
+++ b/src/mod/languages/mod_python/mod_python.c
@@ -90,6 +90,7 @@ static void print_python_error(const char * script)
 	if (buffer == NULL ) {
 		switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Not enough Memory to create the error buffer");
+		return;
 	}
 
 	/* just for security that we will always have a string terminater */
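The one-line mod_python.c change above is easy to miss but is a real fix: without the added return, a failed allocation is logged and the NULL buffer is still written to a few lines later. A reduced sketch of the shape of the bug (report is a hypothetical stand-in, not the module's actual function):

#include <cstdio>
#include <cstdlib>
#include <cstring>

static void report(const char *msg)
{
    char *buffer = static_cast<char *>(malloc(strlen(msg) + 1));
    if (buffer == NULL) {
        fprintf(stderr, "not enough memory for the error buffer\n");
        return;                           /* the early return the patch adds */
    }
    memcpy(buffer, msg, strlen(msg) + 1); /* only safe past the guard */
    fprintf(stderr, "%s\n", buffer);
    free(buffer);
}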
diff --git a/src/mod/languages/mod_python/mod_python_wrap.cpp b/src/mod/languages/mod_python/mod_python_wrap.cpp
index bd29b959546..31ec090c6a2 100644
--- a/src/mod/languages/mod_python/mod_python_wrap.cpp
+++ b/src/mod/languages/mod_python/mod_python_wrap.cpp
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------------
  * This file was automatically generated by SWIG (http://www.swig.org).
- * Version 3.0.10
+ * Version 3.0.12
  *
  * This file is not intended to be easily readable and contains a number of
  * coding conventions designed to improve portability and efficiency. Do not make
@@ -976,6 +976,7 @@ typedef destructor freefunc;
 #if PY_VERSION_HEX < 0x03020000
 #define PyDescr_TYPE(x) (((PyDescrObject *)(x))->d_type)
 #define PyDescr_NAME(x) (((PyDescrObject *)(x))->d_name)
+#define Py_hash_t long
 #endif
 
 /* -----------------------------------------------------------------------------
@@ -1631,6 +1632,14 @@ SwigPyObject_repr(SwigPyObject *v, PyObject *args)
   return repr;
 }
 
+/* We need a version taking two PyObject* parameters so it's a valid
+ * PyCFunction to use in swigobject_methods[].
+ */
+SWIGRUNTIME PyObject *
+SwigPyObject_repr2(PyObject *v, PyObject *SWIGUNUSEDPARM(args))
+{
+  return SwigPyObject_repr((SwigPyObject*)v);
+}
+
 SWIGRUNTIME int
 SwigPyObject_compare(SwigPyObject *v, SwigPyObject *w)
 {
@@ -1760,11 +1769,7 @@ SwigPyObject_append(PyObject* v, PyObject* next)
 }
 
 SWIGRUNTIME PyObject*
-#ifdef METH_NOARGS
-SwigPyObject_next(PyObject* v)
-#else
 SwigPyObject_next(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
-#endif
 {
   SwigPyObject *sobj = (SwigPyObject *) v;
   if (sobj->next) {
@@ -1799,6 +1804,20 @@ SwigPyObject_acquire(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
   return SWIG_Py_Void();
 }
 
+#ifdef METH_NOARGS
+static PyObject*
+SwigPyObject_disown2(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
+{
+  return SwigPyObject_disown(v);
+}
+
+static PyObject*
+SwigPyObject_acquire2(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
+{
+  return SwigPyObject_acquire(v);
+}
+#endif
+
 SWIGINTERN PyObject*
 SwigPyObject_own(PyObject *v, PyObject *args)
 {
@@ -1839,12 +1858,12 @@ SwigPyObject_own(PyObject *v, PyObject *args)
 #ifdef METH_O
 static PyMethodDef
 swigobject_methods[] = {
-  {(char *)"disown",  (PyCFunction)SwigPyObject_disown,  METH_NOARGS,  (char *)"releases ownership of the pointer"},
-  {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_NOARGS,  (char *)"acquires ownership of the pointer"},
+  {(char *)"disown",  (PyCFunction)SwigPyObject_disown2, METH_NOARGS,  (char *)"releases ownership of the pointer"},
+  {(char *)"acquire", (PyCFunction)SwigPyObject_acquire2,METH_NOARGS,  (char *)"acquires ownership of the pointer"},
   {(char *)"own",     (PyCFunction)SwigPyObject_own,     METH_VARARGS, (char *)"returns/sets ownership of the pointer"},
   {(char *)"append",  (PyCFunction)SwigPyObject_append,  METH_O,       (char *)"appends another 'this' object"},
   {(char *)"next",    (PyCFunction)SwigPyObject_next,    METH_NOARGS,  (char *)"returns the next 'this' object"},
-  {(char *)"__repr__",(PyCFunction)SwigPyObject_repr,    METH_NOARGS,  (char *)"returns object representation"},
+  {(char *)"__repr__",(PyCFunction)SwigPyObject_repr2,   METH_NOARGS,  (char *)"returns object representation"},
   {0, 0, 0, 0}
 };
 #else
@@ -1855,7 +1874,7 @@ swigobject_methods[] = {
   {(char *)"own",     (PyCFunction)SwigPyObject_own,     METH_VARARGS, (char *)"returns/sets ownership of the pointer"},
   {(char *)"append",  (PyCFunction)SwigPyObject_append,  METH_VARARGS, (char *)"appends another 'this' object"},
   {(char *)"next",    (PyCFunction)SwigPyObject_next,    METH_VARARGS, (char *)"returns the next 'this' object"},
-  {(char *)"__repr__",(PyCFunction)SwigPyObject_repr,    METH_VARARGS, (char *)"returns object representation"},
+  {(char *)"__repr__",(PyCFunction)SwigPyObject_repr,    METH_VARARGS, (char *)"returns object representation"},
   {0, 0, 0, 0}
 };
 #endif
@@ -1924,7 +1943,6 @@ SwigPyObject_TypeOnce(void) {
   static int type_init = 0;
   if (!type_init) {
     const PyTypeObject tmp = {
-      /* PyObject header changed in Python 3 */
 #if PY_VERSION_HEX >= 0x03000000
       PyVarObject_HEAD_INIT(NULL, 0)
 #else
@@ -1935,7 +1953,7 @@ SwigPyObject_TypeOnce(void) {
       sizeof(SwigPyObject),               /* tp_basicsize */
       0,                                  /* tp_itemsize */
       (destructor)SwigPyObject_dealloc,   /* tp_dealloc */
-      0,                                  /* tp_print */
+      0,                                  /* tp_print */
 #if PY_VERSION_HEX < 0x02020000
       (getattrfunc)SwigPyObject_getattr,  /* tp_getattr */
 #else
@@ -1943,7 +1961,7 @@ SwigPyObject_TypeOnce(void) {
 #endif
       (setattrfunc)0,                     /* tp_setattr */
 #if PY_VERSION_HEX >= 0x03000000
-      0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */
+      0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */
 #else
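The SwigPyObject_repr2, SwigPyObject_disown2, and SwigPyObject_acquire2 shims introduced above exist because CPython invokes every METH_NOARGS method through the two-argument PyCFunction type, as func(self, NULL); registering a one-argument function through a cast is technically undefined behavior that newer compilers reject. A sketch of the same shim pattern for a hypothetical extension type (ThingObject and thing_reset are illustrative, not SWIG's):

#include <Python.h>

typedef struct { PyObject_HEAD } ThingObject;

static PyObject *thing_reset(ThingObject *self)   /* the real logic */
{
    (void)self;
    Py_RETURN_NONE;
}

/* A METH_NOARGS entry still needs the (PyObject *, PyObject *) signature,
 * because the interpreter always calls it as func(self, NULL). */
static PyObject *thing_reset_noargs(PyObject *self, PyObject *ignored)
{
    (void)ignored;                                /* always NULL here */
    return thing_reset((ThingObject *)self);
}

static PyMethodDef thing_methods[] = {
    {(char *)"reset", thing_reset_noargs, METH_NOARGS, (char *)"reset the thing"},
    {NULL, NULL, 0, NULL}
};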
(cmpfunc)SwigPyObject_compare, /* tp_compare */ #endif @@ -1953,7 +1971,7 @@ SwigPyObject_TypeOnce(void) { 0, /* tp_as_mapping */ (hashfunc)0, /* tp_hash */ (ternaryfunc)0, /* tp_call */ - 0, /* tp_str */ + 0, /* tp_str */ PyObject_GenericGetAttr, /* tp_getattro */ 0, /* tp_setattro */ 0, /* tp_as_buffer */ @@ -2116,7 +2134,6 @@ SwigPyPacked_TypeOnce(void) { static int type_init = 0; if (!type_init) { const PyTypeObject tmp = { - /* PyObject header changed in Python 3 */ #if PY_VERSION_HEX>=0x03000000 PyVarObject_HEAD_INIT(NULL, 0) #else @@ -3057,7 +3074,7 @@ static swig_module_info swig_module = {swig_types, 28, 0, 0, 0, 0}; #endif #define SWIG_name "_freeswitch" -#define SWIGVERSION 0x030010 +#define SWIGVERSION 0x030012 #define SWIG_VERSION SWIGVERSION @@ -3202,7 +3219,7 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) if (*alloc == SWIG_NEWOBJ) #endif { - *cptr = reinterpret_cast< char* >(memcpy((new char[len + 1]), cstr, sizeof(char)*(len + 1))); + *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); *alloc = SWIG_NEWOBJ; } else { *cptr = cstr; @@ -3240,7 +3257,7 @@ SWIG_AsCharPtrAndSize(PyObject *obj, char** cptr, size_t* psize, int *alloc) if (PyString_AsStringAndSize(obj, &cstr, &len) != -1) { if (cptr) { if (alloc) *alloc = SWIG_NEWOBJ; - *cptr = reinterpret_cast< char* >(memcpy((new char[len + 1]), cstr, sizeof(char)*(len + 1))); + *cptr = reinterpret_cast< char* >(memcpy(new char[len + 1], cstr, sizeof(char)*(len + 1))); } if (psize) *psize = len + 1; @@ -3326,7 +3343,7 @@ SWIG_AsVal_double (PyObject *obj, double *val) return SWIG_OK; #if PY_VERSION_HEX < 0x03000000 } else if (PyInt_Check(obj)) { - if (val) *val = PyInt_AsLong(obj); + if (val) *val = (double) PyInt_AsLong(obj); return SWIG_OK; #endif } else if (PyLong_Check(obj)) { @@ -4239,7 +4256,7 @@ SWIGINTERN PyObject *_wrap_IVRMenu_execute(PyObject *SWIGUNUSEDPARM(self), PyObj SWIGINTERN PyObject *IVRMenu_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_IVRMenu, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -4397,7 +4414,7 @@ SWIGINTERN PyObject *_wrap_API_getTime(PyObject *SWIGUNUSEDPARM(self), PyObject SWIGINTERN PyObject *API_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_API, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -4578,7 +4595,7 @@ SWIGINTERN PyObject *_wrap_input_callback_state_t_funcargs_set(PyObject *SWIGUNU if (arg1->funcargs) delete[] arg1->funcargs; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->funcargs = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->funcargs = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->funcargs = 0; } @@ -4649,7 +4666,7 @@ SWIGINTERN PyObject *_wrap_delete_input_callback_state_t(PyObject *SWIGUNUSEDPAR SWIGINTERN PyObject *input_callback_state_t_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if 
(!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_input_callback_state, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -4830,7 +4847,7 @@ SWIGINTERN PyObject *_wrap_delete_DTMF(PyObject *SWIGUNUSEDPARM(self), PyObject SWIGINTERN PyObject *DTMF_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_DTMF, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -5055,7 +5072,7 @@ SWIGINTERN PyObject *_wrap_Stream_get_data(PyObject *SWIGUNUSEDPARM(self), PyObj SWIGINTERN PyObject *Stream_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_Stream, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -5138,7 +5155,7 @@ SWIGINTERN PyObject *_wrap_Event_serialized_string_set(PyObject *SWIGUNUSEDPARM( if (arg1->serialized_string) delete[] arg1->serialized_string; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->serialized_string = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->serialized_string = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->serialized_string = 0; } @@ -5750,7 +5767,7 @@ SWIGINTERN PyObject *_wrap_Event_fire(PyObject *SWIGUNUSEDPARM(self), PyObject * SWIGINTERN PyObject *Event_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_Event, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -5893,7 +5910,7 @@ SWIGINTERN PyObject *_wrap_EventConsumer_e_callback_set(PyObject *SWIGUNUSEDPARM if (arg1->e_callback) delete[] arg1->e_callback; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->e_callback = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->e_callback = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->e_callback = 0; } @@ -5954,7 +5971,7 @@ SWIGINTERN PyObject *_wrap_EventConsumer_e_subclass_name_set(PyObject *SWIGUNUSE if (arg1->e_subclass_name) delete[] arg1->e_subclass_name; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->e_subclass_name = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->e_subclass_name = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->e_subclass_name = 0; } @@ -6015,7 +6032,7 @@ SWIGINTERN PyObject *_wrap_EventConsumer_e_cb_arg_set(PyObject *SWIGUNUSEDPARM(s if (arg1->e_cb_arg) delete[] arg1->e_cb_arg; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->e_cb_arg = (char 
*)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->e_cb_arg = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->e_cb_arg = 0; } @@ -6359,7 +6376,7 @@ SWIGINTERN PyObject *_wrap_EventConsumer_cleanup(PyObject *SWIGUNUSEDPARM(self), SWIGINTERN PyObject *EventConsumer_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_EventConsumer, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -6791,7 +6808,7 @@ SWIGINTERN PyObject *_wrap_CoreSession_uuid_set(PyObject *SWIGUNUSEDPARM(self), if (arg1->uuid) delete[] arg1->uuid; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->uuid = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->uuid = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->uuid = 0; } @@ -6852,7 +6869,7 @@ SWIGINTERN PyObject *_wrap_CoreSession_tts_name_set(PyObject *SWIGUNUSEDPARM(sel if (arg1->tts_name) delete[] arg1->tts_name; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->tts_name = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->tts_name = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->tts_name = 0; } @@ -6913,7 +6930,7 @@ SWIGINTERN PyObject *_wrap_CoreSession_voice_name_set(PyObject *SWIGUNUSEDPARM(s if (arg1->voice_name) delete[] arg1->voice_name; if (arg2) { size_t size = strlen(reinterpret_cast< const char * >(arg2)) + 1; - arg1->voice_name = (char *)reinterpret_cast< char* >(memcpy((new char[size]), reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); + arg1->voice_name = (char *)reinterpret_cast< char* >(memcpy(new char[size], reinterpret_cast< const char * >(arg2), sizeof(char)*(size))); } else { arg1->voice_name = 0; } @@ -9399,7 +9416,7 @@ SWIGINTERN PyObject *_wrap_CoreSession_consoleLog2(PyObject *SWIGUNUSEDPARM(self SWIGINTERN PyObject *CoreSession_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_CoreSession, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } @@ -10455,199 +10472,199 @@ SWIGINTERN PyObject *_wrap_Session_setSelf(PyObject *SWIGUNUSEDPARM(self), PyObj SWIGINTERN PyObject *Session_swigregister(PyObject *SWIGUNUSEDPARM(self), PyObject *args) { PyObject *obj; - if (!PyArg_ParseTuple(args,(char*)"O:swigregister", &obj)) return NULL; + if (!PyArg_ParseTuple(args,(char *)"O:swigregister", &obj)) return NULL; SWIG_TypeNewClientData(SWIGTYPE_p_PYTHON__Session, SWIG_NewClientData(obj)); return SWIG_Py_Void(); } static PyMethodDef SwigMethods[] = { - { (char *)"SWIG_PyInstanceMethod_New", (PyCFunction)SWIG_PyInstanceMethod_New, METH_O, NULL}, - { (char *)"setGlobalVariable", _wrap_setGlobalVariable, METH_VARARGS, NULL}, - { (char *)"getGlobalVariable", _wrap_getGlobalVariable, 
METH_VARARGS, NULL}, - { (char *)"consoleLog", _wrap_consoleLog, METH_VARARGS, NULL}, - { (char *)"consoleLog2", _wrap_consoleLog2, METH_VARARGS, NULL}, - { (char *)"consoleCleanLog", _wrap_consoleCleanLog, METH_VARARGS, NULL}, - { (char *)"running", _wrap_running, METH_VARARGS, NULL}, - { (char *)"email", _wrap_email, METH_VARARGS, NULL}, - { (char *)"new_IVRMenu", _wrap_new_IVRMenu, METH_VARARGS, NULL}, - { (char *)"delete_IVRMenu", _wrap_delete_IVRMenu, METH_VARARGS, NULL}, - { (char *)"IVRMenu_bindAction", _wrap_IVRMenu_bindAction, METH_VARARGS, NULL}, - { (char *)"IVRMenu_execute", _wrap_IVRMenu_execute, METH_VARARGS, NULL}, - { (char *)"IVRMenu_swigregister", IVRMenu_swigregister, METH_VARARGS, NULL}, - { (char *)"new_API", _wrap_new_API, METH_VARARGS, NULL}, - { (char *)"delete_API", _wrap_delete_API, METH_VARARGS, NULL}, - { (char *)"API_execute", _wrap_API_execute, METH_VARARGS, NULL}, - { (char *)"API_executeString", _wrap_API_executeString, METH_VARARGS, NULL}, - { (char *)"API_getTime", _wrap_API_getTime, METH_VARARGS, NULL}, - { (char *)"API_swigregister", API_swigregister, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_function_set", _wrap_input_callback_state_t_function_set, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_function_get", _wrap_input_callback_state_t_function_get, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_threadState_set", _wrap_input_callback_state_t_threadState_set, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_threadState_get", _wrap_input_callback_state_t_threadState_get, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_extra_set", _wrap_input_callback_state_t_extra_set, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_extra_get", _wrap_input_callback_state_t_extra_get, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_funcargs_set", _wrap_input_callback_state_t_funcargs_set, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_funcargs_get", _wrap_input_callback_state_t_funcargs_get, METH_VARARGS, NULL}, - { (char *)"new_input_callback_state_t", _wrap_new_input_callback_state_t, METH_VARARGS, NULL}, - { (char *)"delete_input_callback_state_t", _wrap_delete_input_callback_state_t, METH_VARARGS, NULL}, - { (char *)"input_callback_state_t_swigregister", input_callback_state_t_swigregister, METH_VARARGS, NULL}, - { (char *)"DTMF_digit_set", _wrap_DTMF_digit_set, METH_VARARGS, NULL}, - { (char *)"DTMF_digit_get", _wrap_DTMF_digit_get, METH_VARARGS, NULL}, - { (char *)"DTMF_duration_set", _wrap_DTMF_duration_set, METH_VARARGS, NULL}, - { (char *)"DTMF_duration_get", _wrap_DTMF_duration_get, METH_VARARGS, NULL}, - { (char *)"new_DTMF", _wrap_new_DTMF, METH_VARARGS, NULL}, - { (char *)"delete_DTMF", _wrap_delete_DTMF, METH_VARARGS, NULL}, - { (char *)"DTMF_swigregister", DTMF_swigregister, METH_VARARGS, NULL}, - { (char *)"new_Stream", _wrap_new_Stream, METH_VARARGS, NULL}, - { (char *)"delete_Stream", _wrap_delete_Stream, METH_VARARGS, NULL}, - { (char *)"Stream_read", _wrap_Stream_read, METH_VARARGS, NULL}, - { (char *)"Stream_write", _wrap_Stream_write, METH_VARARGS, NULL}, - { (char *)"Stream_raw_write", _wrap_Stream_raw_write, METH_VARARGS, NULL}, - { (char *)"Stream_get_data", _wrap_Stream_get_data, METH_VARARGS, NULL}, - { (char *)"Stream_swigregister", Stream_swigregister, METH_VARARGS, NULL}, - { (char *)"Event_event_set", _wrap_Event_event_set, METH_VARARGS, NULL}, - { (char *)"Event_event_get", _wrap_Event_event_get, METH_VARARGS, NULL}, - { (char *)"Event_serialized_string_set", 
_wrap_Event_serialized_string_set, METH_VARARGS, NULL}, - { (char *)"Event_serialized_string_get", _wrap_Event_serialized_string_get, METH_VARARGS, NULL}, - { (char *)"Event_mine_set", _wrap_Event_mine_set, METH_VARARGS, NULL}, - { (char *)"Event_mine_get", _wrap_Event_mine_get, METH_VARARGS, NULL}, - { (char *)"new_Event", _wrap_new_Event, METH_VARARGS, NULL}, - { (char *)"delete_Event", _wrap_delete_Event, METH_VARARGS, NULL}, - { (char *)"Event_chat_execute", _wrap_Event_chat_execute, METH_VARARGS, NULL}, - { (char *)"Event_chat_send", _wrap_Event_chat_send, METH_VARARGS, NULL}, - { (char *)"Event_serialize", _wrap_Event_serialize, METH_VARARGS, NULL}, - { (char *)"Event_setPriority", _wrap_Event_setPriority, METH_VARARGS, NULL}, - { (char *)"Event_getHeader", _wrap_Event_getHeader, METH_VARARGS, NULL}, - { (char *)"Event_getBody", _wrap_Event_getBody, METH_VARARGS, NULL}, - { (char *)"Event_getType", _wrap_Event_getType, METH_VARARGS, NULL}, - { (char *)"Event_addBody", _wrap_Event_addBody, METH_VARARGS, NULL}, - { (char *)"Event_addHeader", _wrap_Event_addHeader, METH_VARARGS, NULL}, - { (char *)"Event_delHeader", _wrap_Event_delHeader, METH_VARARGS, NULL}, - { (char *)"Event_fire", _wrap_Event_fire, METH_VARARGS, NULL}, - { (char *)"Event_swigregister", Event_swigregister, METH_VARARGS, NULL}, - { (char *)"EventConsumer_events_set", _wrap_EventConsumer_events_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_events_get", _wrap_EventConsumer_events_get, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_event_id_set", _wrap_EventConsumer_e_event_id_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_event_id_get", _wrap_EventConsumer_e_event_id_get, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_callback_set", _wrap_EventConsumer_e_callback_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_callback_get", _wrap_EventConsumer_e_callback_get, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_subclass_name_set", _wrap_EventConsumer_e_subclass_name_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_subclass_name_get", _wrap_EventConsumer_e_subclass_name_get, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_cb_arg_set", _wrap_EventConsumer_e_cb_arg_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_e_cb_arg_get", _wrap_EventConsumer_e_cb_arg_get, METH_VARARGS, NULL}, - { (char *)"EventConsumer_enodes_set", _wrap_EventConsumer_enodes_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_enodes_get", _wrap_EventConsumer_enodes_get, METH_VARARGS, NULL}, - { (char *)"EventConsumer_node_index_set", _wrap_EventConsumer_node_index_set, METH_VARARGS, NULL}, - { (char *)"EventConsumer_node_index_get", _wrap_EventConsumer_node_index_get, METH_VARARGS, NULL}, - { (char *)"new_EventConsumer", _wrap_new_EventConsumer, METH_VARARGS, NULL}, - { (char *)"delete_EventConsumer", _wrap_delete_EventConsumer, METH_VARARGS, NULL}, - { (char *)"EventConsumer_bind", _wrap_EventConsumer_bind, METH_VARARGS, NULL}, - { (char *)"EventConsumer_pop", _wrap_EventConsumer_pop, METH_VARARGS, NULL}, - { (char *)"EventConsumer_cleanup", _wrap_EventConsumer_cleanup, METH_VARARGS, NULL}, - { (char *)"EventConsumer_swigregister", EventConsumer_swigregister, METH_VARARGS, NULL}, - { (char *)"delete_CoreSession", _wrap_delete_CoreSession, METH_VARARGS, NULL}, - { (char *)"CoreSession_session_set", _wrap_CoreSession_session_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_session_get", _wrap_CoreSession_session_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_channel_set", _wrap_CoreSession_channel_set, 
METH_VARARGS, NULL}, - { (char *)"CoreSession_channel_get", _wrap_CoreSession_channel_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_flags_set", _wrap_CoreSession_flags_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_flags_get", _wrap_CoreSession_flags_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_allocated_set", _wrap_CoreSession_allocated_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_allocated_get", _wrap_CoreSession_allocated_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_cb_state_set", _wrap_CoreSession_cb_state_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_cb_state_get", _wrap_CoreSession_cb_state_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_hook_state_set", _wrap_CoreSession_hook_state_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_hook_state_get", _wrap_CoreSession_hook_state_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_cause_set", _wrap_CoreSession_cause_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_cause_get", _wrap_CoreSession_cause_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_uuid_set", _wrap_CoreSession_uuid_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_uuid_get", _wrap_CoreSession_uuid_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_tts_name_set", _wrap_CoreSession_tts_name_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_tts_name_get", _wrap_CoreSession_tts_name_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_voice_name_set", _wrap_CoreSession_voice_name_set, METH_VARARGS, NULL}, - { (char *)"CoreSession_voice_name_get", _wrap_CoreSession_voice_name_get, METH_VARARGS, NULL}, - { (char *)"CoreSession_insertFile", _wrap_CoreSession_insertFile, METH_VARARGS, NULL}, - { (char *)"CoreSession_answer", _wrap_CoreSession_answer, METH_VARARGS, NULL}, - { (char *)"CoreSession__print", _wrap_CoreSession__print, METH_VARARGS, NULL}, - { (char *)"CoreSession_preAnswer", _wrap_CoreSession_preAnswer, METH_VARARGS, NULL}, - { (char *)"CoreSession_hangup", _wrap_CoreSession_hangup, METH_VARARGS, NULL}, - { (char *)"CoreSession_hangupState", _wrap_CoreSession_hangupState, METH_VARARGS, NULL}, - { (char *)"CoreSession_setVariable", _wrap_CoreSession_setVariable, METH_VARARGS, NULL}, - { (char *)"CoreSession_setPrivate", _wrap_CoreSession_setPrivate, METH_VARARGS, NULL}, - { (char *)"CoreSession_getPrivate", _wrap_CoreSession_getPrivate, METH_VARARGS, NULL}, - { (char *)"CoreSession_getVariable", _wrap_CoreSession_getVariable, METH_VARARGS, NULL}, - { (char *)"CoreSession_process_callback_result", _wrap_CoreSession_process_callback_result, METH_VARARGS, NULL}, - { (char *)"CoreSession_say", _wrap_CoreSession_say, METH_VARARGS, NULL}, - { (char *)"CoreSession_sayPhrase", _wrap_CoreSession_sayPhrase, METH_VARARGS, NULL}, - { (char *)"CoreSession_hangupCause", _wrap_CoreSession_hangupCause, METH_VARARGS, NULL}, - { (char *)"CoreSession_getState", _wrap_CoreSession_getState, METH_VARARGS, NULL}, - { (char *)"CoreSession_recordFile", _wrap_CoreSession_recordFile, METH_VARARGS, NULL}, - { (char *)"CoreSession_originate", _wrap_CoreSession_originate, METH_VARARGS, NULL}, - { (char *)"CoreSession_destroy", _wrap_CoreSession_destroy, METH_VARARGS, NULL}, - { (char *)"CoreSession_setDTMFCallback", _wrap_CoreSession_setDTMFCallback, METH_VARARGS, NULL}, - { (char *)"CoreSession_speak", _wrap_CoreSession_speak, METH_VARARGS, NULL}, - { (char *)"CoreSession_set_tts_parms", _wrap_CoreSession_set_tts_parms, METH_VARARGS, NULL}, - { (char *)"CoreSession_set_tts_params", _wrap_CoreSession_set_tts_params, METH_VARARGS, NULL}, - { (char 
*)"CoreSession_collectDigits", _wrap_CoreSession_collectDigits, METH_VARARGS, NULL}, - { (char *)"CoreSession_getDigits", _wrap_CoreSession_getDigits, METH_VARARGS, NULL}, - { (char *)"CoreSession_transfer", _wrap_CoreSession_transfer, METH_VARARGS, NULL}, - { (char *)"CoreSession_read", _wrap_CoreSession_read, METH_VARARGS, NULL}, - { (char *)"CoreSession_detectSpeech", _wrap_CoreSession_detectSpeech, METH_VARARGS, NULL}, - { (char *)"CoreSession_playAndGetDigits", _wrap_CoreSession_playAndGetDigits, METH_VARARGS, NULL}, - { (char *)"CoreSession_playAndDetectSpeech", _wrap_CoreSession_playAndDetectSpeech, METH_VARARGS, NULL}, - { (char *)"CoreSession_streamFile", _wrap_CoreSession_streamFile, METH_VARARGS, NULL}, - { (char *)"CoreSession_sleep", _wrap_CoreSession_sleep, METH_VARARGS, NULL}, - { (char *)"CoreSession_flushEvents", _wrap_CoreSession_flushEvents, METH_VARARGS, NULL}, - { (char *)"CoreSession_flushDigits", _wrap_CoreSession_flushDigits, METH_VARARGS, NULL}, - { (char *)"CoreSession_setAutoHangup", _wrap_CoreSession_setAutoHangup, METH_VARARGS, NULL}, - { (char *)"CoreSession_setHangupHook", _wrap_CoreSession_setHangupHook, METH_VARARGS, NULL}, - { (char *)"CoreSession_ready", _wrap_CoreSession_ready, METH_VARARGS, NULL}, - { (char *)"CoreSession_bridged", _wrap_CoreSession_bridged, METH_VARARGS, NULL}, - { (char *)"CoreSession_answered", _wrap_CoreSession_answered, METH_VARARGS, NULL}, - { (char *)"CoreSession_mediaReady", _wrap_CoreSession_mediaReady, METH_VARARGS, NULL}, - { (char *)"CoreSession_waitForAnswer", _wrap_CoreSession_waitForAnswer, METH_VARARGS, NULL}, - { (char *)"CoreSession_execute", _wrap_CoreSession_execute, METH_VARARGS, NULL}, - { (char *)"CoreSession_sendEvent", _wrap_CoreSession_sendEvent, METH_VARARGS, NULL}, - { (char *)"CoreSession_setEventData", _wrap_CoreSession_setEventData, METH_VARARGS, NULL}, - { (char *)"CoreSession_getXMLCDR", _wrap_CoreSession_getXMLCDR, METH_VARARGS, NULL}, - { (char *)"CoreSession_begin_allow_threads", _wrap_CoreSession_begin_allow_threads, METH_VARARGS, NULL}, - { (char *)"CoreSession_end_allow_threads", _wrap_CoreSession_end_allow_threads, METH_VARARGS, NULL}, - { (char *)"CoreSession_get_uuid", _wrap_CoreSession_get_uuid, METH_VARARGS, NULL}, - { (char *)"CoreSession_get_cb_args", _wrap_CoreSession_get_cb_args, METH_VARARGS, NULL}, - { (char *)"CoreSession_check_hangup_hook", _wrap_CoreSession_check_hangup_hook, METH_VARARGS, NULL}, - { (char *)"CoreSession_run_dtmf_callback", _wrap_CoreSession_run_dtmf_callback, METH_VARARGS, NULL}, - { (char *)"CoreSession_consoleLog", _wrap_CoreSession_consoleLog, METH_VARARGS, NULL}, - { (char *)"CoreSession_consoleLog2", _wrap_CoreSession_consoleLog2, METH_VARARGS, NULL}, - { (char *)"CoreSession_swigregister", CoreSession_swigregister, METH_VARARGS, NULL}, - { (char *)"console_log", _wrap_console_log, METH_VARARGS, NULL}, - { (char *)"console_log2", _wrap_console_log2, METH_VARARGS, NULL}, - { (char *)"console_clean_log", _wrap_console_clean_log, METH_VARARGS, NULL}, - { (char *)"msleep", _wrap_msleep, METH_VARARGS, NULL}, - { (char *)"bridge", _wrap_bridge, METH_VARARGS, NULL}, - { (char *)"hanguphook", _wrap_hanguphook, METH_VARARGS, NULL}, - { (char *)"dtmf_callback", _wrap_dtmf_callback, METH_VARARGS, NULL}, - { (char *)"new_Session", _wrap_new_Session, METH_VARARGS, NULL}, - { (char *)"delete_Session", _wrap_delete_Session, METH_VARARGS, NULL}, - { (char *)"Session_begin_allow_threads", _wrap_Session_begin_allow_threads, METH_VARARGS, NULL}, - { (char 
*)"Session_end_allow_threads", _wrap_Session_end_allow_threads, METH_VARARGS, NULL}, - { (char *)"Session_check_hangup_hook", _wrap_Session_check_hangup_hook, METH_VARARGS, NULL}, - { (char *)"Session_destroy", _wrap_Session_destroy, METH_VARARGS, NULL}, - { (char *)"Session_run_dtmf_callback", _wrap_Session_run_dtmf_callback, METH_VARARGS, NULL}, - { (char *)"Session_setInputCallback", _wrap_Session_setInputCallback, METH_VARARGS, NULL}, - { (char *)"Session_unsetInputCallback", _wrap_Session_unsetInputCallback, METH_VARARGS, NULL}, - { (char *)"Session_setHangupHook", _wrap_Session_setHangupHook, METH_VARARGS, NULL}, - { (char *)"Session_ready", _wrap_Session_ready, METH_VARARGS, NULL}, - { (char *)"Session_cb_function_set", _wrap_Session_cb_function_set, METH_VARARGS, NULL}, - { (char *)"Session_cb_function_get", _wrap_Session_cb_function_get, METH_VARARGS, NULL}, - { (char *)"Session_cb_arg_set", _wrap_Session_cb_arg_set, METH_VARARGS, NULL}, - { (char *)"Session_cb_arg_get", _wrap_Session_cb_arg_get, METH_VARARGS, NULL}, - { (char *)"Session_hangup_func_set", _wrap_Session_hangup_func_set, METH_VARARGS, NULL}, - { (char *)"Session_hangup_func_get", _wrap_Session_hangup_func_get, METH_VARARGS, NULL}, - { (char *)"Session_hangup_func_arg_set", _wrap_Session_hangup_func_arg_set, METH_VARARGS, NULL}, - { (char *)"Session_hangup_func_arg_get", _wrap_Session_hangup_func_arg_get, METH_VARARGS, NULL}, - { (char *)"Session_setPython", _wrap_Session_setPython, METH_VARARGS, NULL}, - { (char *)"Session_setSelf", _wrap_Session_setSelf, METH_VARARGS, NULL}, - { (char *)"Session_swigregister", Session_swigregister, METH_VARARGS, NULL}, + { "SWIG_PyInstanceMethod_New", SWIG_PyInstanceMethod_New, METH_O, NULL}, + { "setGlobalVariable", _wrap_setGlobalVariable, METH_VARARGS, NULL}, + { "getGlobalVariable", _wrap_getGlobalVariable, METH_VARARGS, NULL}, + { "consoleLog", _wrap_consoleLog, METH_VARARGS, NULL}, + { "consoleLog2", _wrap_consoleLog2, METH_VARARGS, NULL}, + { "consoleCleanLog", _wrap_consoleCleanLog, METH_VARARGS, NULL}, + { "running", _wrap_running, METH_VARARGS, NULL}, + { "email", _wrap_email, METH_VARARGS, NULL}, + { "new_IVRMenu", _wrap_new_IVRMenu, METH_VARARGS, NULL}, + { "delete_IVRMenu", _wrap_delete_IVRMenu, METH_VARARGS, NULL}, + { "IVRMenu_bindAction", _wrap_IVRMenu_bindAction, METH_VARARGS, NULL}, + { "IVRMenu_execute", _wrap_IVRMenu_execute, METH_VARARGS, NULL}, + { "IVRMenu_swigregister", IVRMenu_swigregister, METH_VARARGS, NULL}, + { "new_API", _wrap_new_API, METH_VARARGS, NULL}, + { "delete_API", _wrap_delete_API, METH_VARARGS, NULL}, + { "API_execute", _wrap_API_execute, METH_VARARGS, NULL}, + { "API_executeString", _wrap_API_executeString, METH_VARARGS, NULL}, + { "API_getTime", _wrap_API_getTime, METH_VARARGS, NULL}, + { "API_swigregister", API_swigregister, METH_VARARGS, NULL}, + { "input_callback_state_t_function_set", _wrap_input_callback_state_t_function_set, METH_VARARGS, NULL}, + { "input_callback_state_t_function_get", _wrap_input_callback_state_t_function_get, METH_VARARGS, NULL}, + { "input_callback_state_t_threadState_set", _wrap_input_callback_state_t_threadState_set, METH_VARARGS, NULL}, + { "input_callback_state_t_threadState_get", _wrap_input_callback_state_t_threadState_get, METH_VARARGS, NULL}, + { "input_callback_state_t_extra_set", _wrap_input_callback_state_t_extra_set, METH_VARARGS, NULL}, + { "input_callback_state_t_extra_get", _wrap_input_callback_state_t_extra_get, METH_VARARGS, NULL}, + { "input_callback_state_t_funcargs_set", 
_wrap_input_callback_state_t_funcargs_set, METH_VARARGS, NULL}, + { "input_callback_state_t_funcargs_get", _wrap_input_callback_state_t_funcargs_get, METH_VARARGS, NULL}, + { "new_input_callback_state_t", _wrap_new_input_callback_state_t, METH_VARARGS, NULL}, + { "delete_input_callback_state_t", _wrap_delete_input_callback_state_t, METH_VARARGS, NULL}, + { "input_callback_state_t_swigregister", input_callback_state_t_swigregister, METH_VARARGS, NULL}, + { "DTMF_digit_set", _wrap_DTMF_digit_set, METH_VARARGS, NULL}, + { "DTMF_digit_get", _wrap_DTMF_digit_get, METH_VARARGS, NULL}, + { "DTMF_duration_set", _wrap_DTMF_duration_set, METH_VARARGS, NULL}, + { "DTMF_duration_get", _wrap_DTMF_duration_get, METH_VARARGS, NULL}, + { "new_DTMF", _wrap_new_DTMF, METH_VARARGS, NULL}, + { "delete_DTMF", _wrap_delete_DTMF, METH_VARARGS, NULL}, + { "DTMF_swigregister", DTMF_swigregister, METH_VARARGS, NULL}, + { "new_Stream", _wrap_new_Stream, METH_VARARGS, NULL}, + { "delete_Stream", _wrap_delete_Stream, METH_VARARGS, NULL}, + { "Stream_read", _wrap_Stream_read, METH_VARARGS, NULL}, + { "Stream_write", _wrap_Stream_write, METH_VARARGS, NULL}, + { "Stream_raw_write", _wrap_Stream_raw_write, METH_VARARGS, NULL}, + { "Stream_get_data", _wrap_Stream_get_data, METH_VARARGS, NULL}, + { "Stream_swigregister", Stream_swigregister, METH_VARARGS, NULL}, + { "Event_event_set", _wrap_Event_event_set, METH_VARARGS, NULL}, + { "Event_event_get", _wrap_Event_event_get, METH_VARARGS, NULL}, + { "Event_serialized_string_set", _wrap_Event_serialized_string_set, METH_VARARGS, NULL}, + { "Event_serialized_string_get", _wrap_Event_serialized_string_get, METH_VARARGS, NULL}, + { "Event_mine_set", _wrap_Event_mine_set, METH_VARARGS, NULL}, + { "Event_mine_get", _wrap_Event_mine_get, METH_VARARGS, NULL}, + { "new_Event", _wrap_new_Event, METH_VARARGS, NULL}, + { "delete_Event", _wrap_delete_Event, METH_VARARGS, NULL}, + { "Event_chat_execute", _wrap_Event_chat_execute, METH_VARARGS, NULL}, + { "Event_chat_send", _wrap_Event_chat_send, METH_VARARGS, NULL}, + { "Event_serialize", _wrap_Event_serialize, METH_VARARGS, NULL}, + { "Event_setPriority", _wrap_Event_setPriority, METH_VARARGS, NULL}, + { "Event_getHeader", _wrap_Event_getHeader, METH_VARARGS, NULL}, + { "Event_getBody", _wrap_Event_getBody, METH_VARARGS, NULL}, + { "Event_getType", _wrap_Event_getType, METH_VARARGS, NULL}, + { "Event_addBody", _wrap_Event_addBody, METH_VARARGS, NULL}, + { "Event_addHeader", _wrap_Event_addHeader, METH_VARARGS, NULL}, + { "Event_delHeader", _wrap_Event_delHeader, METH_VARARGS, NULL}, + { "Event_fire", _wrap_Event_fire, METH_VARARGS, NULL}, + { "Event_swigregister", Event_swigregister, METH_VARARGS, NULL}, + { "EventConsumer_events_set", _wrap_EventConsumer_events_set, METH_VARARGS, NULL}, + { "EventConsumer_events_get", _wrap_EventConsumer_events_get, METH_VARARGS, NULL}, + { "EventConsumer_e_event_id_set", _wrap_EventConsumer_e_event_id_set, METH_VARARGS, NULL}, + { "EventConsumer_e_event_id_get", _wrap_EventConsumer_e_event_id_get, METH_VARARGS, NULL}, + { "EventConsumer_e_callback_set", _wrap_EventConsumer_e_callback_set, METH_VARARGS, NULL}, + { "EventConsumer_e_callback_get", _wrap_EventConsumer_e_callback_get, METH_VARARGS, NULL}, + { "EventConsumer_e_subclass_name_set", _wrap_EventConsumer_e_subclass_name_set, METH_VARARGS, NULL}, + { "EventConsumer_e_subclass_name_get", _wrap_EventConsumer_e_subclass_name_get, METH_VARARGS, NULL}, + { "EventConsumer_e_cb_arg_set", _wrap_EventConsumer_e_cb_arg_set, METH_VARARGS, NULL}, + { 
"EventConsumer_e_cb_arg_get", _wrap_EventConsumer_e_cb_arg_get, METH_VARARGS, NULL}, + { "EventConsumer_enodes_set", _wrap_EventConsumer_enodes_set, METH_VARARGS, NULL}, + { "EventConsumer_enodes_get", _wrap_EventConsumer_enodes_get, METH_VARARGS, NULL}, + { "EventConsumer_node_index_set", _wrap_EventConsumer_node_index_set, METH_VARARGS, NULL}, + { "EventConsumer_node_index_get", _wrap_EventConsumer_node_index_get, METH_VARARGS, NULL}, + { "new_EventConsumer", _wrap_new_EventConsumer, METH_VARARGS, NULL}, + { "delete_EventConsumer", _wrap_delete_EventConsumer, METH_VARARGS, NULL}, + { "EventConsumer_bind", _wrap_EventConsumer_bind, METH_VARARGS, NULL}, + { "EventConsumer_pop", _wrap_EventConsumer_pop, METH_VARARGS, NULL}, + { "EventConsumer_cleanup", _wrap_EventConsumer_cleanup, METH_VARARGS, NULL}, + { "EventConsumer_swigregister", EventConsumer_swigregister, METH_VARARGS, NULL}, + { "delete_CoreSession", _wrap_delete_CoreSession, METH_VARARGS, NULL}, + { "CoreSession_session_set", _wrap_CoreSession_session_set, METH_VARARGS, NULL}, + { "CoreSession_session_get", _wrap_CoreSession_session_get, METH_VARARGS, NULL}, + { "CoreSession_channel_set", _wrap_CoreSession_channel_set, METH_VARARGS, NULL}, + { "CoreSession_channel_get", _wrap_CoreSession_channel_get, METH_VARARGS, NULL}, + { "CoreSession_flags_set", _wrap_CoreSession_flags_set, METH_VARARGS, NULL}, + { "CoreSession_flags_get", _wrap_CoreSession_flags_get, METH_VARARGS, NULL}, + { "CoreSession_allocated_set", _wrap_CoreSession_allocated_set, METH_VARARGS, NULL}, + { "CoreSession_allocated_get", _wrap_CoreSession_allocated_get, METH_VARARGS, NULL}, + { "CoreSession_cb_state_set", _wrap_CoreSession_cb_state_set, METH_VARARGS, NULL}, + { "CoreSession_cb_state_get", _wrap_CoreSession_cb_state_get, METH_VARARGS, NULL}, + { "CoreSession_hook_state_set", _wrap_CoreSession_hook_state_set, METH_VARARGS, NULL}, + { "CoreSession_hook_state_get", _wrap_CoreSession_hook_state_get, METH_VARARGS, NULL}, + { "CoreSession_cause_set", _wrap_CoreSession_cause_set, METH_VARARGS, NULL}, + { "CoreSession_cause_get", _wrap_CoreSession_cause_get, METH_VARARGS, NULL}, + { "CoreSession_uuid_set", _wrap_CoreSession_uuid_set, METH_VARARGS, NULL}, + { "CoreSession_uuid_get", _wrap_CoreSession_uuid_get, METH_VARARGS, NULL}, + { "CoreSession_tts_name_set", _wrap_CoreSession_tts_name_set, METH_VARARGS, NULL}, + { "CoreSession_tts_name_get", _wrap_CoreSession_tts_name_get, METH_VARARGS, NULL}, + { "CoreSession_voice_name_set", _wrap_CoreSession_voice_name_set, METH_VARARGS, NULL}, + { "CoreSession_voice_name_get", _wrap_CoreSession_voice_name_get, METH_VARARGS, NULL}, + { "CoreSession_insertFile", _wrap_CoreSession_insertFile, METH_VARARGS, NULL}, + { "CoreSession_answer", _wrap_CoreSession_answer, METH_VARARGS, NULL}, + { "CoreSession__print", _wrap_CoreSession__print, METH_VARARGS, NULL}, + { "CoreSession_preAnswer", _wrap_CoreSession_preAnswer, METH_VARARGS, NULL}, + { "CoreSession_hangup", _wrap_CoreSession_hangup, METH_VARARGS, NULL}, + { "CoreSession_hangupState", _wrap_CoreSession_hangupState, METH_VARARGS, NULL}, + { "CoreSession_setVariable", _wrap_CoreSession_setVariable, METH_VARARGS, NULL}, + { "CoreSession_setPrivate", _wrap_CoreSession_setPrivate, METH_VARARGS, NULL}, + { "CoreSession_getPrivate", _wrap_CoreSession_getPrivate, METH_VARARGS, NULL}, + { "CoreSession_getVariable", _wrap_CoreSession_getVariable, METH_VARARGS, NULL}, + { "CoreSession_process_callback_result", _wrap_CoreSession_process_callback_result, METH_VARARGS, NULL}, + { 
"CoreSession_say", _wrap_CoreSession_say, METH_VARARGS, NULL}, + { "CoreSession_sayPhrase", _wrap_CoreSession_sayPhrase, METH_VARARGS, NULL}, + { "CoreSession_hangupCause", _wrap_CoreSession_hangupCause, METH_VARARGS, NULL}, + { "CoreSession_getState", _wrap_CoreSession_getState, METH_VARARGS, NULL}, + { "CoreSession_recordFile", _wrap_CoreSession_recordFile, METH_VARARGS, NULL}, + { "CoreSession_originate", _wrap_CoreSession_originate, METH_VARARGS, NULL}, + { "CoreSession_destroy", _wrap_CoreSession_destroy, METH_VARARGS, NULL}, + { "CoreSession_setDTMFCallback", _wrap_CoreSession_setDTMFCallback, METH_VARARGS, NULL}, + { "CoreSession_speak", _wrap_CoreSession_speak, METH_VARARGS, NULL}, + { "CoreSession_set_tts_parms", _wrap_CoreSession_set_tts_parms, METH_VARARGS, NULL}, + { "CoreSession_set_tts_params", _wrap_CoreSession_set_tts_params, METH_VARARGS, NULL}, + { "CoreSession_collectDigits", _wrap_CoreSession_collectDigits, METH_VARARGS, NULL}, + { "CoreSession_getDigits", _wrap_CoreSession_getDigits, METH_VARARGS, NULL}, + { "CoreSession_transfer", _wrap_CoreSession_transfer, METH_VARARGS, NULL}, + { "CoreSession_read", _wrap_CoreSession_read, METH_VARARGS, NULL}, + { "CoreSession_detectSpeech", _wrap_CoreSession_detectSpeech, METH_VARARGS, NULL}, + { "CoreSession_playAndGetDigits", _wrap_CoreSession_playAndGetDigits, METH_VARARGS, NULL}, + { "CoreSession_playAndDetectSpeech", _wrap_CoreSession_playAndDetectSpeech, METH_VARARGS, NULL}, + { "CoreSession_streamFile", _wrap_CoreSession_streamFile, METH_VARARGS, NULL}, + { "CoreSession_sleep", _wrap_CoreSession_sleep, METH_VARARGS, NULL}, + { "CoreSession_flushEvents", _wrap_CoreSession_flushEvents, METH_VARARGS, NULL}, + { "CoreSession_flushDigits", _wrap_CoreSession_flushDigits, METH_VARARGS, NULL}, + { "CoreSession_setAutoHangup", _wrap_CoreSession_setAutoHangup, METH_VARARGS, NULL}, + { "CoreSession_setHangupHook", _wrap_CoreSession_setHangupHook, METH_VARARGS, NULL}, + { "CoreSession_ready", _wrap_CoreSession_ready, METH_VARARGS, NULL}, + { "CoreSession_bridged", _wrap_CoreSession_bridged, METH_VARARGS, NULL}, + { "CoreSession_answered", _wrap_CoreSession_answered, METH_VARARGS, NULL}, + { "CoreSession_mediaReady", _wrap_CoreSession_mediaReady, METH_VARARGS, NULL}, + { "CoreSession_waitForAnswer", _wrap_CoreSession_waitForAnswer, METH_VARARGS, NULL}, + { "CoreSession_execute", _wrap_CoreSession_execute, METH_VARARGS, NULL}, + { "CoreSession_sendEvent", _wrap_CoreSession_sendEvent, METH_VARARGS, NULL}, + { "CoreSession_setEventData", _wrap_CoreSession_setEventData, METH_VARARGS, NULL}, + { "CoreSession_getXMLCDR", _wrap_CoreSession_getXMLCDR, METH_VARARGS, NULL}, + { "CoreSession_begin_allow_threads", _wrap_CoreSession_begin_allow_threads, METH_VARARGS, NULL}, + { "CoreSession_end_allow_threads", _wrap_CoreSession_end_allow_threads, METH_VARARGS, NULL}, + { "CoreSession_get_uuid", _wrap_CoreSession_get_uuid, METH_VARARGS, NULL}, + { "CoreSession_get_cb_args", _wrap_CoreSession_get_cb_args, METH_VARARGS, NULL}, + { "CoreSession_check_hangup_hook", _wrap_CoreSession_check_hangup_hook, METH_VARARGS, NULL}, + { "CoreSession_run_dtmf_callback", _wrap_CoreSession_run_dtmf_callback, METH_VARARGS, NULL}, + { "CoreSession_consoleLog", _wrap_CoreSession_consoleLog, METH_VARARGS, NULL}, + { "CoreSession_consoleLog2", _wrap_CoreSession_consoleLog2, METH_VARARGS, NULL}, + { "CoreSession_swigregister", CoreSession_swigregister, METH_VARARGS, NULL}, + { "console_log", _wrap_console_log, METH_VARARGS, NULL}, + { "console_log2", _wrap_console_log2, 
METH_VARARGS, NULL}, + { "console_clean_log", _wrap_console_clean_log, METH_VARARGS, NULL}, + { "msleep", _wrap_msleep, METH_VARARGS, NULL}, + { "bridge", _wrap_bridge, METH_VARARGS, NULL}, + { "hanguphook", _wrap_hanguphook, METH_VARARGS, NULL}, + { "dtmf_callback", _wrap_dtmf_callback, METH_VARARGS, NULL}, + { "new_Session", _wrap_new_Session, METH_VARARGS, NULL}, + { "delete_Session", _wrap_delete_Session, METH_VARARGS, NULL}, + { "Session_begin_allow_threads", _wrap_Session_begin_allow_threads, METH_VARARGS, NULL}, + { "Session_end_allow_threads", _wrap_Session_end_allow_threads, METH_VARARGS, NULL}, + { "Session_check_hangup_hook", _wrap_Session_check_hangup_hook, METH_VARARGS, NULL}, + { "Session_destroy", _wrap_Session_destroy, METH_VARARGS, NULL}, + { "Session_run_dtmf_callback", _wrap_Session_run_dtmf_callback, METH_VARARGS, NULL}, + { "Session_setInputCallback", _wrap_Session_setInputCallback, METH_VARARGS, NULL}, + { "Session_unsetInputCallback", _wrap_Session_unsetInputCallback, METH_VARARGS, NULL}, + { "Session_setHangupHook", _wrap_Session_setHangupHook, METH_VARARGS, NULL}, + { "Session_ready", _wrap_Session_ready, METH_VARARGS, NULL}, + { "Session_cb_function_set", _wrap_Session_cb_function_set, METH_VARARGS, NULL}, + { "Session_cb_function_get", _wrap_Session_cb_function_get, METH_VARARGS, NULL}, + { "Session_cb_arg_set", _wrap_Session_cb_arg_set, METH_VARARGS, NULL}, + { "Session_cb_arg_get", _wrap_Session_cb_arg_get, METH_VARARGS, NULL}, + { "Session_hangup_func_set", _wrap_Session_hangup_func_set, METH_VARARGS, NULL}, + { "Session_hangup_func_get", _wrap_Session_hangup_func_get, METH_VARARGS, NULL}, + { "Session_hangup_func_arg_set", _wrap_Session_hangup_func_arg_set, METH_VARARGS, NULL}, + { "Session_hangup_func_arg_get", _wrap_Session_hangup_func_arg_get, METH_VARARGS, NULL}, + { "Session_setPython", _wrap_Session_setPython, METH_VARARGS, NULL}, + { "Session_setSelf", _wrap_Session_setSelf, METH_VARARGS, NULL}, + { "Session_swigregister", Session_swigregister, METH_VARARGS, NULL}, { NULL, NULL, 0, NULL } }; @@ -11154,7 +11171,6 @@ extern "C" { static int type_init = 0; if (!type_init) { const PyTypeObject tmp = { - /* PyObject header changed in Python 3 */ #if PY_VERSION_HEX >= 0x03000000 PyVarObject_HEAD_INIT(NULL, 0) #else @@ -11316,9 +11332,9 @@ extern "C" { char *ndoc = (char*)malloc(ldoc + lptr + 10); if (ndoc) { char *buff = ndoc; - strncpy(buff, methods[i].ml_doc, ldoc); + memcpy(buff, methods[i].ml_doc, ldoc); buff += ldoc; - strncpy(buff, "swig_ptr: ", 10); + memcpy(buff, "swig_ptr: ", 10); buff += 10; SWIG_PackVoidPtr(buff, ptr, ty->name, lptr); methods[i].ml_doc = ndoc; @@ -11380,19 +11396,19 @@ SWIG_init(void) { (char *)"this", &SwigPyBuiltin_ThisClosure, NULL, NULL, NULL }; static SwigPyGetSet thisown_getset_closure = { - (PyCFunction) SwigPyObject_own, - (PyCFunction) SwigPyObject_own + SwigPyObject_own, + SwigPyObject_own }; static PyGetSetDef thisown_getset_def = { (char *)"thisown", SwigPyBuiltin_GetterClosure, SwigPyBuiltin_SetterClosure, NULL, &thisown_getset_closure }; - PyObject *metatype_args; PyTypeObject *builtin_pytype; int builtin_base_count; swig_type_info *builtin_basetype; PyObject *tuple; PyGetSetDescrObject *static_getset; PyTypeObject *metatype; + PyTypeObject *swigpyobject; SwigPyClientData *cd; PyObject *public_interface, *public_symbol; PyObject *this_descr; @@ -11407,14 +11423,9 @@ SWIG_init(void) { (void)static_getset; (void)self; - /* metatype is used to implement static member variables. 
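One change buried in the hunk above swaps strncpy for memcpy where the generated module glues "swig_ptr: " and a packed pointer onto a method docstring. Both copy lengths are already computed, so memcpy states the intent exactly, while strncpy would stop at an embedded NUL, pad with NULs, and (for exact-length copies that omit the terminator) draw truncation warnings from newer compilers. A reduced sketch of the concatenation, with the buffer arithmetic assumed from the ldoc + lptr + 10 allocation above (append_swig_ptr_prefix is hypothetical):

#include <cstdlib>
#include <cstring>

static char *append_swig_ptr_prefix(const char *doc)
{
    size_t ldoc = strlen(doc);
    char *ndoc = static_cast<char *>(malloc(ldoc + 10 + 1));
    if (ndoc == NULL) return NULL;
    char *buff = ndoc;
    memcpy(buff, doc, ldoc);          /* exact byte count, no NUL semantics */
    buff += ldoc;
    memcpy(buff, "swig_ptr: ", 10);   /* deliberately copied without its NUL */
    buff += 10;
    *buff = '\0';                     /* the real code packs a pointer here */
    return ndoc;
}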
*/
-  metatype_args = Py_BuildValue("(s(O){})", "SwigPyObjectType", &PyType_Type);
-  assert(metatype_args);
-  metatype = (PyTypeObject *) PyType_Type.tp_call((PyObject *) &PyType_Type, metatype_args, NULL);
+  /* Metaclass is used to implement static member variables */
+  metatype = SwigPyObjectType();
   assert(metatype);
-  Py_DECREF(metatype_args);
-  metatype->tp_setattro = (setattrofunc) &SwigPyObjectType_setattro;
-  assert(PyType_Ready(metatype) >= 0);
 #endif
 
   /* Fix SwigMethods to carry the callback ptrs when needed */
@@ -11432,13 +11443,15 @@ SWIG_init(void) {
   SWIG_InitializeModule(0);
 
 #ifdef SWIGPYTHON_BUILTIN
+  swigpyobject = SwigPyObject_TypeOnce();
+
   SwigPyObject_stype = SWIG_MangledTypeQuery("_p_SwigPyObject");
   assert(SwigPyObject_stype);
   cd = (SwigPyClientData*) SwigPyObject_stype->clientdata;
   if (!cd) {
     SwigPyObject_stype->clientdata = &SwigPyObject_clientdata;
-    SwigPyObject_clientdata.pytype = SwigPyObject_TypeOnce();
-  } else if (SwigPyObject_TypeOnce()->tp_basicsize != cd->pytype->tp_basicsize) {
+    SwigPyObject_clientdata.pytype = swigpyobject;
+  } else if (swigpyobject->tp_basicsize != cd->pytype->tp_basicsize) {
     PyErr_SetString(PyExc_RuntimeError, "Import error: attempted to load two incompatible swig-generated modules.");
 # if PY_VERSION_HEX >= 0x03000000
     return NULL;
diff --git a/src/mod/languages/mod_v8/mod_v8.cpp b/src/mod/languages/mod_v8/mod_v8.cpp
index 49763e05c03..6c2133c0c4a 100644
--- a/src/mod/languages/mod_v8/mod_v8.cpp
+++ b/src/mod/languages/mod_v8/mod_v8.cpp
@@ -201,7 +201,6 @@ static switch_status_t v8_mod_load_file(const char *filename)
 	function_handle = (switch_loadable_module_function_table_t *)switch_dso_data_sym(dso, "v8_mod_init", &derr);
 
 	if (!function_handle || derr) {
-		status = SWITCH_STATUS_FALSE;
 		err = derr;
 		goto err;
 	}
@@ -1458,7 +1457,7 @@
 	} else if (!strcasecmp(as, "xml")) {
 		switch_snprintf(tmp_str, sizeof(tmp_str), "%u", count);
-		switch_xml_set_attr(switch_xml_set_flag(xml, SWITCH_XML_DUP), strdup("row_count"), strdup(tmp_str));
+		switch_xml_set_attr_d(xml, "row_count", tmp_str);
 
 		output_text = switch_xml_toxml(xml, SWITCH_FALSE);
diff --git a/src/mod/languages/mod_v8/src/fscoredb.cpp b/src/mod/languages/mod_v8/src/fscoredb.cpp
index 5af1bd5be88..321677b642c 100644
--- a/src/mod/languages/mod_v8/src/fscoredb.cpp
+++ b/src/mod/languages/mod_v8/src/fscoredb.cpp
@@ -301,7 +301,6 @@ JS_COREDB_FUNCTION_IMPL(Prepare)
 JS_COREDB_FUNCTION_IMPL(BindText)
 {
 	HandleScope handle_scope(info.GetIsolate());
-	bool status;
 	int32_t param_index = -1;
 	string param_value;
@@ -324,7 +323,6 @@
 	}
 
 	/* convert args */
-	status = !info[0].IsEmpty() && info[0]->IsInt32() ? true : false;
 	param_index = info[0]->Int32Value();
 	String::Utf8Value str(info[1]);
 	param_value = js_safe_str(*str);
@@ -347,7 +345,6 @@
 JS_COREDB_FUNCTION_IMPL(BindInt)
 {
 	HandleScope handle_scope(info.GetIsolate());
-	bool status;
 	int32_t param_index = -1;
 	int32_t param_value = -1;
@@ -370,10 +367,7 @@
 	}
 
 	/* convert args */
-	status = !info[0].IsEmpty() && info[0]->IsInt32() ? true : false;
 	param_index = info[0]->Int32Value();
-
-	status = !info[1].IsEmpty() && info[1]->IsInt32() ? true : false;
 	param_value = info[1]->Int32Value();
 
 	if (param_index < 1) {
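Two small mod_v8 cleanups land above: the CoreDB bindings drop status stores that nothing ever read, and process_status_function stops hand-rolling the duplicate-and-flag dance when attaching an XML attribute. switch_xml_set_attr_d is the stock FreeSWITCH convenience that duplicates both the attribute name and value into memory the XML tree owns, so a stack buffer like tmp_str can be passed directly. Roughly, as a hypothetical standalone use (error handling elided):

#include <switch.h>

/* Build <result row_count="42"/> without manual strdup() bookkeeping. */
static char *row_count_xml(unsigned int count)
{
    char tmp_str[16];
    switch_xml_t xml = switch_xml_new("result");
    char *out;

    switch_snprintf(tmp_str, sizeof(tmp_str), "%u", count);
    switch_xml_set_attr_d(xml, "row_count", tmp_str);   /* copies name and value */

    out = switch_xml_toxml(xml, SWITCH_FALSE);          /* caller frees the string */
    switch_xml_free(xml);
    return out;
}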
diff --git a/src/mod/languages/mod_v8/src/jsmain.cpp b/src/mod/languages/mod_v8/src/jsmain.cpp
index d473d9f0850..2a02c8737e1 100644
--- a/src/mod/languages/mod_v8/src/jsmain.cpp
+++ b/src/mod/languages/mod_v8/src/jsmain.cpp
@@ -443,7 +443,7 @@ Isolate *JSMain::GetIsolate()
 #if defined(V8_MAJOR_VERSION) && V8_MAJOR_VERSION >=5
 void JSMain::Initialize(v8::Platform **platform)
 {
-	bool res = V8::InitializeICUDefaultLocation(SWITCH_GLOBAL_dirs.mod_dir);
+	V8::InitializeICUDefaultLocation(SWITCH_GLOBAL_dirs.mod_dir);
 	V8::InitializeExternalStartupData(SWITCH_GLOBAL_dirs.mod_dir);
 
 	*platform = v8::platform::CreateDefaultPlatform();
diff --git a/src/mod/say/mod_say_hr/mod_say_hr.c b/src/mod/say/mod_say_hr/mod_say_hr.c
index 5f5b9a99a44..f996c7424d1 100644
--- a/src/mod/say/mod_say_hr/mod_say_hr.c
+++ b/src/mod/say/mod_say_hr/mod_say_hr.c
@@ -752,7 +752,6 @@ static switch_status_t hr_say_time(switch_core_session_t *session, char *tosay,
 	if (say_date) {
 		say_year = say_month = say_day = say_dow = 1;
-		say_today = say_yesterday = 0;
 	}
 
 	if (say_day) {
diff --git a/src/mod/say/mod_say_hu/mod_say_hu.c b/src/mod/say/mod_say_hu/mod_say_hu.c
index c35be1bc599..839fe47421a 100644
--- a/src/mod/say/mod_say_hu/mod_say_hu.c
+++ b/src/mod/say/mod_say_hu/mod_say_hu.c
@@ -326,7 +326,6 @@ static switch_status_t hu_say_time(switch_core_session_t *session, char *tosay,
 	if (say_date) {
 		say_year = say_month = say_day = say_dow = 1;
-		say_today = say_yesterday = 0;
 	}
 
 	if (say_year) {
diff --git a/src/mod/say/mod_say_nl/mod_say_nl.c b/src/mod/say/mod_say_nl/mod_say_nl.c
index 770e7cbf215..78f398bc7c4 100644
--- a/src/mod/say/mod_say_nl/mod_say_nl.c
+++ b/src/mod/say/mod_say_nl/mod_say_nl.c
@@ -188,7 +188,7 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 	if (say_args->type == SST_TIME_MEASUREMENT) {
 		int64_t hours = 0;
 		int64_t minutes = 0;
-		int64_t seconds = 0;
+		int64_t seconds;
 		int64_t r = 0;
 
 		if (strchr(tosay, ':')) {
@@ -197,7 +197,6 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 			if ((p = strrchr(tme, ':'))) {
 				*p++ = '\0';
-				seconds = atoi(p);
 				if ((p = strchr(tme, ':'))) {
 					*p++ = '\0';
 					minutes = atoi(p);
@@ -215,8 +214,6 @@ static switch_status_t nl_say_time(switch_core_session_t *session, char *tosay,
 		if (seconds >= 60) {
 			minutes = seconds / 60;
-			r = seconds % 60;
-			seconds = r;
 		}
 
 		if (minutes >= 60) {
diff --git a/src/mod/say/mod_say_pl/mod_say_pl.c b/src/mod/say/mod_say_pl/mod_say_pl.c
index 68546f9410e..02fb4e15a36 100644
--- a/src/mod/say/mod_say_pl/mod_say_pl.c
+++ b/src/mod/say/mod_say_pl/mod_say_pl.c
@@ -385,7 +385,6 @@ static switch_status_t pl_say_time(switch_say_file_handle_t *sh, char *tosay, sw
 	if (say_date) {
 		say_year = say_month = say_day = say_dow = 1;
-		say_today = say_yesterday = 0;
 	}
 
 	if (say_day) {
diff --git a/src/mod/say/mod_say_ru/mod_say_ru.c b/src/mod/say/mod_say_ru/mod_say_ru.c
index b4759442721..74c6c5c216a 100644
--- a/src/mod/say/mod_say_ru/mod_say_ru.c
+++ b/src/mod/say/mod_say_ru/mod_say_ru.c
@@ -381,7 +381,6 @@ static switch_status_t ru_say_time(switch_say_file_handle_t *sh, char *tosay, sw
 		if (seconds >= 60) {
 			minutes = seconds / 60;
-			r = seconds % 60;
 		}
 
 		if (minutes >= 60) {
@@ -503,7 +502,6 @@ static switch_status_t ru_say_time(switch_say_file_handle_t *sh, char *tosay, sw
 	}
 	if (say_date) {
 		say_year = say_month = say_day = say_dow = 1;
-		say_today = say_yesterday = 0;
 	}
 	if (say_day) {
 		switch_snprintf(buf, sizeof(buf), "%u", (unsigned) tm.tm_mday);
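The say-module hunks above all remove the same kind of line: stores whose value no later statement reads (say_today and say_yesterday zeroed and never consulted again, r and seconds computed and immediately abandoned). These read as dead-store cleanups of the kind a static analyzer reports; behavior is unchanged because nothing observed the removed writes. A contrived illustration of the pattern being eliminated:

#include <cstdio>

int main()
{
    int seconds = 75;
    int minutes = 0;

    if (seconds >= 60) {
        minutes = seconds / 60;
        /* dead stores, the analyzer's complaint:
         *     r = seconds % 60;
         *     seconds = r;
         * no statement below ever reads r or seconds again */
    }

    printf("%d minute(s)\n", minutes);
    return 0;
}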
tm.tm_mday); diff --git a/src/mod/say/mod_say_sv/mod_say_sv.c b/src/mod/say/mod_say_sv/mod_say_sv.c index b3fd2dcfebd..7a2311b013e 100644 --- a/src/mod/say/mod_say_sv/mod_say_sv.c +++ b/src/mod/say/mod_say_sv/mod_say_sv.c @@ -416,7 +416,6 @@ static switch_status_t sv_say_time(switch_say_file_handle_t *sh, char *tosay, sw if (say_date) { say_year = say_month = say_day = say_dow = 1; - say_today = say_yesterday = 0; } if (say_month) { diff --git a/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c b/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c index d9260b687eb..4e3bd81f784 100644 --- a/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c +++ b/src/mod/xml_int/mod_xml_cdr/mod_xml_cdr.c @@ -256,7 +256,6 @@ static switch_status_t my_on_reporting(switch_core_session_t *session) wrote = write(fd, xml_text, (unsigned) strlen(xml_text)); wrote++; close(fd); - fd = -1; } else { char ebuf[512] = { 0 }; switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error writing [%s][%s]\n", @@ -430,7 +429,6 @@ static switch_status_t my_on_reporting(switch_core_session_t *session) wrote = write(fd, xml_text, (unsigned) strlen(xml_text)); wrote++; close(fd); - fd = -1; } else { char ebuf[512] = { 0 }; switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Error![%s]\n", diff --git a/src/mod/xml_int/mod_xml_rpc/ws.c b/src/mod/xml_int/mod_xml_rpc/ws.c index a5ad9c7a3bf..b660b644f66 100644 --- a/src/mod/xml_int/mod_xml_rpc/ws.c +++ b/src/mod/xml_int/mod_xml_rpc/ws.c @@ -19,7 +19,7 @@ void deinit_ssl(void) } #else -static unsigned long pthreads_thread_id(void); +static void pthreads_thread_id(CRYPTO_THREADID *id); static void pthreads_locking_callback(int mode, int type, const char *file, int line); static pthread_mutex_t *lock_cs; @@ -39,7 +39,7 @@ static void thread_setup(void) pthread_mutex_init(&(lock_cs[i]), NULL); } - CRYPTO_set_id_callback(pthreads_thread_id); + CRYPTO_THREADID_set_callback(pthreads_thread_id); CRYPTO_set_locking_callback(pthreads_locking_callback); } @@ -70,9 +70,9 @@ static void pthreads_locking_callback(int mode, int type, const char *file, int -static unsigned long pthreads_thread_id(void) +static void pthreads_thread_id(CRYPTO_THREADID *id) { - return (unsigned long) pthread_self(); + CRYPTO_THREADID_set_numeric(id, (unsigned long)pthread_self()); } diff --git a/src/switch_core.c b/src/switch_core.c index d3150835c01..e7d478d2b8e 100644 --- a/src/switch_core.c +++ b/src/switch_core.c @@ -2383,6 +2383,8 @@ static void switch_load_core_config(const char *file) } else { switch_clear_flag((&runtime), SCF_EVENT_CHANNEL_LOG_UNDELIVERABLE_JSON); } + } else if (!strcasecmp(var, "max-audio-channels") && !zstr(val)) { + switch_core_max_audio_channels(atoi(val)); } } } @@ -2638,6 +2640,15 @@ SWITCH_DECLARE(int32_t) switch_core_sessions_peak_fivemin(void) return runtime.sessions_peak_fivemin; } +SWITCH_DECLARE(uint32_t) switch_core_max_audio_channels(uint32_t limit) +{ + if (limit) { + runtime.max_audio_channels = limit; + } + + return runtime.max_audio_channels; +} + SWITCH_DECLARE(int32_t) switch_core_session_ctl(switch_session_ctl_t cmd, void *val) { int *intval = (int *) val; diff --git a/src/switch_core_cert.c b/src/switch_core_cert.c index 0cc992b9ed2..393baa55a96 100644 --- a/src/switch_core_cert.c +++ b/src/switch_core_cert.c @@ -46,9 +46,9 @@ static inline void switch_ssl_ssl_lock_callback(int mode, int type, char *file, } } -static inline unsigned long switch_ssl_ssl_thread_id(void) +static inline void switch_ssl_ssl_thread_id(CRYPTO_THREADID *id) { - return (unsigned long) switch_thread_self(); + CRYPTO_THREADID_set_numeric(id, (unsigned long)switch_thread_self()); } SWITCH_DECLARE(void) switch_ssl_init_ssl_locks(void) @@ -69,7 +69,7 @@ SWITCH_DECLARE(void) switch_ssl_init_ssl_locks(void) switch_assert(ssl_mutexes[i] != NULL); } - CRYPTO_set_id_callback(switch_ssl_ssl_thread_id); + CRYPTO_THREADID_set_callback(switch_ssl_ssl_thread_id); CRYPTO_set_locking_callback((void (*)(int, int, const char*, int))switch_ssl_ssl_lock_callback); }
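Note: the ws.c and switch_core_cert.c hunks above both replace OpenSSL's deprecated unsigned-long thread-ID callback with the CRYPTO_THREADID API. A minimal standalone sketch of the same migration, assuming OpenSSL 1.0.x and pthreads (on OpenSSL 1.1.0 and later these callbacks are no-ops, since locking and thread IDs are handled internally); the example_* names are illustrative, not part of the patch:

#include <openssl/crypto.h>
#include <pthread.h>

/* New-style callback: fill in a CRYPTO_THREADID instead of returning an
 * unsigned long, so platforms where pthread_t is not an integer keep working. */
static void example_threadid_cb(CRYPTO_THREADID *id)
{
	CRYPTO_THREADID_set_numeric(id, (unsigned long) pthread_self());
}

static void example_ssl_thread_setup(void)
{
	/* Replaces the deprecated CRYPTO_set_id_callback(). */
	CRYPTO_THREADID_set_callback(example_threadid_cb);
}

Where pthread_t is a pointer type, CRYPTO_THREADID_set_pointer() would be the safer choice; the patch keeps the numeric form, matching the previous behavior.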
diff --git a/src/switch_core_file.c b/src/switch_core_file.c index 717233ce55f..abeda29430d 100644 --- a/src/switch_core_file.c +++ b/src/switch_core_file.c @@ -76,6 +76,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file, char *fp = NULL; int to = 0; int force_channels = 0; + uint32_t core_channel_limit; if (switch_test_flag(fh, SWITCH_FILE_OPEN)) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_ERROR, "Handle already open\n"); @@ -361,6 +362,19 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_file_open(const char *file, goto fail; } + if (fh->channels > 2) { + /* just show a warning for more than 2 channels, no matter if we allow them or not */ + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "File [%s] has more than 2 channels: [%u]\n", file_path, fh->channels); + } + + core_channel_limit = switch_core_max_audio_channels(0); + if (core_channel_limit && fh->channels > core_channel_limit) { + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_WARNING, "File [%s] has more channels (%u) than limit (%u). Closing.\n", file_path, fh->channels, core_channel_limit); + fh->file_interface->file_close(fh); + UNPROTECT_INTERFACE(fh->file_interface); + switch_goto_status(SWITCH_STATUS_FALSE, fail); + } + if (!force_channels && !fh->real_channels) { fh->real_channels = fh->channels; @@ -482,7 +496,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh, if (status != SWITCH_STATUS_SUCCESS || !rlen) { switch_set_flag_locked(fh, SWITCH_FILE_BUFFER_DONE); } else { - fh->samples_in += rlen; if (fh->real_channels != fh->channels && !switch_test_flag(fh, SWITCH_FILE_NOMUX)) { switch_mux_channels((int16_t *) fh->pre_buffer_data, rlen, fh->real_channels, fh->channels); } @@ -492,6 +505,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_read(switch_file_handle_t *fh, } rlen = switch_buffer_read(fh->pre_buffer, data, asis ? *len : *len * 2 * fh->channels); + fh->samples_in += rlen; *len = asis ? rlen : rlen / 2 / fh->channels; if (*len == 0) { @@ -589,15 +603,17 @@ SWITCH_DECLARE(switch_status_t) switch_core_file_write(switch_file_handle_t *fh, if (fh->real_channels != fh->channels && !switch_test_flag(fh, SWITCH_FILE_NOMUX)) { - int need = *len * 2 * fh->real_channels; + int need = *len * 2 * (fh->real_channels > fh->channels ?
fh->real_channels : fh->channels); if (need > fh->muxlen) { fh->muxbuf = realloc(fh->muxbuf, need); switch_assert(fh->muxbuf); fh->muxlen = need; - memcpy(fh->muxbuf, data, fh->muxlen); - data = fh->muxbuf; + } + if (fh->muxbuf) { + memcpy(fh->muxbuf, data, *len * 2); + data = fh->muxbuf; } switch_mux_channels((int16_t *) data, *len, fh->real_channels, fh->channels); diff --git a/src/switch_core_hash.c b/src/switch_core_hash.c index 1655aac7265..70102ce9572 100644 --- a/src/switch_core_hash.c +++ b/src/switch_core_hash.c @@ -57,20 +57,28 @@ SWITCH_DECLARE(switch_status_t) switch_core_hash_destroy(switch_hash_t **hash) SWITCH_DECLARE(switch_status_t) switch_core_hash_insert_auto_free(switch_hash_t *hash, const char *key, const void *data) { - int r = 0; + char *dkey = strdup(key); + + if (switch_hashtable_insert_destructor(hash, dkey, (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_FLAG_FREE_VALUE | HASHTABLE_DUP_CHECK, NULL)) { + return SWITCH_STATUS_SUCCESS; + } - r = switch_hashtable_insert_destructor(hash, strdup(key), (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_FLAG_FREE_VALUE | HASHTABLE_DUP_CHECK, NULL); + switch_safe_free(dkey); - return r ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE; + return SWITCH_STATUS_FALSE; } SWITCH_DECLARE(switch_status_t) switch_core_hash_insert_destructor(switch_hash_t *hash, const char *key, const void *data, hashtable_destructor_t destructor) { - int r = 0; + char *dkey = strdup(key); + + if (switch_hashtable_insert_destructor(hash, dkey, (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_DUP_CHECK, destructor)) { + return SWITCH_STATUS_SUCCESS; + } - r = switch_hashtable_insert_destructor(hash, strdup(key), (void *)data, HASHTABLE_FLAG_FREE_KEY | HASHTABLE_DUP_CHECK, destructor); + switch_safe_free(dkey); - return r ? SWITCH_STATUS_SUCCESS : SWITCH_STATUS_FALSE; + return SWITCH_STATUS_FALSE; } SWITCH_DECLARE(switch_status_t) switch_core_hash_insert_locked(switch_hash_t *hash, const char *key, const void *data, switch_mutex_t *mutex) diff --git a/src/switch_core_media.c b/src/switch_core_media.c index c3a29353ee2..65ea2f3a6bb 100644 --- a/src/switch_core_media.c +++ b/src/switch_core_media.c @@ -4263,7 +4263,7 @@ static switch_status_t check_ice(switch_media_handle_t *smh, switch_media_type_t cid = fields[1] ? 
atoi(fields[1]) - 1 : 0; - if (argc < 5 || engine->ice_in.cand_idx[cid] >= MAX_CAND - 1) { + if (argc < 6 || engine->ice_in.cand_idx[cid] >= MAX_CAND - 1) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(smh->session), SWITCH_LOG_WARNING, "Invalid data\n"); continue; } @@ -5977,7 +5977,9 @@ SWITCH_DECLARE(uint8_t) switch_core_media_negotiate_sdp(switch_core_session_t *s } } - t_engine->cur_payload_map = red_pmap; + if (red_pmap) { + t_engine->cur_payload_map = red_pmap; + } for (attr = m->m_attributes; attr; attr = attr->a_next) { if (!strcasecmp(attr->a_name, "rtcp") && attr->a_value) { @@ -9111,7 +9113,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_activate_rtp(switch_core_sessi !((val = switch_channel_get_variable(session->channel, "disable_rtp_auto_adjust")) && switch_true(val))) { flags[SWITCH_RTP_FLAG_AUTOADJ]++; } - timer_name = NULL; switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "PROXY TEXT RTP [%s] %s:%d->%s:%d codec: %u ms: %d\n", @@ -9124,12 +9125,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_activate_rtp(switch_core_sessi if (switch_rtp_ready(t_engine->rtp_session)) { switch_rtp_set_default_payload(t_engine->rtp_session, t_engine->cur_payload_map->pt); } - } else { - timer_name = smh->mparams->timer_name; - - if ((var = switch_channel_get_variable(session->channel, "rtp_timer_name"))) { - timer_name = (char *) var; - } } /******************************************************************************************/ @@ -9426,7 +9421,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_activate_rtp(switch_core_sessi !((val = switch_channel_get_variable(session->channel, "disable_rtp_auto_adjust")) && switch_true(val))) { flags[SWITCH_RTP_FLAG_AUTOADJ]++; } - timer_name = NULL; switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "PROXY VIDEO RTP [%s] %s:%d->%s:%d codec: %u ms: %d\n", @@ -9439,12 +9433,6 @@ SWITCH_DECLARE(switch_status_t) switch_core_media_activate_rtp(switch_core_sessi if (switch_rtp_ready(v_engine->rtp_session)) { switch_rtp_set_default_payload(v_engine->rtp_session, v_engine->cur_payload_map->pt); } - } else { - timer_name = smh->mparams->timer_name; - - if ((var = switch_channel_get_variable(session->channel, "rtp_timer_name"))) { - timer_name = (char *) var; - } } /******************************************************************************************/ @@ -10353,6 +10341,13 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess switch_core_session_get_partner(session, &orig_session); + if (orig_session && !switch_channel_test_flag(session->channel, CF_ANSWERED)) { + switch_core_media_set_smode(smh->session, SWITCH_MEDIA_TYPE_AUDIO, + switch_core_session_remote_media_flow(orig_session, SWITCH_MEDIA_TYPE_AUDIO), sdp_type); + switch_core_media_set_smode(smh->session, SWITCH_MEDIA_TYPE_VIDEO, + switch_core_session_remote_media_flow(orig_session, SWITCH_MEDIA_TYPE_VIDEO), sdp_type); + } + for (i = 0; i < smh->mparams->num_codecs; i++) { const switch_codec_implementation_t *imp = smh->codecs[i]; switch_payload_t orig_pt = 0; @@ -10997,7 +10992,6 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess if (v_engine->codec_negotiated) { - const char *of; payload_map_t *pmap; //if (!strcasecmp(v_engine->cur_payload_map->rm_encoding, "VP8")) { @@ -11019,12 +11013,6 @@ SWITCH_DECLARE(void) switch_core_media_gen_local_sdp(switch_core_session_t *sess pass_fmtp = NULL; - if (switch_channel_get_partner_uuid(session->channel)) { - if ((of = 
switch_channel_get_variable_partner(session->channel, "rtp_video_fmtp"))) { - pass_fmtp = of; - } - } - if (ov_fmtp) { pass_fmtp = ov_fmtp; } else { @@ -13358,15 +13346,15 @@ static void add_audio_codec(sdp_rtpmap_t *map, const switch_codec_implementation map_bit_rate = switch_known_bitrate((switch_payload_t)map->rm_pt); if (!ptime && !strcasecmp(map->rm_encoding, "g723")) { - ptime = codec_ms = 30; + codec_ms = 30; } if (zstr(map->rm_fmtp)) { if (!strcasecmp(map->rm_encoding, "ilbc")) { - ptime = codec_ms = 30; + codec_ms = 30; map_bit_rate = 13330; } else if (!strcasecmp(map->rm_encoding, "isac")) { - ptime = codec_ms = 30; + codec_ms = 30; map_bit_rate = 32000; } } else { @@ -13993,7 +13981,6 @@ SWITCH_DECLARE (void) switch_core_media_recover_session(switch_core_session_t *s v_engine->cur_payload_map->rm_fmtp = (char *) switch_channel_get_variable(session->channel, "rtp_use_video_codec_fmtp"); v_engine->codec_negotiated = 1; - ip = switch_channel_get_variable(session->channel, SWITCH_LOCAL_VIDEO_IP_VARIABLE); port = switch_channel_get_variable(session->channel, SWITCH_LOCAL_VIDEO_PORT_VARIABLE); r_ip = switch_channel_get_variable(session->channel, SWITCH_REMOTE_VIDEO_IP_VARIABLE); r_port = switch_channel_get_variable(session->channel, SWITCH_REMOTE_VIDEO_PORT_VARIABLE); @@ -14582,7 +14569,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_write_video_frame(switch_cor v_engine = &smh->engines[SWITCH_MEDIA_TYPE_VIDEO]; if (v_engine->thread_write_lock && v_engine->thread_write_lock != switch_thread_self()) { - return SWITCH_STATUS_SUCCESS; + switch_goto_status(SWITCH_STATUS_SUCCESS, done); } if (!smh->video_init && smh->mparams->video_key_first && (now - smh->video_last_key_time) > smh->mparams->video_key_first) { @@ -14618,7 +14605,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_write_video_frame(switch_cor if (vid_params.width && vid_params.height && ((vid_params.width != img->d_w) || (vid_params.height != img->d_h))) { switch_img_letterbox(img, &dup_img, vid_params.width, vid_params.height, "#000000f"); if (!(img = dup_img)) { - return SWITCH_STATUS_INUSE; + switch_goto_status(SWITCH_STATUS_INUSE, done); } } } @@ -15529,7 +15516,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_write_text_frame(switch_core if (!switch_buffer_inuse(t_engine->tf->write_buffer)) { t_engine->tf->write_empty++; - return SWITCH_STATUS_BREAK; + switch_goto_status(SWITCH_STATUS_BREAK, done); } frame = &t_engine->tf->text_write_frame; @@ -15901,7 +15888,7 @@ SWITCH_DECLARE(switch_status_t) switch_core_session_write_frame(switch_core_sess goto error; } if (ptime_mismatch && status != SWITCH_STATUS_GENERR) { - status = perform_write(session, frame, flags, stream_id); + perform_write(session, frame, flags, stream_id); status = SWITCH_STATUS_SUCCESS; goto error; } diff --git a/src/switch_core_memory.c b/src/switch_core_memory.c index eb35f1b2ca5..2cf007fad2c 100644 --- a/src/switch_core_memory.c +++ b/src/switch_core_memory.c @@ -463,7 +463,18 @@ SWITCH_DECLARE(switch_status_t) switch_core_perform_new_memory_pool(switch_memor SWITCH_DECLARE(switch_status_t) switch_core_perform_destroy_memory_pool(switch_memory_pool_t **pool, const char *file, const char *func, int line) { + char *tmp; + const char *tag; switch_assert(pool != NULL); + + /* The pool tag records who created the pool. Before destroying it, we append who is destroying it. */ + if (*pool) { + tag = apr_pool_tag(*pool, NULL); + tmp = switch_core_sprintf(*pool, "%s,%s:%d", (tag ? tag : ""), file, line); + apr_pool_tag(*pool, tmp); + } #ifdef DEBUG_ALLOC2 switch_log_printf(SWITCH_CHANNEL_ID_LOG, file, func, line, NULL, SWITCH_LOG_CONSOLE, "%p Free Pool %s\n", (void *) *pool, apr_pool_tag(*pool, NULL));
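Note: the switch_core_perform_destroy_memory_pool() change above appends the destroy call site to the pool's creation tag, so a report on a misused pool names both ends of its lifetime (in this tree, apr_pool_tag() called with NULL evidently returns the current tag, as the assignment above shows; stock APR's apr_pool_tag() returns void). A self-contained sketch of the same tagging idea, with a hypothetical example_pool_t standing in for the real pool type:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
	char tag[256]; /* "file:line" of creation, later ",file:line" of destroy */
} example_pool_t;

static example_pool_t *example_pool_create(const char *file, int line)
{
	example_pool_t *p = calloc(1, sizeof(*p));
	if (p) {
		snprintf(p->tag, sizeof(p->tag), "%s:%d", file, line);
	}
	return p;
}

static void example_pool_destroy(example_pool_t *p, const char *file, int line)
{
	size_t used;
	if (!p) return;
	used = strlen(p->tag);
	/* append the destroy site to the creation tag, as the patch does */
	snprintf(p->tag + used, sizeof(p->tag) - used, ",%s:%d", file, line);
	printf("destroying pool tagged [%s]\n", p->tag);
	free(p);
}

/* usage:
   example_pool_t *p = example_pool_create(__FILE__, __LINE__);
   example_pool_destroy(p, __FILE__, __LINE__);  -> e.g. [demo.c:10,demo.c:11] */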
diff --git a/src/switch_core_sqldb.c b/src/switch_core_sqldb.c index 7f5c2aff525..98c50c0d8d2 100644 --- a/src/switch_core_sqldb.c +++ b/src/switch_core_sqldb.c @@ -1616,6 +1616,7 @@ struct switch_sql_queue_manager { uint32_t numq; char *dsn; switch_thread_t *thread; + int thread_initiated; int thread_running; switch_thread_cond_t *cond; switch_mutex_t *cond_mutex; @@ -1899,8 +1900,15 @@ SWITCH_DECLARE(switch_status_t) switch_sql_queue_manager_start(switch_sql_queue_ switch_threadattr_create(&thd_attr, qm->pool); switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE); switch_threadattr_priority_set(thd_attr, SWITCH_PRI_NORMAL); - switch_thread_create(&qm->thread, thd_attr, switch_user_sql_thread, qm, qm->pool); - return SWITCH_STATUS_SUCCESS; + if (switch_thread_create(&qm->thread, thd_attr, switch_user_sql_thread, qm, qm->pool) == SWITCH_STATUS_SUCCESS) { + while (!qm->thread_initiated) { + switch_cond_next(); + } + + if (qm->event_db) { + return SWITCH_STATUS_SUCCESS; + } + } } return SWITCH_STATUS_FALSE; @@ -2274,11 +2282,10 @@ static void *SWITCH_THREAD_FUNC switch_user_sql_thread(switch_thread_t *thread, if (!qm->event_db) { switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, "%s Error getting db handle\n", qm->name); + qm->thread_initiated = 1; return NULL; } - qm->thread_running = 1; - switch_mutex_lock(qm->cond_mutex); switch (qm->event_db->type) { @@ -2296,6 +2303,8 @@ static void *SWITCH_THREAD_FUNC switch_user_sql_thread(switch_thread_t *thread, break; } + qm->thread_initiated = 1; + qm->thread_running = 1; while (qm->thread_running == 1) { uint32_t i, lc; diff --git a/src/switch_core_state_machine.c b/src/switch_core_state_machine.c index 01f200ddb45..57ba3bfeb22 100644 --- a/src/switch_core_state_machine.c +++ b/src/switch_core_state_machine.c @@ -803,7 +803,7 @@ SWITCH_DECLARE(void) switch_core_session_hangup_state(switch_core_session_t *ses int do_extra_handlers = 1; int silly = 0; int index = 0; - switch_channel_state_t state = switch_channel_get_state(session->channel), midstate = state; + switch_channel_state_t state = switch_channel_get_state(session->channel), midstate; const switch_endpoint_interface_t *endpoint_interface; const switch_state_handler_table_t *driver_state_handler = NULL; const switch_state_handler_table_t *application_state_handler = NULL; @@ -868,7 +868,7 @@ SWITCH_DECLARE(void) switch_core_session_reporting_state(switch_core_session_t *session) { - switch_channel_state_t state = switch_channel_get_state(session->channel), midstate = state; + switch_channel_state_t state = switch_channel_get_state(session->channel), midstate; const switch_endpoint_interface_t *endpoint_interface; const switch_state_handler_table_t *driver_state_handler = NULL; const switch_state_handler_table_t *application_state_handler = NULL; diff --git a/src/switch_core_video.c b/src/switch_core_video.c index 28b62293152..76232b40f04 100644 --- a/src/switch_core_video.c +++ b/src/switch_core_video.c @@ -1614,7 +1614,7 @@ static inline void switch_img_get_rgb_pixel(switch_image_t *img, switch_rgb_colo if (x < 0 || y < 0 || x >= img->d_w || y >= img->d_h) return; if (img->fmt == SWITCH_IMG_FMT_I420) { - switch_yuv_color_t yuv; + switch_yuv_color_t yuv = {0};
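Note: the {0} initializer added above matters because the callee can return early, mirroring the bounds guard visible at the top of switch_img_get_rgb_pixel(), which would leave the struct's fields unwritten. A tiny sketch of the pattern, using hypothetical example_* names rather than the real FreeSWITCH types:

#include <stdint.h>
#include <assert.h>

typedef struct { uint8_t y, u, v; } example_yuv_t;

/* filler that may bail out before touching *out */
static void example_get_pixel(example_yuv_t *out, int x, int y)
{
	if (x < 0 || y < 0) {
		return; /* *out keeps whatever the caller put there */
	}
	out->y = 16; out->u = 128; out->v = 128;
}

int main(void)
{
	example_yuv_t yuv = {0}; /* without this, the reads below would be
	                          * indeterminate when the filler bails out */
	example_get_pixel(&yuv, -1, -1);
	assert(yuv.y == 0 && yuv.u == 0 && yuv.v == 0);
	return 0;
}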
switch_img_get_yuv_pixel(img, &yuv, x, y); switch_color_yuv2rgb(&yuv, rgb); @@ -2134,12 +2134,15 @@ SWITCH_DECLARE(switch_image_t *) switch_img_write_text_img(int w, int h, switch_ char *argv[6] = { 0 }; switch_rgb_color_t bgcolor = { 0 }; int pre_width = 0, width = 0, font_size = 0, height = 0; - int len = 0; char *duptxt = strdup(text); switch_img_txt_handle_t *txthandle = NULL; switch_image_t *txtimg = NULL; int x = 0, y = 0; + if (!duptxt) { + return NULL; + } + if (strchr(text, ':')) { argc = switch_split(duptxt, ':', argv); @@ -2178,11 +2181,6 @@ SWITCH_DECLARE(switch_image_t *) switch_img_write_text_img(int w, int h, switch_ while (*txt == ' ') txt++; while (end_of(txt) == ' ') end_of(txt) = '\0'; - len = strlen(txt); - - if (len < 5) len = 5; - - switch_img_txt_handle_create(&txthandle, font_face, fg, bg, font_size, 0, NULL); pre_width = switch_img_txt_handle_render(txthandle, diff --git a/src/switch_cpp.cpp b/src/switch_cpp.cpp index d2a7a531f85..f51d2c1fb87 100644 --- a/src/switch_cpp.cpp +++ b/src/switch_cpp.cpp @@ -83,7 +83,7 @@ SWITCH_DECLARE(int) EventConsumer::bind(const char *event_name, const char *subc if (node_index <= SWITCH_EVENT_ALL && switch_event_bind_removable(__FILE__, event_id, subclass_name, event_handler, this, &enodes[node_index]) == SWITCH_STATUS_SUCCESS) { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "bound to %s %s\n", event_name, switch_str_nil(subclass_name)); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "bound to %s %s\n", event_name, switch_str_nil(subclass_name)); node_index++; return 1; } @@ -901,8 +901,12 @@ SWITCH_DECLARE(char *) CoreSession::getDigits(int maxdigits, terminators, &terminator, (uint32_t) timeout, (uint32_t)interdigit, (uint32_t)abstimeout); - - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "getDigits dtmf_buf: %s\n", dtmf_buf); + + /* Only log DTMF buffer if sensitive_dtmf channel variable not set to true */ + if (!(switch_channel_var_true(switch_core_session_get_channel(session), SWITCH_SENSITIVE_DTMF_VARIABLE))) { + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "getDigits dtmf_buf: %s\n", dtmf_buf); + } + end_allow_threads(); return dtmf_buf; } diff --git a/src/switch_event.c b/src/switch_event.c index 8b8d57250ca..efe43e603ae 100644 --- a/src/switch_event.c +++ b/src/switch_event.c @@ -503,7 +503,13 @@ SWITCH_DECLARE(switch_status_t) switch_event_reserve_subclass_detailed(const cha subclass->owner = DUP(owner); subclass->name = DUP(subclass_name); - switch_core_hash_insert(CUSTOM_HASH, subclass->name, subclass); + status = switch_core_hash_insert(CUSTOM_HASH, subclass->name, subclass); + + if (status != SWITCH_STATUS_SUCCESS) { + free(subclass->owner); + free(subclass->name); + free(subclass); + } end: @@ -1114,7 +1120,11 @@ static switch_status_t switch_event_base_add_header(switch_event_t *event, switc redraw: len = 0; for(j = 0; j < header->idx; j++) { - len += strlen(header->array[j]) + 2; + len += 2; + if (!header->array[j]) { + continue; + } + len += strlen(header->array[j]); } if (len) { @@ -1135,6 +1145,9 @@ static switch_status_t switch_event_base_add_header(switch_event_t *event, switc memcpy(hv, "|:", 2); hv += 2; } + if (!header->array[j]) { + continue; + } memcpy(hv, header->array[j], strlen(header->array[j])); hv += strlen(header->array[j]); } @@ -2131,7 +2144,7 @@ SWITCH_DECLARE(switch_status_t) switch_event_unbind_callback(switch_event_callba EVENT_NODES[n->event_id] = n->next; } - switch_log_printf(SWITCH_CHANNEL_LOG, 
SWITCH_LOG_NOTICE, "Event Binding deleted for %s:%s\n", n->id, switch_event_name(n->event_id)); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Event Binding deleted for %s:%s\n", n->id, switch_event_name(n->event_id)); FREE(n->subclass_name); FREE(n->id); FREE(n); @@ -2171,7 +2184,7 @@ SWITCH_DECLARE(switch_status_t) switch_event_unbind(switch_event_node_t **node) } else { EVENT_NODES[n->event_id] = n->next; } - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_NOTICE, "Event Binding deleted for %s:%s\n", n->id, switch_event_name(n->event_id)); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "Event Binding deleted for %s:%s\n", n->id, switch_event_name(n->event_id)); FREE(n->subclass_name); FREE(n->id); FREE(n); @@ -2495,7 +2508,6 @@ SWITCH_DECLARE(char *) switch_event_expand_headers_check(switch_event_t *event, switch_safe_free(expanded_sub_val); sub_val = NULL; vname = NULL; - vtype = 0; br = 0; } diff --git a/src/switch_ivr.c b/src/switch_ivr.c index a5b7d341dbf..c1d232e1b7b 100644 --- a/src/switch_ivr.c +++ b/src/switch_ivr.c @@ -989,9 +989,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_park(switch_core_session_t *session, timeout_cause = switch_channel_str2cause(cause_str + 1); } - if ((timeout = atoi(to)) < 0) { - timeout = 0; - } else { + if ((timeout = atoi(to)) >= 0) { expires = switch_epoch_time_now(NULL) + timeout; } switch_channel_set_variable(channel, "park_timeout", NULL); @@ -1299,7 +1297,10 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_collect_digits_callback(switch_core_s } if (switch_core_session_dequeue_event(session, &event, SWITCH_FALSE) == SWITCH_STATUS_SUCCESS) { - switch_status_t ostatus = args->input_callback(session, event, SWITCH_INPUT_TYPE_EVENT, args->buf, args->buflen); + switch_status_t ostatus = SWITCH_STATUS_FALSE; + if (args->input_callback) { + ostatus = args->input_callback(session, event, SWITCH_INPUT_TYPE_EVENT, args->buf, args->buflen); + } if (ostatus != SWITCH_STATUS_SUCCESS) { status = ostatus; } @@ -1414,7 +1415,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_collect_digits_count(switch_core_sess if (digit_timeout && first_timeout) { eff_timeout = first_timeout; } else if (digit_timeout && !first_timeout) { - first_timeout = eff_timeout = digit_timeout; + eff_timeout = digit_timeout; } else if (first_timeout) { digit_timeout = eff_timeout = first_timeout; } @@ -2092,6 +2093,26 @@ SWITCH_DECLARE(void) switch_ivr_bg_media(const char *uuid, switch_media_flag_t f } +SWITCH_DECLARE(void) switch_ivr_check_hold(switch_core_session_t *session) +{ + switch_channel_t *channel = switch_core_session_get_channel(session); + switch_media_flow_t flow; + + if (switch_channel_test_flag(channel, CF_ANSWERED) && + (flow = switch_core_session_media_flow(session, SWITCH_MEDIA_TYPE_AUDIO)) != SWITCH_MEDIA_FLOW_SENDRECV) { + switch_core_session_message_t msg = { 0 }; + + msg.message_id = SWITCH_MESSAGE_INDICATE_MEDIA_RENEG; + msg.from = __FILE__; + + switch_core_media_set_smode(session, SWITCH_MEDIA_TYPE_AUDIO, SWITCH_MEDIA_FLOW_SENDRECV, SDP_TYPE_REQUEST); + switch_core_session_receive_message(session, &msg); + } + + if (switch_channel_test_flag(channel, CF_HOLD)) { + switch_ivr_unhold(session); + } +} SWITCH_DECLARE(switch_status_t) switch_ivr_session_transfer(switch_core_session_t *session, const char *extension, const char *dialplan, const char *context) @@ -2107,7 +2128,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_session_transfer(switch_core_session_ const char *forwardvar = switch_channel_get_variable(channel, forwardvar_name); int 
forwardval = 70; const char *use_dialplan = dialplan, *use_context = context; - switch_media_flow_t flow; if (zstr(forwardvar)) { forwardvar_name = SWITCH_MAX_FORWARDS_VARIABLE; /* fall back to max_forwards variable for setting maximum */ @@ -2121,16 +2141,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_session_transfer(switch_core_session_ return SWITCH_STATUS_FALSE; } - if (switch_channel_test_flag(channel, CF_ANSWERED) && - (flow = switch_core_session_media_flow(session, SWITCH_MEDIA_TYPE_AUDIO)) != SWITCH_MEDIA_FLOW_SENDRECV) { - switch_core_session_message_t msg = { 0 }; - - msg.message_id = SWITCH_MESSAGE_INDICATE_MEDIA_RENEG; - msg.from = __FILE__; - - switch_core_media_set_smode(session, SWITCH_MEDIA_TYPE_AUDIO, SWITCH_MEDIA_FLOW_SENDRECV, SDP_TYPE_REQUEST); - switch_core_session_receive_message(session, &msg); - } + switch_ivr_check_hold(session); + max_forwards = switch_core_session_sprintf(session, "%d", forwardval); switch_channel_set_variable(channel, forwardvar_name, max_forwards); diff --git a/src/switch_ivr_async.c b/src/switch_ivr_async.c index db1bb1e4e54..14d6a21c154 100644 --- a/src/switch_ivr_async.c +++ b/src/switch_ivr_async.c @@ -737,7 +737,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_session_echo(switch_core_session_t *s */ if (switch_channel_has_dtmf(channel)) { if (!args->input_callback && !args->buf) { - status = SWITCH_STATUS_BREAK; break; } switch_channel_dequeue_dtmf(channel, &dtmf); @@ -2001,10 +2000,9 @@ static switch_bool_t eavesdrop_callback(switch_media_bug_t *bug, void *user_data } if (nframe) { - switch_frame_t frame = {0}; + switch_frame_t frame = *nframe; uint8_t buf[SWITCH_RECOMMENDED_BUFFER_SIZE] = ""; - frame = *nframe; frame.data = buf; frame.codec = nframe->codec; @@ -2184,8 +2182,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session const char *group_name = switch_channel_get_variable(tchannel, "eavesdrop_group"); /* If we don't have a group, then return */ if (!group_name) { - status = SWITCH_STATUS_BREAK; - goto end; + switch_goto_status(SWITCH_STATUS_BREAK, end); } /* Separate the group */ data = strdup(group_name); @@ -2200,8 +2197,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session switch_safe_free(data); /* If we didn't find any match, then end */ if (!ok) { - status = SWITCH_STATUS_BREAK; - goto end; + switch_goto_status(SWITCH_STATUS_BREAK, end); } } @@ -2226,7 +2222,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session SWITCH_CODEC_FLAG_ENCODE | SWITCH_CODEC_FLAG_DECODE, NULL, switch_core_session_get_pool(session)) != SWITCH_STATUS_SUCCESS) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "Cannot init codec\n"); - switch_core_session_rwunlock(tsession); goto end; } @@ -2586,8 +2581,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session msg.message_id = SWITCH_MESSAGE_INDICATE_UNBRIDGE; switch_core_session_receive_message(session, &msg); - - + status = SWITCH_STATUS_SUCCESS; end: if (codec_initialized) @@ -2612,7 +2606,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_eavesdrop_session(switch_core_session } switch_core_session_rwunlock(tsession); - status = SWITCH_STATUS_SUCCESS; switch_core_session_reset(session, SWITCH_TRUE, SWITCH_TRUE); } @@ -4149,7 +4142,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_tone_detect_session(switch_core_sessi bflags |= SMBF_NO_PAUSE; if (cont->bug_running) { - status = SWITCH_STATUS_SUCCESS; switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), 
SWITCH_LOG_DEBUG, "%s bug already running\n", switch_channel_get_name(channel)); } else { cont->bug_running = 1; @@ -4626,7 +4618,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_and_detect_speech(switch_core_se if (state.done) { status = SWITCH_STATUS_SUCCESS; } - *result = state.result; + if (result) { + *result = state.result; + } arg_recursion_check_stop(args); @@ -5169,7 +5163,6 @@ SWITCH_STANDARD_SCHED_FUNC(sch_hangup_callback) { struct hangup_helper *helper; switch_core_session_t *session, *other_session; - const char *other_uuid; switch_assert(task); @@ -5179,7 +5172,7 @@ SWITCH_STANDARD_SCHED_FUNC(sch_hangup_callback) switch_channel_t *channel = switch_core_session_get_channel(session); if (helper->bleg) { - if ((other_uuid = switch_channel_get_variable(channel, SWITCH_BRIDGE_VARIABLE)) && (other_session = switch_core_session_locate(other_uuid))) { + if (SWITCH_STATUS_SUCCESS == switch_core_session_get_partner(session, &other_session)) { switch_channel_t *other_channel = switch_core_session_get_channel(other_session); switch_channel_hangup(other_channel, helper->cause); switch_core_session_rwunlock(other_session); @@ -5313,7 +5306,7 @@ SWITCH_DECLARE(uint32_t) switch_ivr_schedule_broadcast(time_t runtime, const cha SWITCH_DECLARE(switch_status_t) switch_ivr_broadcast(const char *uuid, const char *path, switch_media_flag_t flags) { switch_channel_t *channel; - switch_core_session_t *session, *master; + switch_core_session_t *session; switch_event_t *event; switch_core_session_t *other_session = NULL; const char *other_uuid = NULL; @@ -5325,7 +5318,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_broadcast(const char *uuid, const cha switch_assert(path); - if (!(master = session = switch_core_session_locate(uuid))) { + if (!(session = switch_core_session_locate(uuid))) { return SWITCH_STATUS_FALSE; } @@ -5381,7 +5374,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_broadcast(const char *uuid, const cha } switch_core_session_rwunlock(other_session); - master = other_session; other_session = NULL; } @@ -5391,7 +5383,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_broadcast(const char *uuid, const cha if ((flags & SMF_ECHO_ALEG)) { if ((flags & SMF_EXEC_INLINE)) { - nomedia = 0; switch_core_session_execute_application(session, app, path); } else { if (switch_event_create(&event, SWITCH_EVENT_COMMAND) == SWITCH_STATUS_SUCCESS) { @@ -5414,7 +5405,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_broadcast(const char *uuid, const cha switch_channel_set_flag(channel, CF_BROADCAST_DROP_MEDIA); } } - master = session; } if (cause) { diff --git a/src/switch_ivr_bridge.c b/src/switch_ivr_bridge.c index 5ff242e102c..3e5e2145be5 100644 --- a/src/switch_ivr_bridge.c +++ b/src/switch_ivr_bridge.c @@ -80,7 +80,7 @@ static void text_bridge_thread(switch_core_session_t *session, void *obj) inuse = switch_buffer_inuse(text_buffer); if (inuse && (switch_channel_test_flag(channel, CF_TEXT_IDLE) || switch_test_flag(read_frame, SFF_TEXT_LINE_BREAK))) { - int bytes = 0; + int bytes; if (inuse + 4 > text_framesize) { void *tmp = malloc(inuse + 1024); @@ -114,7 +114,6 @@ static void text_bridge_thread(switch_core_session_t *session, void *obj) *(text_framedata + bytes) = '\r'; *(text_framedata + bytes + 1) = '\n'; *(text_framedata + bytes + 2) = '\0'; - bytes += 2; frame.data = text_framedata; frame.datalen = strlen((char *)frame.data); @@ -176,9 +175,6 @@ static void video_bridge_thread(switch_core_session_t *session, void *obj) while (switch_channel_up_nosig(channel) && switch_channel_up_nosig(b_channel) 
&& vh->up == 1) { if (switch_channel_media_up(channel)) { - switch_codec_t *a_codec = switch_core_session_get_video_read_codec(vh->session_a); - switch_codec_t *b_codec = switch_core_session_get_video_write_codec(vh->session_b); - if (switch_core_session_transcoding(vh->session_a, vh->session_b, SWITCH_MEDIA_TYPE_VIDEO)) { pass_val = 1; } else { @@ -196,11 +192,14 @@ static void video_bridge_thread(switch_core_session_t *session, void *obj) } if (!switch_channel_test_flag(channel, CF_PROXY_MEDIA)) { - switch_assert(a_codec); - switch_assert(b_codec); + switch_codec_implementation_t session_a_codec_implementation; + switch_codec_implementation_t session_b_codec_implementation; + + switch_core_session_get_video_read_impl(vh->session_a, &session_a_codec_implementation); + switch_core_session_get_video_write_impl(vh->session_b, &session_b_codec_implementation); if (switch_channel_test_flag(channel, CF_VIDEO_DECODED_READ)) { - if (a_codec->implementation->impl_id == b_codec->implementation->impl_id && !switch_channel_test_flag(b_channel, CF_VIDEO_DECODED_READ)) { + if (session_a_codec_implementation.impl_id == session_b_codec_implementation.impl_id && !switch_channel_test_flag(b_channel, CF_VIDEO_DECODED_READ)) { if (set_decoded_read) { switch_channel_clear_flag_recursive(channel, CF_VIDEO_DECODED_READ); set_decoded_read = 0; @@ -208,7 +207,7 @@ static void video_bridge_thread(switch_core_session_t *session, void *obj) } } } else { - if (a_codec->implementation->impl_id != b_codec->implementation->impl_id || + if (session_a_codec_implementation.impl_id != session_b_codec_implementation.impl_id || switch_channel_test_flag(b_channel, CF_VIDEO_DECODED_READ)) { switch_channel_set_flag_recursive(channel, CF_VIDEO_DECODED_READ); set_decoded_read = 1; @@ -421,9 +420,7 @@ static void *audio_bridge_thread(switch_thread_t *thread, void *obj) inner_bridge = switch_channel_test_flag(chan_a, CF_INNER_BRIDGE); if (!switch_channel_test_flag(chan_a, CF_ANSWERED) && (bridge_answer_timeout = switch_channel_get_variable(chan_a, "bridge_answer_timeout"))) { - if ((answer_timeout = atoi(bridge_answer_timeout)) < 0) { - answer_timeout = 0; - } else { + if ((answer_timeout = atoi(bridge_answer_timeout)) >= 0) { answer_limit = switch_epoch_time_now(NULL) + answer_timeout; } } @@ -707,7 +704,7 @@ static void *audio_bridge_thread(switch_thread_t *thread, void *obj) if (switch_core_session_dequeue_event(session_a, &event, SWITCH_FALSE) == SWITCH_STATUS_SUCCESS) { if (input_callback) { - status = input_callback(session_a, event, SWITCH_INPUT_TYPE_EVENT, user_data, 0); + input_callback(session_a, event, SWITCH_INPUT_TYPE_EVENT, user_data, 0); } if ((event->event_id != SWITCH_EVENT_COMMAND && event->event_id != SWITCH_EVENT_MESSAGE) @@ -2020,6 +2017,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_uuid_bridge(const char *originator_uu originator_channel = switch_core_session_get_channel(originator_session); originatee_channel = switch_core_session_get_channel(originatee_session); + switch_ivr_check_hold(originator_session); + switch_ivr_check_hold(originatee_session); + if (switch_channel_test_flag(originator_channel, CF_LEG_HOLDING)) { switch_channel_set_flag(originator_channel, CF_HOLD_ON_BRIDGE); diff --git a/src/switch_ivr_menu.c b/src/switch_ivr_menu.c index 3d55e4dac09..255c9a4f127 100644 --- a/src/switch_ivr_menu.c +++ b/src/switch_ivr_menu.c @@ -616,7 +616,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_menu_execute(switch_core_session_t *s if ((application_interface = 
switch_loadable_module_get_application_interface(app_name))) { if (!zstr(menu->transfer_sound) && !strcmp(app_name, "transfer")) { - status = play_and_collect(session, menu, menu->transfer_sound, 0); + play_and_collect(session, menu, menu->transfer_sound, 0); } switch_core_session_exec(session, application_interface, app_arg); diff --git a/src/switch_ivr_originate.c b/src/switch_ivr_originate.c index 9e9ebdb19c6..2ae09195199 100644 --- a/src/switch_ivr_originate.c +++ b/src/switch_ivr_originate.c @@ -129,6 +129,7 @@ typedef struct { char *file; char *error_file; int confirm_timeout; + int confirm_read_timeout; char key[80]; uint8_t early_ok; uint8_t ring_ready; @@ -152,6 +153,7 @@ typedef struct { switch_caller_profile_t *caller_profile_override; switch_bool_t check_vars; switch_memory_pool_t *pool; + originate_status_t originate_status[MAX_PEERS];// = { {0} }; } originate_global_t; @@ -168,7 +170,7 @@ struct key_collect { char *key; char *file; char *error_file; - int confirm_timeout; + int confirm_read_timeout; switch_core_session_t *session; }; @@ -243,7 +245,7 @@ static void *SWITCH_THREAD_FUNC collect_thread_run(switch_thread_t *thread, void status = switch_ivr_read(collect->session, (uint32_t)len, (uint32_t)len, - collect->file, NULL, buf, sizeof(buf), collect->confirm_timeout, NULL, 0); + file, NULL, buf, sizeof(buf), collect->confirm_read_timeout, NULL, 0); if (status != SWITCH_STATUS_SUCCESS && status != SWITCH_STATUS_BREAK && status != SWITCH_STATUS_TOO_SMALL) { @@ -281,7 +283,7 @@ static void launch_collect_thread(struct key_collect *collect) } static int check_per_channel_timeouts(originate_global_t *oglobals, - originate_status_t *originate_status, int max, time_t start, switch_call_cause_t *force_reason) + int max, time_t start, switch_call_cause_t *force_reason) { int x = 0, i, delayed_channels = 0, active_channels = 0; uint32_t early_exit_time = 0, delayed_min = 0; @@ -289,9 +291,9 @@ static int check_per_channel_timeouts(originate_global_t *oglobals, time_t elapsed = switch_epoch_time_now(NULL) - start; for (i = 0; i < max; i++) { - if (originate_status[i].peer_channel && switch_channel_get_state(originate_status[i].peer_channel) != CS_DESTROY && - switch_channel_get_state(originate_status[i].peer_channel) != CS_REPORTING) { - if (originate_status[i].per_channel_delay_start) { + if (oglobals->originate_status[i].peer_channel && switch_channel_get_state(oglobals->originate_status[i].peer_channel) != CS_DESTROY && + switch_channel_get_state(oglobals->originate_status[i].peer_channel) != CS_REPORTING) { + if (oglobals->originate_status[i].per_channel_delay_start) { delayed_channels++; } else { active_channels++; @@ -301,56 +303,56 @@ static int check_per_channel_timeouts(originate_global_t *oglobals, if (active_channels == 0 && delayed_channels) { for (i = 0; i < max; i++) { - if (originate_status[i].peer_channel && originate_status[i].per_channel_delay_start && - (!delayed_min || delayed_min > originate_status[i].per_channel_delay_start)) { - delayed_min = originate_status[i].per_channel_delay_start; + if (oglobals->originate_status[i].peer_channel && oglobals->originate_status[i].per_channel_delay_start && + (!delayed_min || delayed_min > oglobals->originate_status[i].per_channel_delay_start)) { + delayed_min = oglobals->originate_status[i].per_channel_delay_start; } } early_exit_time = delayed_min - (uint32_t) elapsed; } for (i = 0; i < max; i++) { - if (originate_status[i].peer_channel && originate_status[i].per_channel_delay_start && - (elapsed > 
originate_status[i].per_channel_delay_start || active_channels == 0)) { + if (oglobals->originate_status[i].peer_channel && oglobals->originate_status[i].per_channel_delay_start && + (elapsed > oglobals->originate_status[i].per_channel_delay_start || active_channels == 0)) { if (active_channels == 0) { - if (originate_status[i].per_channel_timelimit_sec) { - if (originate_status[i].per_channel_timelimit_sec > early_exit_time) { + if (oglobals->originate_status[i].per_channel_timelimit_sec) { + if (oglobals->originate_status[i].per_channel_timelimit_sec > early_exit_time) { /* IN theory this check is not needed ( should just be if !0 then -= with no else), if its not 0 it should always be greater.... */ - originate_status[i].per_channel_timelimit_sec -= early_exit_time; + oglobals->originate_status[i].per_channel_timelimit_sec -= early_exit_time; } else { - originate_status[i].per_channel_timelimit_sec = 1; + oglobals->originate_status[i].per_channel_timelimit_sec = 1; } } - if (originate_status[i].per_channel_progress_timelimit_sec) { - if (originate_status[i].per_channel_progress_timelimit_sec > early_exit_time) { + if (oglobals->originate_status[i].per_channel_progress_timelimit_sec) { + if (oglobals->originate_status[i].per_channel_progress_timelimit_sec > early_exit_time) { /* IN theory this check is not needed ( should just be if !0 then -= with no else), if its not 0 it should always be greater.... */ - originate_status[i].per_channel_progress_timelimit_sec -= early_exit_time; + oglobals->originate_status[i].per_channel_progress_timelimit_sec -= early_exit_time; } else { - originate_status[i].per_channel_progress_timelimit_sec = 1; + oglobals->originate_status[i].per_channel_progress_timelimit_sec = 1; } } - originate_status[i].per_channel_delay_start -= delayed_min; + oglobals->originate_status[i].per_channel_delay_start -= delayed_min; } else { - originate_status[i].per_channel_delay_start = 0; + oglobals->originate_status[i].per_channel_delay_start = 0; } - if (!originate_status[i].per_channel_delay_start) { - switch_channel_clear_flag(originate_status[i].peer_channel, CF_BLOCK_STATE); + if (!oglobals->originate_status[i].per_channel_delay_start) { + switch_channel_clear_flag(oglobals->originate_status[i].peer_channel, CF_BLOCK_STATE); } } - if (originate_status[i].peer_channel && switch_channel_up_nosig(originate_status[i].peer_channel)) { - if (originate_status[i].per_channel_progress_timelimit_sec && elapsed > originate_status[i].per_channel_progress_timelimit_sec && - !(switch_channel_test_flag(originate_status[i].peer_channel, CF_RING_READY) || - switch_channel_test_flag(originate_status[i].peer_channel, CF_ANSWERED) || - (!oglobals->monitor_early_media_ring && switch_channel_test_flag(originate_status[i].peer_channel, CF_EARLY_MEDIA)) + if (oglobals->originate_status[i].peer_channel && switch_channel_up_nosig(oglobals->originate_status[i].peer_channel)) { + if (oglobals->originate_status[i].per_channel_progress_timelimit_sec && elapsed > oglobals->originate_status[i].per_channel_progress_timelimit_sec && + !(switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_RING_READY) || + switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_ANSWERED) || + (!oglobals->monitor_early_media_ring && switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_EARLY_MEDIA)) ) ) { - switch_channel_hangup(originate_status[i].peer_channel, SWITCH_CAUSE_PROGRESS_TIMEOUT); + switch_channel_hangup(oglobals->originate_status[i].peer_channel, 
SWITCH_CAUSE_PROGRESS_TIMEOUT); *force_reason = SWITCH_CAUSE_PROGRESS_TIMEOUT; x++; } - if (originate_status[i].per_channel_timelimit_sec && elapsed > originate_status[i].per_channel_timelimit_sec) { - switch_channel_hangup(originate_status[i].peer_channel, SWITCH_CAUSE_ALLOTTED_TIMEOUT); + if (oglobals->originate_status[i].per_channel_timelimit_sec && elapsed > oglobals->originate_status[i].per_channel_timelimit_sec) { + switch_channel_hangup(oglobals->originate_status[i].peer_channel, SWITCH_CAUSE_ALLOTTED_TIMEOUT); x++; } } @@ -359,6 +361,7 @@ static int check_per_channel_timeouts(originate_global_t *oglobals, return x; } + static switch_bool_t monitor_callback(switch_core_session_t *session, const char *app, const char *data) { if (app) { @@ -448,7 +451,7 @@ static void inherit_codec(switch_channel_t *caller_channel, switch_core_session_ } } -static uint8_t check_channel_status(originate_global_t *oglobals, originate_status_t *originate_status, uint32_t len, switch_call_cause_t *force_reason) +static uint8_t check_channel_status(originate_global_t *oglobals, uint32_t len, switch_call_cause_t *force_reason, time_t start) { uint32_t i; @@ -473,29 +476,29 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat for (i = 0; i < len; i++) { - if (originate_status[i].peer_channel && switch_channel_test_flag(originate_status[i].peer_channel, CF_CHANNEL_SWAP)) { - const char *key = switch_channel_get_variable(originate_status[i].peer_channel, "channel_swap_uuid"); + if (oglobals->originate_status[i].peer_channel && switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_CHANNEL_SWAP)) { + const char *key = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "channel_swap_uuid"); switch_core_session_t *swap_session, *old_session; - + if ((swap_session = switch_core_session_locate(key))) { - switch_channel_clear_flag(originate_status[i].peer_channel, CF_CHANNEL_SWAP); - switch_channel_hangup(originate_status[i].peer_channel, SWITCH_CAUSE_PICKED_OFF); + switch_channel_clear_flag(oglobals->originate_status[i].peer_channel, CF_CHANNEL_SWAP); + switch_channel_hangup(oglobals->originate_status[i].peer_channel, SWITCH_CAUSE_PICKED_OFF); - switch_log_printf(SWITCH_CHANNEL_CHANNEL_LOG(originate_status[i].peer_channel), SWITCH_LOG_DEBUG, "Swapping %s for %s\n", - switch_core_session_get_name(swap_session), switch_channel_get_name(originate_status[i].peer_channel)); + switch_log_printf(SWITCH_CHANNEL_CHANNEL_LOG(oglobals->originate_status[i].peer_channel), SWITCH_LOG_DEBUG, "Swapping %s for %s\n", + switch_core_session_get_name(swap_session), switch_channel_get_name(oglobals->originate_status[i].peer_channel)); - old_session = originate_status[i].peer_session; - originate_status[i].peer_session = swap_session; - originate_status[i].peer_channel = switch_core_session_get_channel(originate_status[i].peer_session); - originate_status[i].caller_profile = switch_channel_get_caller_profile(originate_status[i].peer_channel); - switch_channel_set_flag(originate_status[i].peer_channel, CF_ORIGINATING); + old_session = oglobals->originate_status[i].peer_session; + oglobals->originate_status[i].peer_session = swap_session; + oglobals->originate_status[i].peer_channel = switch_core_session_get_channel(oglobals->originate_status[i].peer_session); + oglobals->originate_status[i].caller_profile = switch_channel_get_caller_profile(oglobals->originate_status[i].peer_channel); + switch_channel_set_flag(oglobals->originate_status[i].peer_channel, 
CF_ORIGINATING); - switch_channel_answer(originate_status[i].peer_channel); + switch_channel_answer(oglobals->originate_status[i].peer_channel); - switch_channel_set_variable(originate_status[i].peer_channel, "picked_up_uuid", switch_core_session_get_uuid(old_session)); - switch_channel_execute_on(originate_status[i].peer_channel, "execute_on_pickup"); - switch_channel_api_on(originate_status[i].peer_channel, "api_on_pickup"); + switch_channel_set_variable(oglobals->originate_status[i].peer_channel, "picked_up_uuid", switch_core_session_get_uuid(old_session)); + switch_channel_execute_on(oglobals->originate_status[i].peer_channel, "execute_on_pickup"); + switch_channel_api_on(oglobals->originate_status[i].peer_channel, "api_on_pickup"); switch_core_session_rwunlock(old_session); break; @@ -506,15 +509,15 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat for (i = 0; i < len; i++) { switch_channel_state_t state; - if (originate_status[i].tagged && originate_status[i].peer_session) { - switch_channel_t *channel = switch_core_session_get_channel(originate_status[i].peer_session); + if (oglobals->originate_status[i].tagged && oglobals->originate_status[i].peer_session) { + switch_channel_t *channel = switch_core_session_get_channel(oglobals->originate_status[i].peer_session); uint32_t j; if (switch_channel_down_nosig(channel)) { switch_call_cause_t cause = switch_channel_get_cause(channel); for (j = 0; j < len; j++) { - channel = switch_core_session_get_channel(originate_status[j].peer_session); + channel = switch_core_session_get_channel(oglobals->originate_status[j].peer_session); switch_channel_hangup(channel, cause); } oglobals->hups = len; @@ -524,20 +527,20 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } - if (originate_status[i].peer_channel && switch_channel_test_flag(originate_status[i].peer_channel, CF_PICKUP)) { - if (originate_status[i].per_channel_timelimit_sec == 0) { + if (oglobals->originate_status[i].peer_channel && switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_PICKUP)) { + if (oglobals->originate_status[i].per_channel_timelimit_sec == 0) { pickups_without_timelimit++; } } - if (!(originate_status[i].peer_channel && originate_status[i].peer_session)) { + if (!(oglobals->originate_status[i].peer_channel && oglobals->originate_status[i].peer_session)) { oglobals->hups++; continue; } - if ((ring_ready_val = (uint8_t)switch_channel_test_flag(originate_status[i].peer_channel, CF_RING_READY))) { - if (!originate_status[i].ring_ready) { - originate_status[i].ring_ready = ring_ready_val; + if ((ring_ready_val = (uint8_t)switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_RING_READY))) { + if (!oglobals->originate_status[i].ring_ready) { + oglobals->originate_status[i].ring_ready = ring_ready_val; } if (oglobals->sending_ringback == 1) { @@ -548,7 +551,7 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat oglobals->ring_ready = ring_ready_val; if (caller_channel && !oglobals->ignore_ring_ready) { if (len == 1) { - switch_channel_pass_callee_id(originate_status[0].peer_channel, caller_channel); + switch_channel_pass_callee_id(oglobals->originate_status[0].peer_channel, caller_channel); } switch_channel_ring_ready_value(caller_channel, ring_ready_val); oglobals->sent_ring = ring_ready_val; @@ -557,7 +560,7 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } } - if 
(switch_channel_test_flag(originate_status[i].peer_channel, CF_EARLY_MEDIA)) { + if (switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_EARLY_MEDIA)) { if (oglobals->ignore_early_media == 3 && oglobals->bridge_early_media == -1) { oglobals->bridge_early_media = i; @@ -568,19 +571,19 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat send_ringback++; pindex = (uint32_t) i; } else if (!oglobals->sent_ring && oglobals->ignore_early_media == 2 && len == 1 && caller_channel && !oglobals->ignore_ring_ready) { - switch_channel_pass_callee_id(originate_status[0].peer_channel, caller_channel); + switch_channel_pass_callee_id(oglobals->originate_status[0].peer_channel, caller_channel); switch_channel_ring_ready(caller_channel); oglobals->sent_ring = 1; } - if (!originate_status[i].early_media) { - originate_status[i].early_media = 1; + if (!oglobals->originate_status[i].early_media) { + oglobals->originate_status[i].early_media = 1; if (oglobals->early_ok) { pindex = i; } if (oglobals->monitor_early_media_fail) { - const char *var = switch_channel_get_variable(originate_status[i].peer_channel, "monitor_early_media_fail"); + const char *var = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "monitor_early_media_fail"); if (!zstr(var)) { char *fail_array[128] = { 0 }; int fail_count = 0; @@ -597,14 +600,14 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat char *p, *q; if (!(p = strchr(cause, ':'))) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } *p++ = '\0'; if (!p) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } @@ -615,13 +618,13 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat if (!(p = strchr(p, ':'))) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } *p++ = '\0'; if (!p) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } @@ -631,7 +634,7 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } } switch_snprintf(bug_key, sizeof(bug_key), "monitor_early_media_fail_%d", ++y); - switch_ivr_tone_detect_session(originate_status[i].peer_session, bug_key, p, "r", 0, hits, "fail", cause, monitor_callback); + switch_ivr_tone_detect_session(oglobals->originate_status[i].peer_session, bug_key, p, "r", 0, hits, "fail", cause, monitor_callback); } @@ -641,8 +644,8 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } if (oglobals->monitor_early_media_ring) { - const char *var = switch_channel_get_variable(originate_status[i].peer_channel, "monitor_early_media_ring"); - const char *var_total = switch_channel_get_variable(originate_status[i].peer_channel, 
"monitor_early_media_ring_total"); + const char *var = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "monitor_early_media_ring"); + const char *var_total = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "monitor_early_media_ring_total"); if (!zstr(var)) { char *ring_array[128] = { 0 }; int ring_count = 0; @@ -658,7 +661,7 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat char *p = ring_array[fx], *q; if (!p) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } @@ -667,13 +670,13 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } if (!(p = strchr(p, ':'))) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } *p++ = '\0'; if (!p) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_ERROR, "Parse Error\n"); continue; } @@ -683,17 +686,17 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } } - switch_channel_set_private(originate_status[i].peer_channel, "_oglobals_", oglobals); + switch_channel_set_private(oglobals->originate_status[i].peer_channel, "_oglobals_", oglobals); switch_snprintf(bug_key, sizeof(bug_key), "monitor_early_media_ring_%d", ++y); - switch_ivr_tone_detect_session(originate_status[i].peer_session, bug_key, p, "r", 0, hits, "ring", NULL, monitor_callback); + switch_ivr_tone_detect_session(oglobals->originate_status[i].peer_session, bug_key, p, "r", 0, hits, "ring", NULL, monitor_callback); } if (var_total) { int tmp = atoi(var_total); if (tmp > 0 && tmp < 100) { - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(originate_status[i].peer_session), SWITCH_LOG_DEBUG, - "%s setting ring total to %d\n", switch_channel_get_name(originate_status[i].peer_channel), tmp); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(oglobals->originate_status[i].peer_session), SWITCH_LOG_DEBUG, + "%s setting ring total to %d\n", switch_channel_get_name(oglobals->originate_status[i].peer_channel), tmp); oglobals->monitor_early_media_ring_total = tmp; } } @@ -717,22 +720,22 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat } } - if (!switch_channel_test_flag(originate_status[i].peer_channel, CF_PARK) && - !switch_channel_test_flag(originate_status[i].peer_channel, CF_CONSUME_ON_ORIGINATE)) { - if (switch_core_session_messages_waiting(originate_status[i].peer_session)) { - if (switch_channel_test_flag(originate_status[i].peer_channel, CF_THREAD_SLEEPING)) { - switch_core_session_wake_session_thread(originate_status[i].peer_session); + if (!switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_PARK) && + !switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_CONSUME_ON_ORIGINATE)) { + if (switch_core_session_messages_waiting(oglobals->originate_status[i].peer_session)) { + if (switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_THREAD_SLEEPING)) { + 
switch_core_session_wake_session_thread(oglobals->originate_status[i].peer_session); } else { - switch_ivr_parse_all_events(originate_status[i].peer_session); + switch_ivr_parse_all_events(oglobals->originate_status[i].peer_session); } } } - if (switch_channel_test_flag(originate_status[i].peer_channel, CF_EARLY_OK)) { + if (switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_EARLY_OK)) { if (!oglobals->early_ok) { oglobals->early_ok = 1; } - switch_channel_clear_flag(originate_status[i].peer_channel, CF_EARLY_OK); + switch_channel_clear_flag(oglobals->originate_status[i].peer_channel, CF_EARLY_OK); } if (caller_channel && switch_channel_test_flag(caller_channel, CF_EARLY_OK)) { @@ -742,55 +745,81 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat switch_channel_clear_flag(caller_channel, CF_EARLY_OK); } - state = switch_channel_get_state(originate_status[i].peer_channel); - if (state >= CS_HANGUP || state == CS_RESET || switch_channel_test_flag(originate_status[i].peer_channel, CF_TRANSFER) || - switch_channel_test_flag(originate_status[i].peer_channel, CF_REDIRECT) || - switch_channel_test_flag(originate_status[i].peer_channel, CF_BRIDGED) || - !switch_channel_test_flag(originate_status[i].peer_channel, CF_ORIGINATING) + state = switch_channel_get_state(oglobals->originate_status[i].peer_channel); + if (state >= CS_HANGUP || state == CS_RESET || switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_TRANSFER) || + switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_REDIRECT) || + switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_BRIDGED) || + !switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_ORIGINATING) ) { (oglobals->hups)++; - if (switch_channel_test_flag(originate_status[i].peer_channel, CF_PICKUP)) { - if (originate_status[i].per_channel_timelimit_sec == 0) { + if (switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_PICKUP)) { + if (oglobals->originate_status[i].per_channel_timelimit_sec == 0) { pickups_without_timelimit--; } } - } else if ((switch_channel_test_flag(originate_status[i].peer_channel, CF_ANSWERED) || - (oglobals->early_ok && switch_channel_test_flag(originate_status[i].peer_channel, CF_EARLY_MEDIA)) || + } else if ((switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_ANSWERED) || + (oglobals->early_ok && switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_EARLY_MEDIA)) || (oglobals->ring_ready && oglobals->return_ring_ready && len == 1 && - switch_channel_test_flag(originate_status[i].peer_channel, CF_RING_READY)) + switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_RING_READY)) ) - && !switch_channel_test_flag(originate_status[i].peer_channel, CF_TAGGED) + && !switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_TAGGED) ) { - if (!zstr(oglobals->key)) { + const char *group_confirm_key = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "group_confirm_key"); + const char *group_confirm_file = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "group_confirm_file"); + + if (!zstr(oglobals->key) || !zstr(group_confirm_key)) { struct key_collect *collect; + const char *group_confirm_timeout = switch_channel_get_variable(oglobals->originate_status[i].peer_channel, "group_confirm_timeout"); + int extend_timeout = 0; + int cancel_timeout = 0; + if (group_confirm_timeout && 
switch_is_number(group_confirm_timeout)) { + // leg var overrides global group_confirm_timeout + extend_timeout = atoi(group_confirm_timeout); + if (extend_timeout == 0) { + cancel_timeout = 1; + } + } else { + extend_timeout = oglobals->confirm_timeout; + } - if (oglobals->cancel_timeout == SWITCH_TRUE) { + if (extend_timeout > 0) { + /* extend timeout for this leg only */ + time_t elapsed = switch_epoch_time_now(NULL) - start; + oglobals->originate_status[i].per_channel_progress_timelimit_sec = elapsed + extend_timeout; + oglobals->originate_status[i].per_channel_timelimit_sec = elapsed + extend_timeout; + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, "elapsed %" TIME_T_FMT ", timelimit extended to %u\n", elapsed, oglobals->originate_status[i].per_channel_timelimit_sec); + } else if (oglobals->cancel_timeout || cancel_timeout) { /* cancel timeout for this leg only */ - originate_status[i].per_channel_progress_timelimit_sec = 0; - originate_status[i].per_channel_timelimit_sec = 0; + oglobals->originate_status[i].per_channel_progress_timelimit_sec = 0; + oglobals->originate_status[i].per_channel_timelimit_sec = 0; } - if ((collect = switch_core_session_alloc(originate_status[i].peer_session, sizeof(*collect)))) { - switch_channel_set_flag(originate_status[i].peer_channel, CF_TAGGED); - if (!zstr(oglobals->key)) { - collect->key = switch_core_session_strdup(originate_status[i].peer_session, oglobals->key); + if ((collect = switch_core_session_alloc(oglobals->originate_status[i].peer_session, sizeof(*collect)))) { + switch_channel_set_flag(oglobals->originate_status[i].peer_channel, CF_TAGGED); + if (!zstr(group_confirm_key)) { + collect->key = switch_core_session_strdup(oglobals->originate_status[i].peer_session, group_confirm_key); + } else { + collect->key = switch_core_session_strdup(oglobals->originate_status[i].peer_session, oglobals->key); } - if (!zstr(oglobals->file)) { - collect->file = switch_core_session_strdup(originate_status[i].peer_session, oglobals->file); + + if (!zstr(group_confirm_file)) { + collect->file = switch_core_session_strdup(oglobals->originate_status[i].peer_session, group_confirm_file); + } else if (!zstr(oglobals->file)) { + collect->file = switch_core_session_strdup(oglobals->originate_status[i].peer_session, oglobals->file); } if (!zstr(oglobals->error_file)) { - collect->error_file = switch_core_session_strdup(originate_status[i].peer_session, oglobals->error_file); + collect->error_file = switch_core_session_strdup(oglobals->originate_status[i].peer_session, oglobals->error_file); } - if (oglobals->confirm_timeout) { - collect->confirm_timeout = oglobals->confirm_timeout; + if (oglobals->confirm_read_timeout) { + collect->confirm_read_timeout = oglobals->confirm_read_timeout; } else { - collect->confirm_timeout = 5000; + collect->confirm_read_timeout = 5000; } - switch_channel_audio_sync(originate_status[i].peer_channel); - collect->session = originate_status[i].peer_session; + switch_channel_audio_sync(oglobals->originate_status[i].peer_channel); + collect->session = oglobals->originate_status[i].peer_session; launch_collect_thread(collect); } } else { @@ -800,7 +829,7 @@ static uint8_t check_channel_status(originate_global_t *oglobals, originate_stat goto end; } - } else if (switch_channel_test_flag(originate_status[i].peer_channel, CF_WINNER)) { + } else if (switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_WINNER)) { oglobals->idx = i; rval = 0; pindex = (uint32_t) i; @@ -819,17 +848,17 @@ static uint8_t 
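A note on the group_confirm handling above: group_confirm_key and group_confirm_file can now be supplied per leg, and a per-leg group_confirm_timeout overrides the global value. A positive value extends that leg's per-channel progress and time limits by that many seconds, counted from the elapsed time at which confirmation starts; a value of 0 cancels the timeouts for that leg only. A hedged usage sketch (the gateway names and prompt path are illustrative, not from the patch):

    originate {group_confirm_key=1,group_confirm_file=/tmp/press-one.wav}[group_confirm_timeout=30]sofia/gateway/gw_a/1000,[group_confirm_timeout=0]sofia/gateway/gw_b/1000 &park()

Here the gw_a leg gets a 30-second window to complete confirmation once it answers or provides acceptable early media, while the gw_b leg's confirmation timeout is cancelled outright.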
check_channel_status(originate_global_t *oglobals, originate_stat if (rval == 0 && pickups_without_timelimit) { for (i = 0; i < len; i++) { - if (originate_status[i].peer_channel && switch_channel_test_flag(originate_status[i].peer_channel, CF_PICKUP) && - switch_channel_up(originate_status[i].peer_channel)) { - switch_channel_hangup(originate_status[i].peer_channel, SWITCH_CAUSE_NO_PICKUP); + if (oglobals->originate_status[i].peer_channel && switch_channel_test_flag(oglobals->originate_status[i].peer_channel, CF_PICKUP) && + switch_channel_up(oglobals->originate_status[i].peer_channel)) { + switch_channel_hangup(oglobals->originate_status[i].peer_channel, SWITCH_CAUSE_NO_PICKUP); } } } if (pindex > -1 && caller_channel && switch_channel_ready(caller_channel) && !switch_channel_media_ready(caller_channel) && - switch_channel_media_ready(originate_status[pindex].peer_channel)) { - inherit_codec(caller_channel, originate_status[pindex].peer_session); + switch_channel_media_ready(oglobals->originate_status[pindex].peer_channel)) { + inherit_codec(caller_channel, oglobals->originate_status[pindex].peer_session); } if (send_ringback) { @@ -1791,7 +1820,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_enterprise_originate(switch_core_sess struct early_state { originate_global_t *oglobals; - originate_status_t *originate_status; switch_mutex_t *mutex; switch_buffer_t *buffer; int ready; @@ -1804,7 +1832,7 @@ typedef struct early_state early_state_t; static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void *obj) { early_state_t *state = (early_state_t *) obj; - originate_status_t originate_status[MAX_PEERS] = { {0} }; + //originate_status_t originate_status[MAX_PEERS] = { {0} }; int16_t mux_data[SWITCH_RECOMMENDED_BUFFER_SIZE / 2] = { 0 }; int32_t sample; switch_codec_t read_codecs[MAX_PEERS] = { {0} }; @@ -1815,6 +1843,7 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void * switch_frame_t *read_frame = NULL; switch_codec_implementation_t read_impl = { 0 }; +#if 0 for (i = 0; i < MAX_PEERS && i < state->ttl; i++) { switch_core_session_t *session = state->originate_status[i].peer_session; switch_channel_t *channel = NULL; @@ -1830,7 +1859,8 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void * originate_status[i].peer_channel = channel; } } - +#endif + if (state->oglobals->session) { switch_core_session_get_read_impl(state->oglobals->session, &read_impl); } @@ -1842,8 +1872,8 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void * answered = 0; for (i = 0; i < MAX_PEERS && i < state->ttl; i++) { - switch_core_session_t *session = originate_status[i].peer_session; - switch_channel_t *channel = originate_status[i].peer_channel; + switch_core_session_t *session = state->oglobals->originate_status[i].peer_session; + switch_channel_t *channel = state->oglobals->originate_status[i].peer_channel; if (!session || !channel || !switch_channel_up(channel)) { continue; @@ -1922,8 +1952,8 @@ static void *SWITCH_THREAD_FUNC early_thread_run(switch_thread_t *thread, void * for (i = 0; i < MAX_PEERS && i < state->ttl; i++) { - switch_core_session_t *session = originate_status[i].peer_session; - switch_channel_t *channel = originate_status[i].peer_channel; + switch_core_session_t *session = state->oglobals->originate_status[i].peer_session; + switch_channel_t *channel = state->oglobals->originate_status[i].peer_channel; if (!session) continue; @@ -1976,8 +2006,8 @@ SWITCH_DECLARE(switch_status_t) 
switch_ivr_originate(switch_core_session_t *sess switch_call_cause_t *cancel_cause, switch_dial_handle_t *dh) { - originate_status_t originate_status[MAX_PEERS] = { {0} }; - switch_originate_flag_t dftflags = SOF_NONE, myflags = dftflags; + //originate_status_t originate_status[MAX_PEERS] = { {0} }; + switch_originate_flag_t dftflags = SOF_NONE, myflags; char *pipe_names[MAX_PEERS] = { 0 }; char *data = NULL; switch_status_t status = SWITCH_STATUS_SUCCESS; @@ -2293,6 +2323,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess ok = 1; } else if (!strcasecmp((char *) hi->name, "group_confirm_cancel_timeout")) { ok = 1; + } else if (!strcasecmp((char *) hi->name, "group_confirm_timeout")) { + ok = 1; } else if (!strcasecmp((char *) hi->name, "forked_dial")) { ok = 1; } else if (!strcasecmp((char *) hi->name, "fail_on_single_reject")) { @@ -2370,10 +2402,22 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess } #endif - if (switch_true(switch_event_get_header(var_event, "group_confirm_cancel_timeout"))) { + if ((var = switch_event_get_header(var_event, "group_confirm_timeout"))) { + // has precedence over group_confirm_cancel_timeout + if (switch_is_number(var)) { + oglobals.confirm_timeout = atoi(var); + if (oglobals.confirm_timeout == 0) { + oglobals.cancel_timeout = SWITCH_TRUE; + } + } + } else if (switch_true(switch_event_get_header(var_event, "group_confirm_cancel_timeout"))) { oglobals.cancel_timeout = SWITCH_TRUE; } + if ((var = switch_event_get_header(var_event, "group_confirm_early_ok"))) { + oglobals.early_ok = switch_true(var); + } + if ((var = switch_event_get_header(var_event, "group_confirm_key"))) { switch_copy_string(oglobals.key, var, sizeof(oglobals.key)); if ((var = switch_event_get_header(var_event, "group_confirm_file"))) { @@ -2386,7 +2430,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess int tmp = atoi(var); if (tmp >= 0) { - oglobals.confirm_timeout = tmp; + oglobals.confirm_read_timeout = tmp; } } @@ -2654,7 +2698,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess reason = SWITCH_CAUSE_NONE; memset(peer_names, 0, sizeof(peer_names)); peer_session = NULL; - memset(originate_status, 0, sizeof(originate_status)); + memset(oglobals.originate_status, 0, sizeof(oglobals.originate_status)); new_profile = NULL; new_session = NULL; chan_type = NULL; @@ -2867,9 +2911,9 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess new_profile->callee_id_name = switch_core_strdup(new_profile->pool, "Outbound Call"); new_profile->callee_id_number = switch_sanitize_number(switch_core_strdup(new_profile->pool, new_profile->destination_number)); - originate_status[i].caller_profile = NULL; - originate_status[i].peer_channel = NULL; - originate_status[i].peer_session = NULL; + oglobals.originate_status[i].caller_profile = NULL; + oglobals.originate_status[i].peer_channel = NULL; + oglobals.originate_status[i].peer_session = NULL; new_session = NULL; @@ -2982,27 +3026,27 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess goto done; } - originate_status[i].peer_channel = switch_core_session_get_channel(new_session); - originate_status[i].caller_profile = switch_channel_get_caller_profile(originate_status[i].peer_channel); - originate_status[i].peer_session = new_session; + oglobals.originate_status[i].peer_channel = switch_core_session_get_channel(new_session); + 
oglobals.originate_status[i].caller_profile = switch_channel_get_caller_profile(oglobals.originate_status[i].peer_channel); + oglobals.originate_status[i].peer_session = new_session; - switch_channel_set_flag(originate_status[i].peer_channel, CF_ORIGINATING); + switch_channel_set_flag(oglobals.originate_status[i].peer_channel, CF_ORIGINATING); if (caller_channel) { - switch_channel_set_variable(originate_status[i].peer_channel, "call_uuid", switch_channel_get_variable(caller_channel, "call_uuid")); + switch_channel_set_variable(oglobals.originate_status[i].peer_channel, "call_uuid", switch_channel_get_variable(caller_channel, "call_uuid")); } if (local_var_event) { const char *device_id = switch_event_get_header(local_var_event, "device_id"); - switch_channel_set_profile_var(originate_status[i].peer_channel, "device_id", device_id); + switch_channel_set_profile_var(oglobals.originate_status[i].peer_channel, "device_id", device_id); } if ((lc = switch_event_get_header(var_event, "local_var_clobber"))) { local_clobber = switch_true(lc); } - if (switch_channel_test_flag(originate_status[i].peer_channel, CF_NO_PRESENCE)) { + if (switch_channel_test_flag(oglobals.originate_status[i].peer_channel, CF_NO_PRESENCE)) { if (var_event) { switch_event_del_header(var_event, "presence_id"); } @@ -3017,7 +3061,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_event_header_t *header; /* install the vars from the {} params */ for (header = var_event->headers; header; header = header->next) { - switch_channel_set_variable_var_check(originate_status[i].peer_channel, header->name, header->value, oglobals.check_vars); + switch_channel_set_variable_var_check(oglobals.originate_status[i].peer_channel, header->name, header->value, oglobals.check_vars); } } } @@ -3026,7 +3070,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess if (local_var_event) { switch_event_header_t *header; for (header = local_var_event->headers; header; header = header->next) { - switch_channel_set_variable_var_check(originate_status[i].peer_channel, header->name, header->value, oglobals.check_vars); + switch_channel_set_variable_var_check(oglobals.originate_status[i].peer_channel, header->name, header->value, oglobals.check_vars); } switch_event_destroy(&local_var_event); } @@ -3036,62 +3080,62 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_event_header_t *header; /* install the vars from the {} params */ for (header = var_event->headers; header; header = header->next) { - switch_channel_set_variable_var_check(originate_status[i].peer_channel, header->name, header->value, oglobals.check_vars); + switch_channel_set_variable_var_check(oglobals.originate_status[i].peer_channel, header->name, header->value, oglobals.check_vars); } } } - if (originate_status[i].peer_channel) { + if (oglobals.originate_status[i].peer_channel) { const char *vvar; - if (switch_true(switch_channel_get_variable(originate_status[i].peer_channel, "leg_required"))) { - originate_status[i].tagged = 1; + if (switch_true(switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "leg_required"))) { + oglobals.originate_status[i].tagged = 1; } - if ((vvar = switch_channel_get_variable(originate_status[i].peer_channel, "origination_channel_name"))) { - switch_channel_set_name(originate_status[i].peer_channel, vvar); + if ((vvar = switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "origination_channel_name"))) { + 
switch_channel_set_name(oglobals.originate_status[i].peer_channel, vvar); } - if ((vvar = switch_channel_get_variable(originate_status[i].peer_channel, "origination_callee_id_name"))) { - switch_channel_set_profile_var(originate_status[i].peer_channel, "callee_id_name", vvar); + if ((vvar = switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "origination_callee_id_name"))) { + switch_channel_set_profile_var(oglobals.originate_status[i].peer_channel, "callee_id_name", vvar); } - if ((vvar = switch_channel_get_variable(originate_status[i].peer_channel, "origination_callee_id_number"))) { - switch_channel_set_profile_var(originate_status[i].peer_channel, "callee_id_number", vvar); + if ((vvar = switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "origination_callee_id_number"))) { + switch_channel_set_profile_var(oglobals.originate_status[i].peer_channel, "callee_id_number", vvar); } - if ((vvar = switch_channel_get_variable(originate_status[i].peer_channel, "leg_timeout"))) { + if ((vvar = switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "leg_timeout"))) { int val = atoi(vvar); if (val > 0) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s Setting leg timeout to %d\n", - switch_channel_get_name(originate_status[i].peer_channel), val); - originate_status[i].per_channel_timelimit_sec = (uint32_t) val; + switch_channel_get_name(oglobals.originate_status[i].peer_channel), val); + oglobals.originate_status[i].per_channel_timelimit_sec = (uint32_t) val; } } - if ((vvar = switch_channel_get_variable(originate_status[i].peer_channel, "leg_progress_timeout"))) { + if ((vvar = switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "leg_progress_timeout"))) { int val = atoi(vvar); if (val > 0) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s Setting leg progress timeout to %d\n", - switch_channel_get_name(originate_status[i].peer_channel), val); - originate_status[i].per_channel_progress_timelimit_sec = (uint32_t) val; + switch_channel_get_name(oglobals.originate_status[i].peer_channel), val); + oglobals.originate_status[i].per_channel_progress_timelimit_sec = (uint32_t) val; } } - if ((vvar = switch_channel_get_variable(originate_status[i].peer_channel, "leg_delay_start"))) { + if ((vvar = switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "leg_delay_start"))) { int val = atoi(vvar); if (val > 0) { switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_DEBUG, "%s Setting leg delay start to %d\n", - switch_channel_get_name(originate_status[i].peer_channel), val); - originate_status[i].per_channel_delay_start = (uint32_t) val; + switch_channel_get_name(oglobals.originate_status[i].peer_channel), val); + oglobals.originate_status[i].per_channel_delay_start = (uint32_t) val; - if (originate_status[i].per_channel_progress_timelimit_sec != 0) { - originate_status[i].per_channel_progress_timelimit_sec += originate_status[i].per_channel_delay_start; + if (oglobals.originate_status[i].per_channel_progress_timelimit_sec != 0) { + oglobals.originate_status[i].per_channel_progress_timelimit_sec += oglobals.originate_status[i].per_channel_delay_start; } - if (originate_status[i].per_channel_timelimit_sec != 0) { - originate_status[i].per_channel_timelimit_sec += originate_status[i].per_channel_delay_start; + if (oglobals.originate_status[i].per_channel_timelimit_sec != 0) { + oglobals.originate_status[i].per_channel_timelimit_sec += 
oglobals.originate_status[i].per_channel_delay_start; } } } @@ -3105,12 +3149,12 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_channel_t *channel = switch_core_session_get_channel(a_session); char *val = switch_core_session_sprintf(a_session, "%s;%s;%s", - switch_core_session_get_uuid(originate_status[i].peer_session), - switch_str_nil(switch_channel_get_variable(originate_status[i].peer_channel, "callee_id_name")), - switch_str_nil(switch_channel_get_variable(originate_status[i].peer_channel, "callee_id_number"))); + switch_core_session_get_uuid(oglobals.originate_status[i].peer_session), + switch_str_nil(switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "callee_id_name")), + switch_str_nil(switch_channel_get_variable(oglobals.originate_status[i].peer_channel, "callee_id_number"))); - switch_channel_set_variable(originate_status[i].peer_channel, "originating_leg_uuid", switch_core_session_get_uuid(a_session)); + switch_channel_set_variable(oglobals.originate_status[i].peer_channel, "originating_leg_uuid", switch_core_session_get_uuid(a_session)); switch_channel_add_variable_var_check(channel, "originated_legs", val, SWITCH_FALSE, SWITCH_STACK_PUSH); @@ -3121,32 +3165,32 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess l_session = NULL; } - switch_channel_execute_on(originate_status[i].peer_channel, SWITCH_CHANNEL_EXECUTE_ON_ORIGINATE_VARIABLE); - switch_channel_api_on(originate_status[i].peer_channel, SWITCH_CHANNEL_API_ON_ORIGINATE_VARIABLE); + switch_channel_execute_on(oglobals.originate_status[i].peer_channel, SWITCH_CHANNEL_EXECUTE_ON_ORIGINATE_VARIABLE); + switch_channel_api_on(oglobals.originate_status[i].peer_channel, SWITCH_CHANNEL_API_ON_ORIGINATE_VARIABLE); } if (table) { - switch_channel_add_state_handler(originate_status[i].peer_channel, table); + switch_channel_add_state_handler(oglobals.originate_status[i].peer_channel, table); } if (oglobals.monitor_early_media_ring || oglobals.monitor_early_media_fail || oglobals.ignore_early_media == 4) { - switch_channel_set_flag(originate_status[i].peer_channel, CF_CONSUME_ON_ORIGINATE); + switch_channel_set_flag(oglobals.originate_status[i].peer_channel, CF_CONSUME_ON_ORIGINATE); } - switch_channel_add_state_handler(originate_status[i].peer_channel, &originate_state_handlers); + switch_channel_add_state_handler(oglobals.originate_status[i].peer_channel, &originate_state_handlers); - if ((flags & SOF_NOBLOCK) && originate_status[i].peer_session) { + if ((flags & SOF_NOBLOCK) && oglobals.originate_status[i].peer_session) { status = SWITCH_STATUS_SUCCESS; - *bleg = originate_status[i].peer_session; + *bleg = oglobals.originate_status[i].peer_session; *cause = SWITCH_CAUSE_SUCCESS; goto outer_for; } - if (!switch_core_session_running(originate_status[i].peer_session)) { - if (originate_status[i].per_channel_delay_start) { - switch_channel_set_flag(originate_status[i].peer_channel, CF_BLOCK_STATE); + if (!switch_core_session_running(oglobals.originate_status[i].peer_session)) { + if (oglobals.originate_status[i].per_channel_delay_start) { + switch_channel_set_flag(oglobals.originate_status[i].peer_channel, CF_BLOCK_STATE); } - switch_core_session_thread_launch(originate_status[i].peer_session); + switch_core_session_thread_launch(oglobals.originate_status[i].peer_session); } } @@ -3158,11 +3202,11 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess int state; time_t elapsed; - if 
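On the per-leg limit variables handled above: leg_timeout and leg_progress_timeout set per_channel_timelimit_sec and per_channel_progress_timelimit_sec for a single leg, and leg_delay_start postpones the leg's start (via CF_BLOCK_STATE before the session thread launches) while also being added onto whichever of the two limits are set, presumably because the per-leg clocks are measured from the start of the originate. Worked example: with [leg_delay_start=5,leg_timeout=20] the leg starts dialing 5 seconds in and its time limit becomes 25 seconds overall, so it still gets its full 20 seconds of ringing.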
(!originate_status[i].peer_channel) { + if (!oglobals.originate_status[i].peer_channel) { continue; } - state = switch_channel_get_state(originate_status[i].peer_channel); + state = switch_channel_get_state(oglobals.originate_status[i].peer_channel); if (state < CS_HANGUP) { valid_channels++; @@ -3199,7 +3243,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_cond_next(); } - check_per_channel_timeouts(&oglobals, originate_status, and_argc, start, &force_reason); + check_per_channel_timeouts(&oglobals, and_argc, start, &force_reason); if (valid_channels == 0) { @@ -3237,9 +3281,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess } while ((!caller_channel || switch_channel_ready(caller_channel) || switch_channel_test_flag(caller_channel, CF_XFER_ZOMBIE)) && - check_channel_status(&oglobals, originate_status, and_argc, &force_reason)) { + check_channel_status(&oglobals, and_argc, &force_reason, start)) { time_t elapsed = switch_epoch_time_now(NULL) - start; - read_packet = 0; if (cancel_cause && *cancel_cause > 0) { @@ -3250,7 +3293,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess goto notready; } - check_per_channel_timeouts(&oglobals, originate_status, and_argc, start, &force_reason); + check_per_channel_timeouts(&oglobals, and_argc, start, &force_reason); if (oglobals.session) { switch_ivr_parse_all_events(oglobals.session); @@ -3275,10 +3318,10 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_channel_t *pchannel; const char *cause_str; - if (!originate_status[i].peer_session) { + if (!oglobals.originate_status[i].peer_session) { continue; } - pchannel = switch_core_session_get_channel(originate_status[i].peer_session); + pchannel = switch_core_session_get_channel(oglobals.originate_status[i].peer_session); if (switch_channel_down_nosig(pchannel)) { int neg, pos; @@ -3352,14 +3395,14 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess if (oglobals.ringback_ok == 1) { switch_status_t rst; - rst = setup_ringback(&oglobals, originate_status, and_argc, ringback_data, &ringback, &write_frame, &write_codec); + rst = setup_ringback(&oglobals, oglobals.originate_status, and_argc, ringback_data, &ringback, &write_frame, &write_codec); if (oglobals.bridge_early_media > -1) { switch_threadattr_t *thd_attr = NULL; switch_threadattr_create(&thd_attr, switch_core_session_get_pool(session)); switch_threadattr_stacksize_set(thd_attr, SWITCH_THREAD_STACKSIZE); early_state.oglobals = &oglobals; - early_state.originate_status = originate_status; + //early_state.originate_status = oglobals.originate_status; early_state.ready = 1; early_state.ringback = &ringback; early_state.ttl = and_argc; @@ -3510,33 +3553,33 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess if (holding) { if (oglobals.idx > IDX_NADA) { - peer_session = originate_status[oglobals.idx].peer_session; - peer_channel = originate_status[oglobals.idx].peer_channel; - originate_status[oglobals.idx].peer_channel = NULL; + peer_session = oglobals.originate_status[oglobals.idx].peer_session; + peer_channel = oglobals.originate_status[oglobals.idx].peer_channel; + oglobals.originate_status[oglobals.idx].peer_channel = NULL; } else if (and_argc == 1) { - peer_session = originate_status[0].peer_session; - peer_channel = originate_status[0].peer_channel; - originate_status[0].peer_channel = NULL; + peer_session = 
oglobals.originate_status[0].peer_session; + peer_channel = oglobals.originate_status[0].peer_channel; + oglobals.originate_status[0].peer_channel = NULL; } else { for (i = 0; i < and_argc; i++) { - if (!peer_eligible(originate_status[i].peer_channel)) { + if (!peer_eligible(oglobals.originate_status[i].peer_channel)) { continue; } - if (switch_channel_media_ready(originate_status[i].peer_channel)) { - peer_session = originate_status[i].peer_session; - peer_channel = originate_status[i].peer_channel; - originate_status[i].peer_channel = NULL; + if (switch_channel_media_ready(oglobals.originate_status[i].peer_channel)) { + peer_session = oglobals.originate_status[i].peer_session; + peer_channel = oglobals.originate_status[i].peer_channel; + oglobals.originate_status[i].peer_channel = NULL; goto end_search; } } for (i = 0; i < and_argc; i++) { - if (!peer_eligible(originate_status[i].peer_channel)) { + if (!peer_eligible(oglobals.originate_status[i].peer_channel)) { continue; } - if (switch_channel_up_nosig(originate_status[i].peer_channel)) { - peer_session = originate_status[i].peer_session; - peer_channel = originate_status[i].peer_channel; - originate_status[i].peer_channel = NULL; + if (switch_channel_up_nosig(oglobals.originate_status[i].peer_channel)) { + peer_session = oglobals.originate_status[i].peer_session; + peer_channel = oglobals.originate_status[i].peer_channel; + oglobals.originate_status[i].peer_channel = NULL; break; } } @@ -3657,7 +3700,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess } for (i = 0; i < and_argc; i++) { - if (!peer_eligible(originate_status[i].peer_channel)) { + if (!peer_eligible(oglobals.originate_status[i].peer_channel)) { continue; } @@ -3672,17 +3715,17 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess } else { if (and_argc > 1) { reason = SWITCH_CAUSE_LOSE_RACE; - } else if (!switch_channel_ready(originate_status[i].peer_channel)) { - wait_for_cause(originate_status[i].peer_channel); - if (switch_channel_down_nosig(originate_status[i].peer_channel)) { - reason = switch_channel_get_cause(originate_status[i].peer_channel); + } else if (!switch_channel_ready(oglobals.originate_status[i].peer_channel)) { + wait_for_cause(oglobals.originate_status[i].peer_channel); + if (switch_channel_down_nosig(oglobals.originate_status[i].peer_channel)) { + reason = switch_channel_get_cause(oglobals.originate_status[i].peer_channel); } } else { reason = SWITCH_CAUSE_NO_ANSWER; } } } - if (switch_channel_up_nosig(originate_status[i].peer_channel)) { + if (switch_channel_up_nosig(oglobals.originate_status[i].peer_channel)) { if (caller_channel && i == 0) { holding = switch_channel_get_variable(caller_channel, SWITCH_SOFT_HOLDING_UUID_VARIABLE); switch_channel_set_variable(caller_channel, SWITCH_SOFT_HOLDING_UUID_VARIABLE, NULL); @@ -3697,23 +3740,23 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_channel_set_variable(holding_channel, SWITCH_HANGUP_AFTER_BRIDGE_VARIABLE, "true"); if (caller_channel && switch_true(switch_channel_get_variable(caller_channel, "recording_follow_transfer"))) { - switch_ivr_transfer_recordings(session, originate_status[i].peer_session); + switch_ivr_transfer_recordings(session, oglobals.originate_status[i].peer_session); } if (switch_true(switch_channel_get_variable(holding_channel, "recording_follow_transfer"))) { - switch_ivr_transfer_recordings(holding_session, originate_status[i].peer_session); + 
switch_ivr_transfer_recordings(holding_session, oglobals.originate_status[i].peer_session); } switch_core_session_rwunlock(holding_session); } - switch_channel_set_flag(originate_status[i].peer_channel, CF_LAZY_ATTENDED_TRANSFER); - switch_ivr_uuid_bridge(holding, switch_core_session_get_uuid(originate_status[i].peer_session)); + switch_channel_set_flag(oglobals.originate_status[i].peer_channel, CF_LAZY_ATTENDED_TRANSFER); + switch_ivr_uuid_bridge(holding, switch_core_session_get_uuid(oglobals.originate_status[i].peer_session)); holding = NULL; } else { if (force_reason == SWITCH_CAUSE_LOSE_RACE || reason == SWITCH_CAUSE_LOSE_RACE) { - switch_channel_set_variable(originate_status[i].peer_channel, "group_dial_status", "loser"); + switch_channel_set_variable(oglobals.originate_status[i].peer_channel, "group_dial_status", "loser"); } - switch_channel_hangup(originate_status[i].peer_channel, force_reason ? force_reason : reason); + switch_channel_hangup(oglobals.originate_status[i].peer_channel, force_reason ? force_reason : reason); } } } @@ -3722,8 +3765,8 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess if (oglobals.idx > IDX_NADA) { - if ((peer_session = originate_status[oglobals.idx].peer_session)) { - peer_channel = switch_core_session_get_channel(originate_status[oglobals.idx].peer_session); + if ((peer_session = oglobals.originate_status[oglobals.idx].peer_session)) { + peer_channel = switch_core_session_get_channel(oglobals.originate_status[oglobals.idx].peer_session); } } else { status = SWITCH_STATUS_FALSE; @@ -3834,10 +3877,10 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess *cause = switch_channel_get_cause(peer_channel); } else { for (i = 0; i < and_argc; i++) { - if (!originate_status[i].peer_channel) { + if (!oglobals.originate_status[i].peer_channel) { continue; } - *cause = switch_channel_get_cause(originate_status[i].peer_channel); + *cause = switch_channel_get_cause(oglobals.originate_status[i].peer_channel); break; } } @@ -3846,11 +3889,11 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess for (i = 0; i < and_argc; i++) { switch_channel_t *channel; - if (!originate_status[i].peer_session) { + if (!oglobals.originate_status[i].peer_session) { continue; } - channel = switch_core_session_get_channel(originate_status[i].peer_session); + channel = switch_core_session_get_channel(oglobals.originate_status[i].peer_session); switch_channel_wait_for_state_timeout(channel, CS_REPORTING, 5000); @@ -3858,7 +3901,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_channel_set_timestamps(channel); } - if (switch_ivr_generate_xml_cdr(originate_status[i].peer_session, &cdr) == SWITCH_STATUS_SUCCESS) { + if (switch_ivr_generate_xml_cdr(oglobals.originate_status[i].peer_session, &cdr) == SWITCH_STATUS_SUCCESS) { if ((xml_text = switch_xml_toxml(cdr, SWITCH_FALSE))) { switch_snprintf(buf, sizeof(buf), "%s_%d", cdr_var, ++cdr_total); switch_channel_set_variable(caller_channel, buf, xml_text); @@ -3878,11 +3921,11 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess for (i = 0; i < and_argc; i++) { switch_channel_t *channel; - if (!originate_status[i].peer_session) { + if (!oglobals.originate_status[i].peer_session) { continue; } - channel = switch_core_session_get_channel(originate_status[i].peer_session); + channel = switch_core_session_get_channel(oglobals.originate_status[i].peer_session); 
switch_channel_wait_for_state_timeout(channel, CS_REPORTING, 5000); @@ -3890,7 +3933,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_channel_set_timestamps(channel); } - if (switch_ivr_generate_json_cdr(originate_status[i].peer_session, &json_cdr, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) { + if (switch_ivr_generate_json_cdr(oglobals.originate_status[i].peer_session, &json_cdr, SWITCH_TRUE) == SWITCH_STATUS_SUCCESS) { json_text = cJSON_PrintUnformatted(json_cdr); switch_snprintf(buf, sizeof(buf), "%s_%d", json_cdr_var, ++cdr_total); switch_channel_set_variable(caller_channel, buf, json_text); @@ -3917,14 +3960,14 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess } else { *cause = SWITCH_CAUSE_DESTINATION_OUT_OF_ORDER; for (i = 0; i < and_argc; i++) { - if (!peer_eligible(originate_status[i].peer_channel)) { + if (!peer_eligible(oglobals.originate_status[i].peer_channel)) { continue; } - wait_for_cause(originate_status[i].peer_channel); + wait_for_cause(oglobals.originate_status[i].peer_channel); - if (switch_channel_down_nosig(originate_status[i].peer_channel)) { - *cause = switch_channel_get_cause(originate_status[i].peer_channel); + if (switch_channel_down_nosig(oglobals.originate_status[i].peer_channel)) { + *cause = switch_channel_get_cause(oglobals.originate_status[i].peer_channel); break; } @@ -4014,40 +4057,40 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_core_session_t *peer_session; char *val; - if (!originate_status[i].peer_channel) { + if (!oglobals.originate_status[i].peer_channel) { continue; } if (session) { - val = switch_core_session_sprintf(originate_status[i].peer_session, "%s;%s", - switch_core_session_get_uuid(originate_status[i].peer_session), - switch_channel_cause2str(switch_channel_get_cause(originate_status[i].peer_channel))); + val = switch_core_session_sprintf(oglobals.originate_status[i].peer_session, "%s;%s", + switch_core_session_get_uuid(oglobals.originate_status[i].peer_session), + switch_channel_cause2str(switch_channel_get_cause(oglobals.originate_status[i].peer_channel))); switch_channel_add_variable_var_check(switch_core_session_get_channel(session), "originate_causes", val, SWITCH_FALSE, SWITCH_STACK_PUSH); } if (status == SWITCH_STATUS_SUCCESS) { - switch_channel_clear_flag(originate_status[i].peer_channel, CF_ORIGINATING); - if (bleg && *bleg && *bleg == originate_status[i].peer_session) { + switch_channel_clear_flag(oglobals.originate_status[i].peer_channel, CF_ORIGINATING); + if (bleg && *bleg && *bleg == oglobals.originate_status[i].peer_session) { continue; } - } else if ((state = switch_channel_get_state(originate_status[i].peer_channel)) < CS_HANGUP && - switch_channel_test_flag(originate_status[i].peer_channel, CF_ORIGINATING)) { - if (!(state == CS_RESET || switch_channel_test_flag(originate_status[i].peer_channel, CF_TRANSFER) || - switch_channel_test_flag(originate_status[i].peer_channel, CF_REDIRECT) || - switch_channel_test_flag(originate_status[i].peer_channel, CF_BRIDGED))) { + } else if ((state = switch_channel_get_state(oglobals.originate_status[i].peer_channel)) < CS_HANGUP && + switch_channel_test_flag(oglobals.originate_status[i].peer_channel, CF_ORIGINATING)) { + if (!(state == CS_RESET || switch_channel_test_flag(oglobals.originate_status[i].peer_channel, CF_TRANSFER) || + switch_channel_test_flag(oglobals.originate_status[i].peer_channel, CF_REDIRECT) || + 
switch_channel_test_flag(oglobals.originate_status[i].peer_channel, CF_BRIDGED))) { if (caller_channel && switch_channel_test_flag(caller_channel, CF_INTERCEPTED)) { - switch_channel_set_flag(originate_status[i].peer_channel, CF_INTERCEPT); + switch_channel_set_flag(oglobals.originate_status[i].peer_channel, CF_INTERCEPT); } - switch_channel_hangup(originate_status[i].peer_channel, *cause); + switch_channel_hangup(oglobals.originate_status[i].peer_channel, *cause); } } - switch_channel_clear_flag(originate_status[i].peer_channel, CF_ORIGINATING); + switch_channel_clear_flag(oglobals.originate_status[i].peer_channel, CF_ORIGINATING); - peer_session = originate_status[i].peer_session; - originate_status[i].down_session = originate_status[i].peer_session; - originate_status[i].peer_session = NULL; - originate_status[i].peer_channel = NULL; + peer_session = oglobals.originate_status[i].peer_session; + oglobals.originate_status[i].down_session = oglobals.originate_status[i].peer_session; + oglobals.originate_status[i].peer_session = NULL; + oglobals.originate_status[i].peer_channel = NULL; switch_core_session_rwunlock(peer_session); } @@ -4062,11 +4105,11 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_originate(switch_core_session_t *sess switch_channel_t *pchannel; const char *cause_str; - if (!originate_status[i].down_session) { + if (!oglobals.originate_status[i].down_session) { continue; } - pchannel = switch_core_session_get_channel(originate_status[i].down_session); + pchannel = switch_core_session_get_channel(oglobals.originate_status[i].down_session); wait_for_cause(pchannel); if (switch_channel_down_nosig(pchannel)) { diff --git a/src/switch_ivr_play_say.c b/src/switch_ivr_play_say.c index 339515faec4..e2179509584 100644 --- a/src/switch_ivr_play_say.c +++ b/src/switch_ivr_play_say.c @@ -1207,6 +1207,7 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess uint32_t buflen = 0; int flags; int cumulative = 0; + int last_speed = -1; if (switch_channel_pre_answer(channel) != SWITCH_STATUS_SUCCESS) { return SWITCH_STATUS_FALSE; @@ -1474,11 +1475,6 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess interval = read_impl.microseconds_per_packet / 1000; - if (!fh->audio_buffer) { - switch_buffer_create_dynamic(&fh->audio_buffer, FILE_BLOCKSIZE, FILE_BUFSIZE, 0); - switch_assert(fh->audio_buffer); - } - codec_name = "L16"; if (!switch_core_codec_ready((&codec))) { @@ -1589,9 +1585,13 @@ SWITCH_DECLARE(switch_status_t) switch_ivr_play_file(switch_core_session_t *sess switch_event_fire(&event); } + if (!fh->audio_buffer) { + switch_buffer_create_dynamic(&fh->audio_buffer, FILE_BLOCKSIZE, FILE_BUFSIZE, 0); + switch_assert(fh->audio_buffer); + } + for (;;) { int do_speed = 1; - int last_speed = -1; int f; if (!switch_channel_ready(channel)) { diff --git a/src/switch_loadable_module.c b/src/switch_loadable_module.c index 2f6c1dca69d..174644faffd 100644 --- a/src/switch_loadable_module.c +++ b/src/switch_loadable_module.c @@ -1762,7 +1762,6 @@ static switch_status_t switch_loadable_module_load_file(char *path, char *filena } if ((module = switch_core_alloc(pool, sizeof(switch_loadable_module_t))) == 0) { - err = "Could not allocate memory\n"; abort(); } @@ -2059,7 +2058,6 @@ SWITCH_DECLARE(switch_status_t) switch_loadable_module_build_dynamic(char *filen } if ((module = switch_core_alloc(pool, sizeof(switch_loadable_module_t))) == 0) { - err = "Could not allocate memory\n"; abort(); } @@ -2229,7 +2227,7 @@ SWITCH_DECLARE(switch_status_t) 
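Two notes on the switch_ivr_play_say.c hunks above: last_speed was previously declared, and therefore reset to -1, inside the playback for (;;) loop, so any comparison against the previous iteration's playback speed could never observe a carried-over value; hoisting the declaration to function scope lets that state persist across iterations. The fh->audio_buffer allocation is likewise deferred from the codec-setup phase to just before the playback loop, after the file has been opened and the playback event fired, without changing what gets allocated.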
switch_loadable_module_init(switch_bool_t autolo } else { - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CONSOLE, "open of %s failed\n", cf); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CONSOLE, "open of %s failed\n", precf); } if (switch_core_sqldb_init(&err) != SWITCH_STATUS_SUCCESS) @@ -2679,7 +2677,13 @@ static void switch_loadable_module_sort_codecs(const switch_codec_implementation #endif for (i = 0; i < arraylen; i++) { - int this_ptime = array[i]->microseconds_per_packet / 1000; + int this_ptime; + + if (!array[i]) { + continue; + } + + this_ptime = array[i]->microseconds_per_packet / 1000; if (!strcasecmp(array[i]->iananame, "ilbc")) { this_ptime = 20; @@ -2692,7 +2696,7 @@ static void switch_loadable_module_sort_codecs(const switch_codec_implementation #endif } - if (i > 0 && strcasecmp(array[i]->iananame, array[i-1]->iananame) && this_ptime != sorted_ptime) { + if (i > 0 && array[i-1] && strcasecmp(array[i]->iananame, array[i-1]->iananame) && this_ptime != sorted_ptime) { int j; int swapped = 0; diff --git a/src/switch_log.c b/src/switch_log.c index 055298915ca..59452a84b6e 100644 --- a/src/switch_log.c +++ b/src/switch_log.c @@ -304,13 +304,37 @@ SWITCH_DECLARE(const char *) switch_log_level2str(switch_log_level_t level) return LEVELS[level]; } +static int switch_log_to_mask(switch_log_level_t level) +{ + switch (level) { + case SWITCH_LOG_DEBUG: + return (1<<7); + case SWITCH_LOG_INFO: + return (1<<6); + case SWITCH_LOG_NOTICE: + return (1<<5); + case SWITCH_LOG_WARNING: + return (1<<4); + case SWITCH_LOG_ERROR: + return (1<<3); + case SWITCH_LOG_CRIT: + return (1<<2); + case SWITCH_LOG_ALERT: + return (1<<1); + case SWITCH_LOG_CONSOLE: + return (1<<0); + default: + return 0; + } +} + SWITCH_DECLARE(uint32_t) switch_log_str2mask(const char *str) { int argc = 0, x = 0; char *argv[10] = { 0 }; uint32_t mask = 0; char *p = strdup(str); - switch_log_level_t level; + switch_log_level_t level = SWITCH_LOG_INVALID; switch_assert(p); @@ -321,8 +345,9 @@ SWITCH_DECLARE(uint32_t) switch_log_str2mask(const char *str) break; } else { level = switch_log_str2level(argv[x]); + if (level != SWITCH_LOG_INVALID) { - mask |= (1 << level); + mask |= switch_log_to_mask(level); } } } diff --git a/src/switch_msrp.c b/src/switch_msrp.c index da5e288dc0e..5eba9e34ecb 100644 --- a/src/switch_msrp.c +++ b/src/switch_msrp.c @@ -695,7 +695,7 @@ static char *msrp_parse_header(char *start, int skip, const char *end, switch_ms static switch_msrp_msg_t *msrp_parse_headers(char *start, int len, switch_msrp_msg_t *msrp_msg, switch_memory_pool_t *pool) { char *p = start; - char *q = p; + char *q; const char *end = start + len; while(p < end) { diff --git a/src/switch_rtp.c b/src/switch_rtp.c index fb8d5d2cf2a..28a91ee3280 100644 --- a/src/switch_rtp.c +++ b/src/switch_rtp.c @@ -82,6 +82,11 @@ static const switch_payload_t INVALID_PT = 255; #define rtp_session_name(_rtp_session) _rtp_session->session ? switch_core_session_get_name(_rtp_session->session) : "-" +#define STUN_USERNAME_MAX_SIZE 513 /* From RFC5389: "It MUST contain a UTF-8 [RFC3629] encoded sequence of less than 513 bytes" */ +#define SDP_UFRAG_MAX_SIZE 256 /* From draft-ietf-mmusic-ice-sip-sdp-24: "the ice-ufrag attribute MUST NOT be longer than 32 + * characters when sending, but an implementation MUST accept up to 256 + * characters when receiving." 
*/ + static switch_port_t START_PORT = RTP_START_PORT; static switch_port_t END_PORT = RTP_END_PORT; static switch_mutex_t *port_lock = NULL; @@ -907,8 +912,8 @@ static void handle_ice(switch_rtp_t *rtp_session, switch_rtp_ice_t *ice, void *d switch_stun_packet_t *packet; switch_stun_packet_attribute_t *attr; void *end_buf; - char username[34] = { 0 }; - unsigned char buf[512] = { 0 }; + char username[STUN_USERNAME_MAX_SIZE] = { 0 }; + unsigned char buf[1500] = { 0 }; switch_size_t cpylen = len; int xlen = 0; int ok = 1; @@ -3263,7 +3268,7 @@ static int dtls_state_handshake(switch_rtp_t *rtp_session, switch_dtls_t *dtls) case SSL_ERROR_NONE: break; default: - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_ERROR, "%s Handshake failure %d. This may happen when you use legacy DTLS v1.0 (legacyDTLS channel var is set) but endpoint requires DTLS v1.2.\n", rtp_type(rtp_session), ret); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_WARNING, "%s Handshake failure %d. This may happen when you use legacy DTLS v1.0 (legacyDTLS channel var is set) but endpoint requires DTLS v1.2.\n", rtp_type(rtp_session), ret); dtls_set_state(dtls, DS_FAIL); return -1; } @@ -4838,9 +4843,9 @@ SWITCH_DECLARE(switch_status_t) switch_rtp_activate_ice(switch_rtp_t *rtp_sessio const char *password, const char *rpassword, ice_proto_t proto, switch_core_media_ice_type_t type, ice_t *ice_params) { - char ice_user[80]; - char user_ice[80]; - char luser_ice[80]; + char ice_user[STUN_USERNAME_MAX_SIZE]; + char user_ice[STUN_USERNAME_MAX_SIZE]; + char luser_ice[SDP_UFRAG_MAX_SIZE]; switch_rtp_ice_t *ice; char *host = NULL; switch_port_t port = 0; @@ -5377,7 +5382,8 @@ static void set_dtmf_delay(switch_rtp_t *rtp_session, uint32_t ms, uint32_t max_ upsamp = ms * (rtp_session->samples_per_second / 1000); max_upsamp = max_ms * (rtp_session->samples_per_second / 1000); - + + rtp_session->sending_dtmf = 0; rtp_session->queue_delay = upsamp; if (rtp_session->flags[SWITCH_RTP_FLAG_USE_TIMER]) { @@ -6216,8 +6222,6 @@ static switch_status_t read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t } if (!(*flags & SFF_PLC) && rtp_session->recv_ctx[rtp_session->srtp_idx_rtp]) { - stat = 0; - if (!rtp_session->flags[SWITCH_RTP_FLAG_SECURE_RECV_MKI]) { stat = srtp_unprotect(rtp_session->recv_ctx[rtp_session->srtp_idx_rtp], &rtp_session->recv_msg.header, &sbytes); } else { @@ -6247,10 +6251,10 @@ static switch_status_t read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t else msg=""; if (errs >= MAX_SRTP_ERRS) { switch_channel_t *channel = switch_core_session_get_channel(rtp_session->session); - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_ERROR, + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_WARNING, "SRTP %s unprotect failed with code %d (%s) %ld bytes %d errors\n", rtp_type(rtp_session), stat, msg, (long)*bytes, errs); - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_ERROR, + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_WARNING, "Ending call due to SRTP error\n"); switch_channel_hangup(channel, SWITCH_CAUSE_SRTP_READ_ERROR); } else if (errs >= WARN_SRTP_ERRS && !(errs % WARN_SRTP_ERRS)) { @@ -6463,7 +6467,6 @@ static switch_status_t read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t } if (!xcheck_jitter) { check_jitter(rtp_session); - xcheck_jitter = *bytes; } } break; @@ -6503,7 +6506,6 @@ static switch_status_t 
read_rtp_packet(switch_rtp_t *rtp_session, switch_size_t if (!xcheck_jitter) { check_jitter(rtp_session); - xcheck_jitter = *bytes; } } } @@ -7958,9 +7960,12 @@ SWITCH_DECLARE(switch_size_t) switch_rtp_dequeue_dtmf(switch_rtp_t *rtp_session, switch_mutex_lock(rtp_session->dtmf_data.dtmf_mutex); if (switch_queue_trypop(rtp_session->dtmf_data.dtmf_inqueue, &pop) == SWITCH_STATUS_SUCCESS) { - _dtmf = (switch_dtmf_t *) pop; + _dtmf = (switch_dtmf_t *)pop; *dtmf = *_dtmf; - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_DEBUG, "RTP RECV DTMF %c:%d\n", dtmf->digit, dtmf->duration); + /* Only log DTMF buffer if sensitive_dtmf channel variable not set to true */ + if (!(switch_channel_var_true(switch_core_session_get_channel(rtp_session->session), SWITCH_SENSITIVE_DTMF_VARIABLE))) { + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(rtp_session->session), SWITCH_LOG_DEBUG,"RTP RECV DTMF %c:%d\n", dtmf->digit, dtmf->duration); + } bytes++; free(pop); } @@ -9238,6 +9243,11 @@ SWITCH_DECLARE(void *) switch_rtp_get_private(switch_rtp_t *rtp_session) return rtp_session->private_data; } +SWITCH_DECLARE(switch_core_session_t*) switch_rtp_get_core_session(switch_rtp_t *rtp_session) +{ + return rtp_session->session; +} + /* For Emacs: * Local Variables: * mode:c diff --git a/src/switch_scheduler.c b/src/switch_scheduler.c index 6452fa9cc24..c08d1071b4d 100644 --- a/src/switch_scheduler.c +++ b/src/switch_scheduler.c @@ -54,7 +54,7 @@ static struct { int task_thread_running; switch_queue_t *event_queue; switch_memory_pool_t *memory_pool; -} globals; +} globals = { 0 }; static void switch_scheduler_execute(switch_scheduler_task_container_t *tp) { diff --git a/src/switch_time.c b/src/switch_time.c index e0c12aa737a..fc94b48ee4f 100644 --- a/src/switch_time.c +++ b/src/switch_time.c @@ -1043,7 +1043,7 @@ SWITCH_MODULE_RUNTIME_FUNCTION(softtimer_runtime) int fwd_errs = 0, rev_errs = 0; int profile_tick = 0; int tfd = -1; - uint32_t time_sync = runtime.time_sync; + uint32_t time_sync; #ifdef HAVE_TIMERFD_CREATE int last_MICROSECONDS_PER_TICK = runtime.microseconds_per_tick; @@ -1079,7 +1079,6 @@ SWITCH_MODULE_RUNTIME_FUNCTION(softtimer_runtime) } switch_time_sync(); - time_sync = runtime.time_sync; globals.STARTED = globals.RUNNING = 1; switch_mutex_lock(runtime.throttle_mutex); @@ -1104,7 +1103,6 @@ SWITCH_MODULE_RUNTIME_FUNCTION(softtimer_runtime) } } - ts = 0; last = 0; fwd_errs = rev_errs = 0; @@ -1158,11 +1156,11 @@ SWITCH_MODULE_RUNTIME_FUNCTION(softtimer_runtime) if (!MONO || time_sync == runtime.time_sync) { #if defined(HAVE_CLOCK_NANOSLEEP) - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, - "If you see this message many times try setting the param enable-clock-nanosleep to true in switch.conf.xml or consider a nicer machine to run me on. I AM *FREE* afterall.\n"); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, + "If you see this message many times try setting the param enable-clock-nanosleep to true in switch.conf.xml or consider a nicer machine to run me on.\n"); #else - switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_CRIT, - "If you see this message many times consider a nicer machine to run me on. 
I AM *FREE* afterall.\n"); + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_DEBUG, + "If you see this message many times consider a nicer machine to run me on.\n"); #endif } } else { @@ -1318,7 +1316,6 @@ SWITCH_MODULE_RUNTIME_FUNCTION(softtimer_runtime) if (tfd > -1) { close(tfd); - tfd = -1; } @@ -1494,7 +1491,6 @@ SWITCH_DECLARE(switch_status_t) switch_strftime_tz(const char *tz, const char *f tzdef = switch_lookup_timezone(tz_name); } else { /* We set the default timezone to GMT. */ - tz_name = "GMT"; tzdef = "GMT"; } @@ -2256,7 +2252,6 @@ static int tzparse(const char *name, register struct state *const sp, const int ** Initially we're assumed to be in standard time. */ isdst = FALSE; - theiroffset = theirstdoffset; /* ** Now juggle transition times and types ** tracking offsets as you do. diff --git a/src/switch_utils.c b/src/switch_utils.c index 64ceb391306..e6e4917b1ee 100644 --- a/src/switch_utils.c +++ b/src/switch_utils.c @@ -39,6 +39,11 @@ #include <sys/time.h> #include <sys/resource.h> #endif +#include <sys/types.h> +#include <unistd.h> +#else + /* process.h is required for _getpid() */ +#include <process.h> #endif #include "private/switch_core_pvt.h" #define ESCAPE_META '\\' @@ -66,6 +71,11 @@ struct switch_network_list { char *name; }; +SWITCH_DECLARE(void *) switch_calloc(size_t nmemb, size_t size) +{ + return calloc(nmemb, size); +} + #ifndef WIN32 SWITCH_DECLARE(int) switch_inet_pton(int af, const char *src, void *dst) { @@ -4529,6 +4539,16 @@ SWITCH_DECLARE(char *)switch_html_strip(const char *str) return text; } +SWITCH_DECLARE(unsigned long) switch_getpid(void) +{ +#ifndef WIN32 + pid_t pid = getpid(); +#else + int pid = _getpid(); +#endif + + return (unsigned long)pid; +} /* For Emacs: diff --git a/src/switch_vad.c b/src/switch_vad.c index f4c65438fdd..6118c0237e5 100644 --- a/src/switch_vad.c +++ b/src/switch_vad.c @@ -1,6 +1,6 @@ /* * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application - * Copyright (C) 2018, Anthony Minessale II + * Copyright (C) 2018-2020, Anthony Minessale II * * Version: MPL 1.1 * @@ -24,6 +24,7 @@ * Contributor(s): * * Seven Du + * Chris Rienzo * * * switch_vad.c VAD code with optional libfvad @@ -189,11 +190,14 @@ SWITCH_DECLARE(switch_vad_state_t) switch_vad_process(switch_vad_t *vad, int16_t // determine if this is a voice or non-voice frame #ifdef SWITCH_HAVE_FVAD if (vad->fvad) { + // fvad returns -1, 0, or 1 + // -1: error + // 0: non-voice frame + // 1: voice frame int ret = fvad_process(vad->fvad, data, samples); - // printf("%d ", ret); fflush(stdout); - - score = vad->thresh + ret - 1; + // if voice frame set score > threshold + score = ret > 0 ? vad->thresh + 100 : 0; } else { #endif int energy = 0, j = 0, count = 0; diff --git a/src/switch_vpx.c b/src/switch_vpx.c index 3d54b39e435..e35d87712fb 100644 --- a/src/switch_vpx.c +++ b/src/switch_vpx.c @@ -1109,13 +1109,15 @@ static switch_status_t buffer_vp9_packets(vpx_context_t *context, switch_frame_t vp9++; if (desc->have_pid) { +#ifdef DEBUG_VP9 uint16_t pid = 0; - pid = *vp9 & 0x7f; //0 bit is M , 1-7 bit is pid.
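On the switch_vad.c scoring change above: fvad_process() returns 1 for a voice frame, 0 for a non-voice frame, and -1 on error. The old formula score = vad->thresh + ret - 1 placed a voice frame exactly at the threshold, which a strict score > thresh comparison (the behaviour the new "set score > threshold" comment asks for) never accepts. The new mapping is, in effect:

    /* old: ret = -1 -> thresh - 2, ret = 0 -> thresh - 1, ret = 1 -> thresh (never strictly above)
     * new: ret = 1 -> thresh + 100 (always above); ret <= 0 -> 0 (never above) */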
- +#endif if (*vp9 & 0x80) { //if (M==1) vp9++; +#ifdef DEBUG_VP9 pid = (pid << 8) + *vp9; +#endif } vp9++; diff --git a/src/switch_xml.c b/src/switch_xml.c index f794d6f7440..b05358741c9 100644 --- a/src/switch_xml.c +++ b/src/switch_xml.c @@ -2591,7 +2591,6 @@ static char *switch_xml_toxml_r(switch_xml_t xml, char **s, switch_size_t *len, tailrecurse: off = 0; - lcount = 0; txt = ""; if (loops++) { @@ -2985,7 +2984,7 @@ SWITCH_DECLARE(switch_xml_t) switch_xml_set_attr(switch_xml_t xml, const char *n return xml; /* nothing to do */ if (xml->attr == SWITCH_XML_NIL) { /* first attribute */ xml->attr = (char **) switch_must_malloc(4 * sizeof(char *)); - xml->attr[1] = switch_must_strdup(""); /* empty list of malloced names/vals */ + xml->attr[l + 1] = switch_must_strdup(""); /* empty list of malloced names/vals */ } else { xml->attr = (char **) switch_must_realloc(xml->attr, (l + 4) * sizeof(char *)); } diff --git a/tests/unit/Makefile.am b/tests/unit/Makefile.am index 25aa268be0f..c795fd7d9f4 100644 --- a/tests/unit/Makefile.am +++ b/tests/unit/Makefile.am @@ -2,10 +2,14 @@ include $(top_srcdir)/build/modmake.rulesam noinst_PROGRAMS = switch_event switch_hash switch_ivr_originate switch_utils switch_core switch_console switch_vpx switch_core_file \ switch_ivr_play_say switch_core_codec switch_rtp switch_xml -noinst_PROGRAMS+= switch_core_video switch_core_db +noinst_PROGRAMS+= switch_core_video switch_core_db switch_vad AM_LDFLAGS = -avoid-version -no-undefined $(SWITCH_AM_LDFLAGS) $(openssl_LIBS) AM_LDFLAGS += $(FREESWITCH_LIBS) $(switch_builddir)/libfreeswitch.la $(CORE_LIBS) $(APR_LIBS) AM_CFLAGS = $(SWITCH_AM_CPPFLAGS) AM_CPPFLAGS = $(SWITCH_AM_CPPFLAGS) +if HAVE_FVAD +AM_CFLAGS += -DSWITCH_HAVE_FVAD +endif + TESTS = $(noinst_PROGRAMS) diff --git a/tests/unit/collect-test-logs.sh b/tests/unit/collect-test-logs.sh index a46102af72e..fd2a5f9d9ae 100755 --- a/tests/unit/collect-test-logs.sh +++ b/tests/unit/collect-test-logs.sh @@ -3,14 +3,26 @@ echo "Collecting test logs" LOG_DIR=./logs html="

diff --git a/tests/unit/collect-test-logs.sh b/tests/unit/collect-test-logs.sh index a46102af72e..fd2a5f9d9ae 100755 --- a/tests/unit/collect-test-logs.sh +++ b/tests/unit/collect-test-logs.sh @@ -3,14 +3,26 @@ echo "Collecting test logs" LOG_DIR=./logs html="<html><h3>There are failed unit-tests:</h3><table>" -html+="<tr><th>Unit tests</th></tr>" -logs=$(find $LOG_DIR -type f -iname "*.html" -print) +logs=$(find $LOG_DIR -type f -iname "*.html" -print | sort) logs_found=0 +olddirname="" for name in $logs do logname=$(basename $name) testname=$(echo $logname | awk -F 'log_run-tests_' '{print $2}' | awk -F '.html' '{print $1}') - html+="<tr><td><a href=\"$logname\">$testname</a></td></tr>" + testpath="${testname//!/\/}" + dirname=$(dirname $testpath) + test=$(basename $testpath) + if [ "$olddirname" != "$dirname" ]; then + html+="<tr><td><b>$dirname</b></td></tr>" ; + olddirname=$dirname ; + fi + html+="<tr><td><a href=\"$logname\">$test</a>" + backtrace="backtrace_$testname.txt" + if test -f "${LOG_DIR}/$backtrace"; then + html+=". Core dumped, backtrace is available <a href=\"$backtrace\">here</a>" + fi + html+="</td></tr>" logs_found=1 done
diff --git a/tests/unit/conf/freeswitch.xml b/tests/unit/conf/freeswitch.xml index 3e25162a8c1..90600975e45 100644 --- a/tests/unit/conf/freeswitch.xml +++ b/tests/unit/conf/freeswitch.xml @@ -11,6 +11,10 @@ + + + + @@ -36,6 +40,12 @@ + + + + + + diff --git a/tests/unit/run-tests.sh b/tests/unit/run-tests.sh index 55c41e203d6..bda6a325715 100755 --- a/tests/unit/run-tests.sh +++ b/tests/unit/run-tests.sh @@ -1,27 +1,28 @@ #!/bin/bash -TESTS=$(make -f - 2>/dev/null < pids.txt for i in $TESTS do echo "Testing $i" ; - logfilename="log_run-tests_$i.html"; - ./$i | tee >(ansi2html > $logfilename) ; - exitstatus=${PIPESTATUS[0]} ; - if [ "0" -eq $exitstatus ] ; then - rm $logfilename ; - else - echo "*** ./$i exit status is $exitstatus" ; - echo "*** $logfilename was saved" ; - fi ; + ./test.sh "$i" & + pid=($!) + pids+=($pid) + echo "$pid $i" >> pids.txt echo "----------------" ; done + +for pid in "${pids[@]}" +do + echo "$pid waiting" >> pids.txt + wait "$pid" + echo "$pid finished" >> pids.txt +done + +echo "Done running tests!" \ No newline at end of file diff --git a/tests/unit/switch_core_db.c b/tests/unit/switch_core_db.c index 50a004fed54..5ebf41c8ab8 100644 --- a/tests/unit/switch_core_db.c +++ b/tests/unit/switch_core_db.c @@ -34,7 +34,19 @@ #include <test/switch_test.h> -FST_CORE_BEGIN("./conf") +int max_rows = 150; + +int status = 0; + +int table_count_func(void *pArg, int argc, char **argv, char **columnNames){ + if (argc > 0) { + status = atoi(argv[0]); + } + + return -1; +} + +FST_CORE_DB_BEGIN("./conf") { FST_SUITE_BEGIN(switch_core_db) { @@ -69,6 +81,44 @@ FST_CORE_BEGIN("./conf") fst_check_string_equals(res2, ""); } FST_TEST_END() + + FST_TEST_BEGIN(test_switch_cache_db_queue_manager_race) + { + int i; + switch_sql_queue_manager_t *qm = NULL; + + switch_sql_queue_manager_init_name("TEST", + &qm, + 4, + "test_switch_cache_db_queue_manager_race", + SWITCH_MAX_TRANS, + NULL, NULL, NULL, NULL); + + switch_sql_queue_manager_start(qm); + + switch_sql_queue_manager_push_confirm(qm, "DROP TABLE IF EXISTS t;", 0, SWITCH_TRUE); + switch_sql_queue_manager_push_confirm(qm, "CREATE TABLE t (col1 INT);", 0, SWITCH_TRUE); + + for (i = 0; i < max_rows; i++) { + switch_sql_queue_manager_push(qm, "INSERT INTO t (col1) VALUES (1);", 0, SWITCH_TRUE); + } + + switch_sleep(1 * 1000 * 1000); + + switch_sql_queue_manager_execute_sql_callback(qm, "SELECT COUNT(col1) FROM t;", table_count_func, NULL); + + while (switch_sql_queue_manager_size(qm, 0)) { + switch_cond_next(); + } + + switch_sql_queue_manager_stop(qm); + switch_sql_queue_manager_destroy(&qm); + + fst_check_int_equals(status, max_rows); + } + FST_TEST_END() + + } FST_SUITE_END() } diff --git a/tests/unit/switch_core_file.c b/tests/unit/switch_core_file.c index 34edf7d7c0d..54da0f4a1a0 100644 --- a/tests/unit/switch_core_file.c +++ b/tests/unit/switch_core_file.c @@ -127,6 +127,103 @@ } FST_TEST_END() + FST_TEST_BEGIN(test_switch_core_file_write_mono) + { + switch_status_t status = SWITCH_STATUS_FALSE; + switch_file_handle_t fhw = { 0 }; + static char filename[] = "/tmp/fs_write_unit_test.wav"; + int16_t *buf; + int nr_frames = 3, i; + switch_size_t len; + + len = 160; + switch_malloc(buf, len * sizeof(int16_t)); + + status = switch_core_file_open(&fhw, filename, 1, 8000, SWITCH_FILE_FLAG_WRITE | SWITCH_FILE_DATA_SHORT, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + for (i = 0; i < nr_frames; i++) { + memset(buf, i, len * sizeof(int16_t)); + switch_core_file_write(&fhw, buf, &len); + } + + status = switch_core_file_close(&fhw); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(buf); + unlink(filename); + } + FST_TEST_END() + FST_TEST_BEGIN(test_switch_core_file_write_mono_to_stereo) + { + switch_status_t status = SWITCH_STATUS_FALSE; + 
switch_file_handle_t fhw = { 0 }; + static char filename[] = "/tmp/fs_write_unit_test.wav"; + int16_t *buf; + int nr_frames = 3, i, want_channels = 2; + switch_size_t len; + + len = 160; + switch_malloc(buf, len * sizeof(int16_t)); + + status = switch_core_file_open(&fhw, filename, want_channels, 8000, SWITCH_FILE_FLAG_WRITE | SWITCH_FILE_DATA_SHORT, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + fhw.real_channels = 1; + + for (i = 0; i < nr_frames; i++) { + memset(buf, i, len * sizeof(int16_t)); + status = switch_core_file_write(&fhw, buf, &len); + fst_requires(status == SWITCH_STATUS_SUCCESS); + } + + status = switch_core_file_close(&fhw); + fst_check(status == SWITCH_STATUS_SUCCESS); + + switch_safe_free(buf); + unlink(filename); + + } + FST_TEST_END() + FST_TEST_BEGIN(test_switch_core_file_open_max_audio_channels) + { + switch_file_handle_t fh = { 0 }; + switch_status_t status = SWITCH_STATUS_FALSE; + static char filename[] = "/tmp/fs_unit_test.wav"; + static char hdr[] = /*simplest wav file, hardcoded 8 channels*/ + "\x52\x49\x46\x46" + "\x24\x00\x00\x00" + "\x57\x41\x56\x45" + "\x66\x6d\x74\x20" + "\x10\x00\x00\x00" + "\x01\x00\x08\x00" + "\x44\xac\x00\x00" + "\x88\x58\x01\x00" + "\x02\x00\x10\x00" + "\x64\x61\x74\x61" + "\x00\x00"; + FILE *f = NULL; + + f = fopen(filename, "w"); + fst_check(f != NULL); + fwrite(hdr, 1, sizeof(hdr) ,f); + fclose(f); + + switch_core_max_audio_channels(6); + status = switch_core_file_open(&fh, filename, 1, 8000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_check(status == SWITCH_STATUS_FALSE); + + switch_core_max_audio_channels(8); + status = switch_core_file_open(&fh, filename, 1, 8000, SWITCH_FILE_FLAG_READ | SWITCH_FILE_DATA_SHORT, NULL); + fst_check(status == SWITCH_STATUS_SUCCESS); + + status = switch_core_file_close(&fh); + fst_check(status == SWITCH_STATUS_SUCCESS); + + unlink(filename); + } + FST_TEST_END() + } FST_SUITE_END() } diff --git a/tests/unit/switch_ivr_originate.c b/tests/unit/switch_ivr_originate.c index 1354e3b921b..49174f3b336 100644 --- a/tests/unit/switch_ivr_originate.c +++ b/tests/unit/switch_ivr_originate.c @@ -1,6 +1,6 @@ /* * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application - * Copyright (C) 2005-2018, Anthony Minessale II + * Copyright (C) 2005-2020, Anthony Minessale II * * Version: MPL 1.1 * @@ -50,7 +50,7 @@ static switch_status_t my_on_destroy(switch_core_session_t *session) { switch_assert(session); destroy++; - switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_ERROR, "session destroy %d\n", destroy); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_NOTICE, "session destroy %d\n", destroy); return SWITCH_STATUS_SUCCESS; } @@ -71,6 +71,16 @@ static switch_state_handler_table_t state_handlers = { SSH_FLAG_STICKY }; +static int application_hit = 0; + +static void loopback_group_confirm_event_handler(switch_event_t *event) // general event handler +{ + if (event->event_id == SWITCH_EVENT_CHANNEL_APPLICATION) { + application_hit++; + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "application_hit = %d\n", application_hit); + } +} + FST_CORE_BEGIN("./conf") { FST_SUITE_BEGIN(switch_ivr_originate) @@ -78,6 +88,7 @@ FST_CORE_BEGIN("./conf") FST_SETUP_BEGIN() { fst_requires_module("mod_loopback"); + application_hit = 0; } FST_SETUP_END() @@ -220,6 +231,413 @@ FST_CORE_BEGIN("./conf") switch_dial_handle_destroy(&dh); } FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_one_leg) + { + switch_core_session_t *session = 
NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://1000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + switch_core_session_rwunlock(session); + + switch_dial_handle_destroy(&dh); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_no_pre_answer) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://1000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + switch_dial_handle_add_global_var(dh, "null_enable_auto_answer", "1"); + switch_dial_handle_add_global_var(dh, "null_auto_answer_delay", "2000"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + switch_core_session_rwunlock(session); + + switch_dial_handle_destroy(&dh); + + fst_check_duration(3000, 500); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_exec_in_pre_answer) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://1000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + switch_dial_handle_add_global_var(dh, "null_enable_auto_answer", "1"); + switch_dial_handle_add_global_var(dh, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_global_var(dh, "null_pre_answer", "true"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + switch_core_session_rwunlock(session); + + switch_dial_handle_destroy(&dh); + + fst_check_duration(1000, 500); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_exec_after_answer_early_ok) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://1000"); + switch_dial_handle_add_leg_var(leg, 
"group_confirm_key", "exec"); + switch_dial_handle_add_global_var(dh, "null_enable_auto_answer", "1"); + switch_dial_handle_add_global_var(dh, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_global_var(dh, "null_pre_answer", "true"); + switch_dial_handle_add_global_var(dh, "group_confirm_early_ok", "false"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + switch_core_session_rwunlock(session); + + switch_dial_handle_destroy(&dh); + + fst_check_duration(3000, 500); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_exec_after_answer_ignore_early_media) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://1000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + switch_dial_handle_add_global_var(dh, "null_enable_auto_answer", "1"); + switch_dial_handle_add_global_var(dh, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_global_var(dh, "null_pre_answer", "true"); + switch_dial_handle_add_global_var(dh, "ignore_early_media", "true"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + switch_core_session_rwunlock(session); + + switch_dial_handle_destroy(&dh); + + fst_check_duration(3000, 500); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_2_legs) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test1"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://1000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test2"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://500"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + const char *name = switch_core_session_get_name(session); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "channel %s\n", name); + fst_check_string_equals(name, "null/test2"); + switch_core_session_rwunlock(session); + + switch_dial_handle_destroy(&dh); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_global_var) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + const char *dialstring = "{group_confirm_file='playback silence_stream://1000',group_confirm_key=exec}null/test"; + + status = switch_ivr_originate(NULL, &session, &cause, dialstring, 
0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + const char *name = switch_core_session_get_name(session); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "channel %s\n", name); + fst_check_string_equals(name, "null/test"); + switch_core_session_rwunlock(session); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_local_var) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + const char *dialstring = "[group_confirm_file='playback silence_stream://1000',group_confirm_key=exec]null/test1," + "[group_confirm_file='playback silence_stream://500',group_confirm_key=exec]null/test2"; + + status = switch_ivr_originate(NULL, &session, &cause, dialstring, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + const char *name = switch_core_session_get_name(session); + switch_log_printf(SWITCH_CHANNEL_SESSION_LOG(session), SWITCH_LOG_INFO, "channel %s\n", name); + fst_check_string_equals(name, "null/test2"); + switch_core_session_rwunlock(session); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_loopback_endpoint_originate) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + const char *dialstring = "[group_confirm_key=exec,group_confirm_file='event a=1']loopback/loopback"; + + switch_event_bind("test", SWITCH_EVENT_ALL, SWITCH_EVENT_SUBCLASS_ANY, loopback_group_confirm_event_handler, NULL); + status = switch_ivr_originate(NULL, &session, &cause, dialstring, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + switch_yield(2000000); + switch_channel_hangup(switch_core_session_get_channel(session), SWITCH_CAUSE_NORMAL_CLEARING); + switch_core_session_rwunlock(session); + switch_event_unbind_callback(loopback_group_confirm_event_handler); + fst_check(application_hit == 1); + } + FST_TEST_END() + + FST_SESSION_BEGIN(originate_test_group_confirm_loopback_endpoint_bridge) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + const char *dialstring = "[group_confirm_key=exec,group_confirm_file='event a=1']loopback/loopback"; + + switch_event_bind("test", SWITCH_EVENT_ALL, SWITCH_EVENT_SUBCLASS_ANY, loopback_group_confirm_event_handler, NULL); + + switch_core_session_execute_application(fst_session, "bridge", dialstring); + switch_yield(2000000); + switch_channel_hangup(fst_channel, SWITCH_CAUSE_NORMAL_CLEARING); + fst_check(application_hit == 1); + } + FST_SESSION_END() + + FST_TEST_BEGIN(originate_test_group_confirm_leg_timeout_not_finished) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "null_enable_auto_answer", "1"); + switch_dial_handle_add_leg_var(leg, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_leg_var(leg, "leg_timeout", "4"); + switch_dial_handle_add_leg_var(leg, 
"group_confirm_file", "playback silence_stream://6000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_FALSE); + fst_check(session == NULL); + fst_check_duration(4500, 600); // (>= 3.9 sec, <= 5.1 sec) + switch_dial_handle_destroy(&dh); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_leg_timeout_finished) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "null_enable_auto_answer", "1"); + switch_dial_handle_add_leg_var(leg, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_leg_var(leg, "leg_timeout", "6"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://2000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + fst_xcheck(switch_channel_test_flag(switch_core_session_get_channel(session), CF_WINNER), "Expect session is group confirm winner"); + switch_channel_hangup(switch_core_session_get_channel(session), SWITCH_CAUSE_NORMAL_CLEARING); + switch_core_session_rwunlock(session); + switch_dial_handle_destroy(&dh); + fst_check_duration(4000, 500); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_timeout_leg) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "leg_timeout", "15"); + switch_dial_handle_add_leg_var(leg, "null_enable_auto_answer", "1"); + switch_dial_handle_add_leg_var(leg, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://10000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + switch_dial_handle_add_leg_var(leg, "group_confirm_timeout", "3"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_FALSE); + fst_check(session == NULL); + switch_dial_handle_destroy(&dh); + fst_check_duration(5500, 600); // (> 4.9 sec < 6.1 sec) only 1 second resolution with these timeouts + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_timeout_global) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "leg_timeout", "15"); + switch_dial_handle_add_leg_var(leg, 
"null_enable_auto_answer", "1"); + switch_dial_handle_add_leg_var(leg, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://10000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + switch_dial_handle_add_global_var(dh, "group_confirm_timeout", "3"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_FALSE); + fst_check(session == NULL); + fst_check_duration(5500, 600); // (>= 4.9 sec, <= 6.1 sec) only 1 second resolution with these timeouts + switch_dial_handle_destroy(&dh); + } + FST_TEST_END() + + FST_TEST_BEGIN(originate_test_group_confirm_cancel_timeout_global) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + switch_dial_handle_t *dh; + switch_dial_leg_list_t *ll; + switch_dial_leg_t *leg = NULL; + + switch_dial_handle_create(&dh); + switch_dial_handle_add_leg_list(dh, &ll); + + switch_dial_leg_list_add_leg(ll, &leg, "null/test"); + switch_dial_handle_add_leg_var(leg, "leg_timeout", "3"); + switch_dial_handle_add_leg_var(leg, "null_enable_auto_answer", "1"); + switch_dial_handle_add_leg_var(leg, "null_auto_answer_delay", "2000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_file", "playback silence_stream://2000"); + switch_dial_handle_add_leg_var(leg, "group_confirm_key", "exec"); + switch_dial_handle_add_global_var(dh, "group_confirm_cancel_timeout", "true"); + + status = switch_ivr_originate(NULL, &session, &cause, NULL, 0, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, dh); + fst_requires(status == SWITCH_STATUS_SUCCESS); + fst_requires(session); + fst_xcheck(switch_channel_test_flag(switch_core_session_get_channel(session), CF_WINNER), "Expect session is group confirm winner"); + switch_channel_hangup(switch_core_session_get_channel(session), SWITCH_CAUSE_NORMAL_CLEARING); + switch_core_session_rwunlock(session); + switch_dial_handle_destroy(&dh); + fst_check_duration(4500, 600); // (>= 3.9 sec, <= 5.1 sec) + } + FST_TEST_END() } FST_SUITE_END() } diff --git a/tests/unit/switch_rtp.c b/tests/unit/switch_rtp.c index 8c41764eba7..54597753938 100644 --- a/tests/unit/switch_rtp.c +++ b/tests/unit/switch_rtp.c @@ -15,7 +15,6 @@ switch_rtp_packet_t rtp_packet; switch_frame_flag_t *frame_flags; switch_io_flag_t io_flags; switch_payload_t read_pt; -uint datalen; FST_CORE_BEGIN("./conf") { @@ -23,6 +22,7 @@ FST_SUITE_BEGIN(switch_rtp) { FST_SETUP_BEGIN() { + fst_requires_module("mod_loopback"); } FST_SETUP_END() @@ -30,28 +30,72 @@ FST_TEARDOWN_BEGIN() { } FST_TEARDOWN_END() + FST_TEST_BEGIN(test_rtp) + { + switch_core_new_memory_pool(&pool); + + rtp_session = switch_rtp_new(rx_host, rx_port, tx_host, tx_port, TEST_PT, 8000, 20 * 1000, flags, "soft", &err, pool, 0, 0); + fst_xcheck(rtp_session != NULL, "get RTP session"); + fst_requires(rtp_session); + fst_requires(switch_rtp_ready(rtp_session)); + switch_rtp_activate_rtcp(rtp_session, 5, rx_port + 1, 0); + switch_rtp_set_default_payload(rtp_session, TEST_PT); + fst_xcheck(switch_rtp_get_default_payload(rtp_session) == TEST_PT, "get Payload Type") + switch_rtp_set_ssrc(rtp_session, 0xabcd); + switch_rtp_set_remote_ssrc(rtp_session, 0xcdef); + fst_xcheck(switch_rtp_get_ssrc(rtp_session) == 0xabcd, "get SSRC"); + switch_rtp_stats_t *stats = switch_rtp_get_stats(rtp_session, pool); + fst_requires(stats); + switch_rtp_destroy(&rtp_session); 
-FST_TEST_BEGIN(test_rtp) -{ - switch_core_new_memory_pool(&pool); - - rtp_session = switch_rtp_new(rx_host, rx_port, tx_host, tx_port, TEST_PT, 8000, 20 * 1000, flags, "soft", &err, pool, 0, 0); - fst_xcheck(rtp_session != NULL, "get RTP session"); - fst_requires(rtp_session); - fst_requires(switch_rtp_ready(rtp_session)); - switch_rtp_activate_rtcp(rtp_session, 5, rx_port + 1, 0); - switch_rtp_set_default_payload(rtp_session, TEST_PT); - fst_xcheck(switch_rtp_get_default_payload(rtp_session) == TEST_PT, "get Payload Type") - switch_rtp_set_ssrc(rtp_session, 0xabcd); - switch_rtp_set_remote_ssrc(rtp_session, 0xcdef); - fst_xcheck(switch_rtp_get_ssrc(rtp_session) == 0xabcd, "get SSRC"); - switch_rtp_stats_t *stats = switch_rtp_get_stats(rtp_session, pool); - fst_requires(stats); - switch_rtp_destroy(&rtp_session); - - switch_core_destroy_memory_pool(&pool); -} -FST_TEST_END() + switch_core_destroy_memory_pool(&pool); + } + FST_TEST_END() + + FST_TEST_BEGIN(test_session_with_rtp) + { + switch_core_session_t *session = NULL; + switch_channel_t *channel = NULL; + switch_status_t status; + switch_call_cause_t cause; + + switch_core_new_memory_pool(&pool); + + status = switch_ivr_originate(NULL, &session, &cause, "null/+15553334444", 2, NULL, NULL, NULL, NULL, NULL, SOF_NONE, NULL, NULL); + fst_requires(session); + fst_check(status == SWITCH_STATUS_SUCCESS); + + channel = switch_core_session_get_channel(session); + fst_requires(channel); + + switch_core_memory_pool_set_data(pool, "__session", session); + session = switch_core_memory_pool_get_data(pool, "__session"); + fst_requires(session); + rtp_session = switch_rtp_new(rx_host, rx_port, tx_host, tx_port, TEST_PT, 8000, 20 * 1000, flags, "soft", &err, pool, 0, 0); + fst_xcheck(rtp_session != NULL, "switch_rtp_new()"); + fst_requires(switch_rtp_ready(rtp_session)); + switch_rtp_activate_rtcp(rtp_session, 5, rx_port + 1, 0); + switch_rtp_set_default_payload(rtp_session, TEST_PT); + switch_core_media_set_rtp_session(session, SWITCH_MEDIA_TYPE_AUDIO, rtp_session); + channel = switch_core_session_get_channel(session); + fst_requires(channel); + session = switch_rtp_get_core_session(rtp_session); + fst_requires(session); + status = switch_rtp_activate_jitter_buffer(rtp_session, 1, 10, 80, 8000); + fst_xcheck(status == SWITCH_STATUS_SUCCESS, "switch_rtp_activate_jitter_buffer()") + status = switch_rtp_debug_jitter_buffer(rtp_session, "debug"); + fst_xcheck(status == SWITCH_STATUS_SUCCESS, "switch_rtp_debug_jitter_buffer()") + fst_requires(switch_rtp_get_jitter_buffer(rtp_session)); + status = switch_rtp_pause_jitter_buffer(rtp_session, SWITCH_TRUE); + fst_xcheck(status == SWITCH_STATUS_SUCCESS, "switch_rtp_pause_jitter_buffer()"); + status = switch_rtp_deactivate_jitter_buffer(rtp_session); + fst_xcheck(status == SWITCH_STATUS_SUCCESS, "switch_rtp_deactivate_jitter_buffer()"); + + switch_rtp_destroy(&rtp_session); + switch_core_session_rwunlock(session); + switch_core_destroy_memory_pool(&pool); + } + FST_TEST_END() } FST_SUITE_END() } diff --git a/tests/unit/switch_vad.c b/tests/unit/switch_vad.c new file mode 100644 index 00000000000..9b97d8bd182 --- /dev/null +++ b/tests/unit/switch_vad.c @@ -0,0 +1,228 @@ +/* + * FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * Copyright (C) 2020, Anthony Minessale II + * + * Version: MPL 1.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is FreeSWITCH Modular Media Switching Software Library / Soft-Switch Application + * + * The Initial Developer of the Original Code is + * Anthony Minessale II + * Portions created by the Initial Developer are Copyright (C) + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Chris Rienzo + * + * + * switch_vad.c -- VAD tests + * + */ +#include <switch.h> +#include <math.h> + +#include <test/switch_test.h> + + +static float next_tone_frame(int16_t *buf, unsigned int samples, float pos) +{ + // make sine wave of amplitude 7000 for 8000Hz sample rate VAD + float step = 600.0 / 8000.0 * 2.0 * M_PI; + unsigned int i; + for (i = 0; i < samples; i++) { + buf[i] = (int16_t)(7000.0 * sinf(pos)); + pos += step; + } + return pos; +} + +static void next_silence_frame(int16_t *buf, unsigned int samples) +{ + unsigned int i; + for (i = 0; i < samples; i++) { + buf[i] = 0; + } +} + +FST_CORE_BEGIN("./conf") +{ + FST_SUITE_BEGIN(switch_vad) + { + FST_SETUP_BEGIN() + { + } + FST_SETUP_END() + + FST_TEARDOWN_BEGIN() + { + } + FST_TEARDOWN_END() + +#ifdef SWITCH_HAVE_FVAD + FST_TEST_BEGIN(fvad_mode_0) + { + int16_t *buf = malloc(sizeof(int16_t) * 160); + int duration; + float pos = 0.0; + int got_transition = 0; + switch_vad_state_t cur_state = SWITCH_VAD_STATE_NONE; + + switch_vad_t *vad = switch_vad_init(8000, 1); + fst_requires(vad); + int res = switch_vad_set_mode(vad, 0); // tone is detected as speech in mode 0 + fst_requires(res == 0); + switch_vad_set_param(vad, "silence_ms", 400); + switch_vad_set_param(vad, "voice_ms", 80); + switch_vad_set_param(vad, "debug", 10); + + // generate a tone and pump it into the vad + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Start 200 ms tone\n"); + duration = 200 / 20; // 200 ms + while (--duration >= 0) { + switch_vad_state_t new_state; + pos = next_tone_frame(buf, 160, pos); + new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 2); + fst_requires(cur_state == SWITCH_VAD_STATE_TALKING); + + // feed silence frames into the vad + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Start 1000 ms silence\n"); + duration = 1000 / 20; // 1000 ms + got_transition = 0; + next_silence_frame(buf, 160); + while (--duration >= 0) { + switch_vad_state_t new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 2); + fst_requires(cur_state == SWITCH_VAD_STATE_NONE); + + // generate a tone < voice_ms + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Start 40 ms tone\n"); + duration = 40 / 20; // 40 ms + got_transition = 0; + while (--duration >= 0) { + switch_vad_state_t new_state; + pos = next_tone_frame(buf, 160, pos); + new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 0); + fst_requires(cur_state == SWITCH_VAD_STATE_NONE); + + // continue tone > voice_ms + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Continue with 60 ms tone\n"); + duration = 60 / 20; // 60 ms + got_transition = 0; + while (--duration >= 0) { + 
switch_vad_state_t new_state; + pos = next_tone_frame(buf, 160, pos); + new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 1); + fst_requires(cur_state == SWITCH_VAD_STATE_START_TALKING); + + free(buf); + switch_vad_destroy(&vad); + fst_check(vad == NULL); + } + FST_TEST_END() +#endif + + FST_TEST_BEGIN(energy) + { + int16_t *buf = malloc(sizeof(int16_t) * 160); + int duration; + float pos = 0.0; + int got_transition = 0; + switch_vad_state_t cur_state = SWITCH_VAD_STATE_NONE; + + switch_vad_t *vad = switch_vad_init(8000, 1); + fst_requires(vad); + int res = switch_vad_set_mode(vad, -1); + fst_requires(res == 0); + switch_vad_set_param(vad, "silence_ms", 400); + switch_vad_set_param(vad, "voice_ms", 80); + switch_vad_set_param(vad, "debug", 10); + + // generate a tone and pump it into the vad + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Start 200 ms tone\n"); + duration = 200 / 20; // 200 ms + while (--duration >= 0) { + switch_vad_state_t new_state; + pos = next_tone_frame(buf, 160, pos); + new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 2); + fst_requires(cur_state == SWITCH_VAD_STATE_TALKING); + + // feed silence frames into the vad + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Start 1000 ms silence\n"); + duration = 1000 / 20; // 1000 ms + got_transition = 0; + next_silence_frame(buf, 160); + while (--duration >= 0) { + switch_vad_state_t new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 2); + fst_requires(cur_state == SWITCH_VAD_STATE_NONE); + + // generate a tone < voice_ms + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Start 40 ms tone\n"); + duration = 40 / 20; // 40 ms + got_transition = 0; + while (--duration >= 0) { + switch_vad_state_t new_state; + pos = next_tone_frame(buf, 160, pos); + new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 0); + fst_requires(cur_state == SWITCH_VAD_STATE_NONE); + + // continue tone > voice_ms + switch_log_printf(SWITCH_CHANNEL_LOG, SWITCH_LOG_INFO, "Continue with 60 ms tone\n"); + duration = 60 / 20; // 60 ms + got_transition = 0; + while (--duration >= 0) { + switch_vad_state_t new_state; + pos = next_tone_frame(buf, 160, pos); + new_state = switch_vad_process(vad, buf, 160); + if (new_state != cur_state) got_transition++; + cur_state = new_state; + } + fst_requires(got_transition == 1); + fst_requires(cur_state == SWITCH_VAD_STATE_START_TALKING); + + free(buf); + switch_vad_destroy(&vad); + fst_check(vad == NULL); + } + FST_TEST_END() + } + FST_SUITE_END() +} +FST_CORE_END() + diff --git a/tests/unit/test.sh b/tests/unit/test.sh new file mode 100755 index 00000000000..aebddc8cd25 --- /dev/null +++ b/tests/unit/test.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# All output will be collected here +TESTSUNITPATH=$PWD + +# All relative paths are based on the tree's root +FSBASEDIR=$(realpath "$PWD/../../") + +i=$1 + +echo "----------------------------------" ; +echo "Starting test: $i" ; +echo "----------------------------------" ; + +# Change folder to where the test is +currenttestpath="$FSBASEDIR/$i" +cd $(dirname "$currenttestpath") + +# Tests are unique per module, so need to distinguish them by 
their directory +relativedir=$(dirname "$i") +echo "Relative dir is $relativedir" + +file=$(basename -- "$currenttestpath") +log="$TESTSUNITPATH/log_run-tests_${relativedir//\//!}!$file.html"; + +# Execute the test +echo "Start executing $currenttestpath" +$currenttestpath | tee >(ansi2html > $log) ; +exitstatus=${PIPESTATUS[0]} ; +echo "End executing $currenttestpath" +echo "Exit status is $exitstatus" + +if [ "0" -eq $exitstatus ] ; then + rm $log ; +else + echo "*** ./$i exit status is $exitstatus" ; + corefilesearch=/cores/core.*.!drone!src!${relativedir//\//!}!.libs!$file.* ; + echo $corefilesearch ; + if ls $corefilesearch 1> /dev/null 2>&1; then + echo "coredump found"; + coredump=$(ls $corefilesearch) ; + echo $coredump; + echo "set logging file $TESTSUNITPATH/backtrace_${i//\//!}.txt" ; + gdb -ex "set logging file $TESTSUNITPATH/backtrace_${i//\//!}.txt" -ex "set logging on" -ex "set pagination off" -ex "bt full" -ex "bt" -ex "info threads" -ex "thread apply all bt" -ex "thread apply all bt full" -ex "quit" /drone/src/$relativedir/.libs/$file $coredump ; + fi ; + echo "*** $log was saved" ; +fi ; +echo "----------------" ; diff --git a/w32/download_OGG.props b/w32/download_OGG.props index 1761c949a55..cd9c91f62a6 100644 --- a/w32/download_OGG.props +++ b/w32/download_OGG.props @@ -29,7 +29,7 @@
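On the core handling in test.sh above: the corefilesearch glob assumes the kernel's core_pattern drops dumps into /cores with ! standing in for / in the crashing binary's path, and when a dump is found gdb replays a fixed batch of backtrace commands into a backtrace_*.txt file that collect-test-logs.sh then links from the HTML report. A roughly equivalent non-interactive form using gdb -batch (binary and core paths here are hypothetical):

#!/bin/bash
binary="/drone/src/tests/unit/.libs/switch_core_db"            # hypothetical test binary
core=$(ls /cores/core.*!tests!unit!.libs!switch_core_db.* 2>/dev/null | head -n1)
out="backtrace_tests!unit!switch_core_db.txt"

if [ -n "$core" ]; then
    # -batch runs the -ex commands and exits; a redirect replaces "set logging"
    gdb -batch \
        -ex "set pagination off" \
        -ex "bt full" \
        -ex "info threads" \
        -ex "thread apply all bt" \
        "$binary" "$core" > "$out" 2>&1
fi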

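Also worth noting from the switch_rtp.c test earlier: the jitter buffer API is driven as a fixed sequence (activate, debug, get, pause, deactivate), each call returning a switch_status_t. A condensed sketch using the same calls, assuming an rtp_session created with switch_rtp_new() as in the test:

#include <switch.h>

/* Condensed from the switch_rtp unit test; argument values mirror the test. */
static void exercise_jitter_buffer(switch_rtp_t *rtp_session)
{
	switch_status_t status;

	status = switch_rtp_activate_jitter_buffer(rtp_session, 1, 10, 80, 8000);
	if (status != SWITCH_STATUS_SUCCESS) return;

	switch_rtp_debug_jitter_buffer(rtp_session, "debug");
	switch_rtp_pause_jitter_buffer(rtp_session, SWITCH_TRUE);
	switch_rtp_deactivate_jitter_buffer(rtp_session);
}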